author     Linus Torvalds <torvalds@linux-foundation.org>   2023-04-29 10:44:27 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-04-29 10:44:27 -0700
commit     56c455b38dba47ae9cb48d71b2a106d769d1a694 (patch)
tree       a34645bb8a4067855a8affacc7a158fbcb742107 /fs/xfs
parent     bedf1495271bc2ea57903762b722f339ea680d0d (diff)
parent     9419092fb2630c30e4ffeb9ef61007ef0c61827a (diff)
Merge tag 'xfs-6.4-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Dave Chinner:
"This consists mainly of online scrub functionality and the design
documentation for the upcoming online repair functionality built on
top of the scrub code:
- Added detailed design documentation for the upcoming online repair
feature
- major update to online scrub to complete the reverse mapping
cross-referencing infrastructure enabling us to fully validate
allocated metadata against owner records. This is the last piece of
scrub infrastructure needed before we can start merging online
repair functionality.
- Fixes for the ascii-ci hashing issues
- deprecation of the ascii-ci functionality
- on-disk format verification bug fixes
- various random bug fixes for syzbot and other bug reports"
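The cross-referencing work described above hinges on classifying how a range
of btree keyspace is covered by records; the diff below names the outcomes
XBTREE_RECPACKING_EMPTY, _SPARSE, and _FULL. Here is a minimal userspace
model of that classification. This is a sketch, not the kernel code: struct
rec, classify_range(), and the sorted-array representation are invented for
illustration (the kernel walks btree records with masked key comparisons).

#include <stdint.h>
#include <stdio.h>

enum recpacking { RECPACKING_EMPTY, RECPACKING_SPARSE, RECPACKING_FULL };

struct rec { uint64_t start; uint64_t len; };	/* covers [start, start+len) */

/*
 * Classify how records cover the inclusive query range [lo, hi].
 * Assumes recs[] is sorted by start, as btree records are.
 */
static enum recpacking
classify_range(const struct rec *recs, int nr, uint64_t lo, uint64_t hi)
{
	uint64_t next = lo;	/* lowest key not yet proven covered */
	int seen = 0;

	for (int i = 0; i < nr; i++) {
		uint64_t rstart = recs[i].start;
		uint64_t rend = rstart + recs[i].len;	/* exclusive end */

		if (rend <= lo || rstart > hi)
			continue;		/* record outside the range */
		if (rstart > next)
			return RECPACKING_SPARSE;	/* hole before record */
		if (rend > next)
			next = rend;
		seen = 1;
	}
	if (!seen)
		return RECPACKING_EMPTY;
	return next > hi ? RECPACKING_FULL : RECPACKING_SPARSE;
}

int main(void)
{
	struct rec recs[] = { { 10, 5 }, { 15, 5 } };	/* covers [10, 20) */

	printf("%d\n", classify_range(recs, 2, 12, 18));	/* 2 = FULL */
	printf("%d\n", classify_range(recs, 2, 12, 25));	/* 1 = SPARSE */
	printf("%d\n", classify_range(recs, 2, 30, 40));	/* 0 = EMPTY */
	return 0;
}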
* tag 'xfs-6.4-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (107 commits)
xfs: fix livelock in delayed allocation at ENOSPC
xfs: Extend table marker on deprecated mount options table
xfs: fix duplicate includes
xfs: fix BUG_ON in xfs_getbmap()
xfs: verify buffer contents when we skip log replay
xfs: _{attr,data}_map_shared should take ILOCK_EXCL until iread_extents is completely done
xfs: remove WARN when dquot cache insertion fails
xfs: don't consider future format versions valid
xfs: deprecate the ascii-ci feature
xfs: test the ascii case-insensitive hash
xfs: stabilize the dirent name transformation function used for ascii-ci dir hash computation
xfs: cross-reference rmap records with refcount btrees
xfs: cross-reference rmap records with inode btrees
xfs: cross-reference rmap records with free space btrees
xfs: cross-reference rmap records with ag btrees
xfs: introduce bitmap type for AG blocks
xfs: convert xbitmap to interval tree
xfs: drop the _safe behavior from the xbitmap foreach macro
xfs: don't load local xattr values during scrub
xfs: remove the for_each_xbitmap_ helpers
...
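Several commits in the shortlog stabilize the ascii-ci dirent hash. A
userspace sketch of that hash follows, modeled on the xfs_ascii_ci_xfrm()
and xfs_ascii_ci_hashname() hunks in the diff below; rol32() here is a
local stand-in for the kernel helper of the same name, and main() is added
for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static inline uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Mirrors xfs_ascii_ci_need_xfrm(): A-Z plus latin1 accented capitals. */
static int need_xfrm(unsigned char c)
{
	if (c >= 0x41 && c <= 0x5a)	/* A-Z */
		return 1;
	if (c >= 0xc0 && c <= 0xd6)	/* latin A-O with accents */
		return 1;
	if (c >= 0xd8 && c <= 0xde)	/* latin O-Y with accents */
		return 1;
	return 0;
}

/* Mirrors xfs_ascii_ci_xfrm(): shift qualifying bytes to lowercase. */
static unsigned char ci_xfrm(unsigned char c)
{
	if (need_xfrm(c))
		c += 0x20;	/* same as c -= 'A' - 'a' */
	return c;
}

/* Mirrors xfs_ascii_ci_hashname() over an explicit-length name. */
static uint32_t ci_hashname(const char *name, size_t len)
{
	uint32_t hash = 0;

	for (size_t i = 0; i < len; i++)
		hash = ci_xfrm((unsigned char)name[i]) ^ rol32(hash, 7);
	return hash;
}

int main(void)
{
	/* Both spellings must hash identically for CI lookups to work. */
	printf("0x%08x\n", ci_hashname("Makefile", strlen("Makefile")));
	printf("0x%08x\n", ci_hashname("MAKEFILE", strlen("MAKEFILE")));
	return 0;
}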
Diffstat (limited to 'fs/xfs')
81 files changed, 5199 insertions, 1887 deletions
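One helper worth calling out before the diff itself: the btree hunks below
add a ->keys_contiguous() op built on xbtree_key_contig(). The helper here
is transcribed from the fs/xfs/libxfs/xfs_btree.h hunk; only the includes
and the main() driver are added for illustration.

#include <stdint.h>
#include <stdio.h>

enum xbtree_key_contig {
	XBTREE_KEY_GAP = 0,
	XBTREE_KEY_CONTIGUOUS,
	XBTREE_KEY_OVERLAP,
};

/*
 * @x is the field from the high key of the first record, @y the field from
 * the low key of the second; decide if a third key could fit between them.
 */
static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
{
	x++;
	if (x < y)
		return XBTREE_KEY_GAP;
	if (x == y)
		return XBTREE_KEY_CONTIGUOUS;
	return XBTREE_KEY_OVERLAP;
}

int main(void)
{
	/* Record [0..9] followed by [10..19]: contiguous. */
	printf("%d\n", xbtree_key_contig(9, 10));
	/* Record [0..9] followed by [12..19]: gap at 10-11. */
	printf("%d\n", xbtree_key_contig(9, 12));
	/* Record [0..9] followed by [5..19]: overlap. */
	printf("%d\n", xbtree_key_contig(9, 5));
	return 0;
}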
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 9fac5ea8d0e4..52e1823241fb 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -47,6 +47,33 @@ config XFS_SUPPORT_V4 To continue supporting the old V4 format (crc=0), say Y. To close off an attack surface, say N. +config XFS_SUPPORT_ASCII_CI + bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" + depends on XFS_FS + default y + help + The ASCII case insensitivity filesystem feature only works correctly + on systems that have been coerced into using ISO 8859-1, and it does + not work on extended attributes. The kernel has no visibility into + the locale settings in userspace, so it corrupts UTF-8 names. + Enabling this feature makes XFS vulnerable to mixed case sensitivity + attacks. Because of this, the feature is deprecated. All users + should upgrade by backing up their files, reformatting, and restoring + from the backup. + + Administrators and users can detect such a filesystem by running + xfs_info against a filesystem mountpoint and checking for a string + beginning with "ascii-ci=". If the string "ascii-ci=1" is found, the + filesystem is a case-insensitive filesystem. If no such string is + found, please upgrade xfsprogs to the latest version and try again. + + This option will become default N in September 2025. Support for the + feature will be removed entirely in September 2030. Distributors + can say N here to withdraw support earlier. + + To continue supporting case-insensitivity (ascii-ci=1), say Y. + To close off an attack surface, say N. + config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS @@ -93,10 +120,15 @@ config XFS_RT If unsure, say N. +config XFS_DRAIN_INTENTS + bool + select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS + select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a mounted XFS filesystem. This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 92d88dc3c9f7..16e4eb431230 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -136,6 +136,8 @@ ifeq ($(CONFIG_MEMORY_FAILURE),y) xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o endif +xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o + # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) @@ -146,6 +148,7 @@ xfs-y += $(addprefix scrub/, \ agheader.o \ alloc.o \ attr.o \ + bitmap.o \ bmap.o \ btree.o \ common.o \ @@ -156,6 +159,7 @@ xfs-y += $(addprefix scrub/, \ ialloc.o \ inode.o \ parent.o \ + readdir.o \ refcount.o \ rmap.o \ scrub.o \ @@ -169,7 +173,6 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ - bitmap.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 86696a1c6891..1b078bbbf225 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -81,6 +81,19 @@ xfs_perag_get_tag( return pag; } +/* Get a passive reference to the given perag. 
*/ +struct xfs_perag * +xfs_perag_hold( + struct xfs_perag *pag) +{ + ASSERT(atomic_read(&pag->pag_ref) > 0 || + atomic_read(&pag->pag_active_ref) > 0); + + trace_xfs_perag_hold(pag, _RET_IP_); + atomic_inc(&pag->pag_ref); + return pag; +} + void xfs_perag_put( struct xfs_perag *pag) @@ -247,6 +260,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); + xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); @@ -372,6 +386,7 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_state_lock); INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + xfs_defer_drain_init(&pag->pag_intents_drain); init_waitqueue_head(&pag->pagb_wait); init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; @@ -408,6 +423,7 @@ xfs_initialize_perag( return 0; out_remove_pag: + xfs_defer_drain_free(&pag->pag_intents_drain); radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: kmem_free(pag); @@ -418,6 +434,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); kmem_free(pag); } return error; @@ -1043,10 +1060,8 @@ xfs_ag_extend_space( if (error) return error; - error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, - be32_to_cpu(agf->agf_length) - len), - len, &XFS_RMAP_OINFO_SKIP_UPDATE, - XFS_AG_RESV_NONE); + error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len, + len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 5e18536dfdce..2e0aef87d633 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -101,6 +101,14 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. + * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain pag_intents_drain; #endif /* __KERNEL__ */ }; @@ -134,6 +142,7 @@ void xfs_free_perag(struct xfs_mount *mp); struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int tag); +struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); void xfs_perag_put(struct xfs_perag *pag); /* Active AG references */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 203f16c48c19..fdfa08cbf4db 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -233,6 +233,52 @@ xfs_alloc_update( return xfs_btree_update(cur, &rec); } +/* Convert the ondisk btree record to its incore representation. */ +void +xfs_alloc_btrec_to_irec( + const union xfs_btree_rec *rec, + struct xfs_alloc_rec_incore *irec) +{ + irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); + irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); +} + +/* Simple checks for free space records. 
*/ +xfs_failaddr_t +xfs_alloc_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *irec) +{ + struct xfs_perag *pag = cur->bc_ag.pag; + + if (irec->ar_blockcount == 0) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount)) + return __this_address; + + return NULL; +} + +static inline int +xfs_alloc_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_alloc_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; + + xfs_warn(mp, + "%s Freespace BTree record corruption in AG %d detected at %pS!", + cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", + cur->bc_ag.pag->pag_agno, fa); + xfs_warn(mp, + "start block 0x%x block count 0x%x", irec->ar_startblock, + irec->ar_blockcount); + return -EFSCORRUPTED; +} + /* * Get the data from the pointed-to record. */ @@ -243,35 +289,23 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat) /* output: success/failure */ { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_alloc_rec_incore irec; union xfs_btree_rec *rec; + xfs_failaddr_t fa; int error; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !(*stat)) return error; - *bno = be32_to_cpu(rec->alloc.ar_startblock); - *len = be32_to_cpu(rec->alloc.ar_blockcount); - - if (*len == 0) - goto out_bad_rec; - - /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(pag, *bno, *len)) - goto out_bad_rec; + xfs_alloc_btrec_to_irec(rec, &irec); + fa = xfs_alloc_check_irec(cur, &irec); + if (fa) + return xfs_alloc_complain_bad_rec(cur, fa, &irec); + *bno = irec.ar_startblock; + *len = irec.ar_blockcount; return 0; - -out_bad_rec: - xfs_warn(mp, - "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? 
"Block" : "Size", - pag->pag_agno); - xfs_warn(mp, - "start block 0x%x block count 0x%x", *bno, *len); - return -EFSCORRUPTED; } /* @@ -2405,6 +2439,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); } @@ -2421,8 +2456,8 @@ __xfs_free_extent_later( bool skip_discard) { struct xfs_extent_free_item *xefi; -#ifdef DEBUG struct xfs_mount *mp = tp->t_mountp; +#ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -2456,9 +2491,11 @@ __xfs_free_extent_later( } else { xefi->xefi_owner = XFS_RMAP_OWN_NULL; } - trace_xfs_bmap_free_defer(tp->t_mountp, + trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); + + xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); } @@ -3596,7 +3633,8 @@ xfs_free_extent_fix_freelist( int __xfs_free_extent( struct xfs_trans *tp, - xfs_fsblock_t bno, + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, @@ -3604,12 +3642,9 @@ __xfs_free_extent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); struct xfs_agf *agf; int error; unsigned int busy_flags = 0; - struct xfs_perag *pag; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -3618,10 +3653,9 @@ __xfs_free_extent( XFS_ERRTAG_FREE_EXTENT)) return -EIO; - pag = xfs_perag_get(mp, agno); error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - goto err; + return error; agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { @@ -3635,20 +3669,18 @@ __xfs_free_extent( goto err_release; } - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); + error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, + type); if (error) goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); - xfs_perag_put(pag); return 0; err_release: xfs_trans_brelse(tp, agbp); -err: - xfs_perag_put(pag); return error; } @@ -3666,9 +3698,13 @@ xfs_alloc_query_range_helper( { struct xfs_alloc_query_range_info *query = priv; struct xfs_alloc_rec_incore irec; + xfs_failaddr_t fa; + + xfs_alloc_btrec_to_irec(rec, &irec); + fa = xfs_alloc_check_irec(cur, &irec); + if (fa) + return xfs_alloc_complain_bad_rec(cur, fa, &irec); - irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); - irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); return query->fn(cur, &irec, query->priv); } @@ -3709,13 +3745,16 @@ xfs_alloc_query_all( return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); } -/* Is there a record covering a given extent? */ +/* + * Scan part of the keyspace of the free space and tell us if the area has no + * records, is fully mapped by records, or is partially filled. 
+ */ int -xfs_alloc_has_record( +xfs_alloc_has_records( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { union xfs_btree_irec low; union xfs_btree_irec high; @@ -3725,7 +3764,7 @@ xfs_alloc_has_record( memset(&high, 0xFF, sizeof(high)); high.a.ar_startblock = bno + len - 1; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2b246d74c189..5dbb25546d0b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -141,7 +141,8 @@ int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args, int /* error */ __xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, /* length of extent */ const struct xfs_owner_info *oinfo, /* extent owner */ enum xfs_ag_resv_type type, /* block reservation type */ @@ -150,12 +151,13 @@ __xfs_free_extent( static inline int xfs_free_extent( struct xfs_trans *tp, - xfs_fsblock_t bno, + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - return __xfs_free_extent(tp, bno, len, oinfo, type, false); + return __xfs_free_extent(tp, pag, agbno, len, oinfo, type, false); } int /* error */ @@ -179,6 +181,12 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +union xfs_btree_rec; +void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, + struct xfs_alloc_rec_incore *irec); +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *irec); + int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, @@ -205,8 +213,8 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur, int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); -int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, bool *exist); +int xfs_alloc_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, enum xbtree_recpacking *outcome); typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, void *priv); @@ -235,9 +243,13 @@ struct xfs_extent_free_item { uint64_t xefi_owner; xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + struct xfs_perag *xefi_pag; unsigned int xefi_flags; }; +void xfs_extent_free_get_group(struct xfs_mount *mp, + struct xfs_extent_free_item *xefi); + #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 0f29c7b1b39f..c65228efed4a 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -260,20 +260,27 @@ STATIC int64_t xfs_bnobt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->alloc.ar_startblock); + return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) 
- - be32_to_cpu(k2->alloc.ar_startblock); + be32_to_cpu(k2->alloc.ar_startblock); } STATIC int64_t xfs_cntbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { int64_t diff; + ASSERT(!mask || (mask->alloc.ar_blockcount && + mask->alloc.ar_startblock)); + diff = be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); if (diff) @@ -423,6 +430,19 @@ xfs_cntbt_recs_inorder( be32_to_cpu(r2->alloc.ar_startblock)); } +STATIC enum xbtree_key_contig +xfs_allocbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->alloc.ar_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->alloc.ar_startblock), + be32_to_cpu(key2->alloc.ar_startblock)); +} + static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -443,6 +463,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, + .keys_contiguous = xfs_allocbt_keys_contiguous, }; static const struct xfs_btree_ops xfs_cntbt_ops = { @@ -465,6 +486,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, + .keys_contiguous = NULL, /* not needed right now */ }; /* Allocate most of a new allocation btree cursor. */ @@ -492,9 +514,7 @@ xfs_allocbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; + cur->bc_ag.pag = xfs_perag_hold(pag); if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 34de6e6898c4..b512de0540d5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1083,6 +1083,34 @@ struct xfs_iread_state { xfs_extnum_t loaded; }; +int +xfs_bmap_complain_bad_rec( + struct xfs_inode *ip, + int whichfork, + xfs_failaddr_t fa, + const struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + const char *forkname; + + switch (whichfork) { + case XFS_DATA_FORK: forkname = "data"; break; + case XFS_ATTR_FORK: forkname = "attr"; break; + case XFS_COW_FORK: forkname = "CoW"; break; + default: forkname = "???"; break; + } + + xfs_warn(mp, + "Bmap BTree record corruption in inode 0x%llx %s fork detected at %pS!", + ip->i_ino, forkname, fa); + xfs_warn(mp, + "Offset 0x%llx, start block 0x%llx, block count 0x%llx state 0x%x", + irec->br_startoff, irec->br_startblock, irec->br_blockcount, + irec->br_state); + + return -EFSCORRUPTED; +} + /* Stuff every bmbt record from this block into the incore extent map. 
*/ static int xfs_iread_bmbt_block( @@ -1125,7 +1153,8 @@ xfs_iread_bmbt_block( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iread_extents(2)", frp, sizeof(*frp), fa); - return -EFSCORRUPTED; + return xfs_bmap_complain_bad_rec(ip, whichfork, fa, + &new); } xfs_iext_insert(ip, &ir->icur, &new, xfs_bmap_fork_to_state(whichfork)); @@ -1171,6 +1200,12 @@ xfs_iread_extents( goto out; } ASSERT(ir.loaded == xfs_iext_count(ifp)); + /* + * Use release semantics so that we can use acquire semantics in + * xfs_need_iread_extents and be guaranteed to see a valid mapping tree + * after that load. + */ + smp_store_release(&ifp->if_needextents, 0); return 0; out: xfs_iext_destroy(ifp); @@ -3505,7 +3540,6 @@ xfs_bmap_btalloc_at_eof( * original non-aligned state so the caller can proceed on allocation * failure as if this function was never called. */ - args->fsbno = ap->blkno; args->alignment = 1; return 0; } @@ -6075,6 +6109,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; + xfs_bmap_update_get_group(tp->t_mountp, bi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index dd08361ca5a6..e33470e39728 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -145,7 +145,7 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { BMAP_COWFORK, "COW" } /* Return true if the extent is an allocated extent, written or not. */ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) { return irec->br_startblock != HOLESTARTBLOCK && irec->br_startblock != DELAYSTARTBLOCK && @@ -238,9 +238,13 @@ struct xfs_bmap_intent { enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; + struct xfs_perag *bi_pag; struct xfs_bmbt_irec bi_bmap; }; +void xfs_bmap_update_get_group(struct xfs_mount *mp, + struct xfs_bmap_intent *bi); + int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); @@ -261,6 +265,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); +int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, + xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec); int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index b8ad95050c9b..1b40e5f8b1ec 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -382,11 +382,14 @@ STATIC int64_t xfs_bmbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + ASSERT(!mask || mask->bmbt.br_startoff); + /* * Note: This routine previously casted a and b to int64 and subtracted * them to generate a result. 
This lead to problems if b was the @@ -500,6 +503,19 @@ xfs_bmbt_recs_inorder( xfs_bmbt_disk_get_startoff(&r2->bmbt); } +STATIC enum xbtree_key_contig +xfs_bmbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->bmbt.br_startoff); + + return xbtree_key_contig(be64_to_cpu(key1->bmbt.br_startoff), + be64_to_cpu(key2->bmbt.br_startoff)); +} + static const struct xfs_btree_ops xfs_bmbt_ops = { .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), @@ -520,6 +536,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .buf_ops = &xfs_bmbt_buf_ops, .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, + .keys_contiguous = xfs_bmbt_keys_contiguous, }; /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c4649cc624e1..6a6503ab0cd7 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2067,8 +2067,7 @@ xfs_btree_get_leaf_keys( for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { rec = xfs_btree_rec_addr(cur, n, block); cur->bc_ops->init_high_key_from_rec(&hkey, rec); - if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) - > 0) + if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) max_hkey = hkey; } @@ -2096,7 +2095,7 @@ xfs_btree_get_node_keys( max_hkey = xfs_btree_high_key_addr(cur, 1, block); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { hkey = xfs_btree_high_key_addr(cur, n, block); - if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) + if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) max_hkey = hkey; } @@ -2183,8 +2182,8 @@ __xfs_btree_updkeys( nlkey = xfs_btree_key_addr(cur, ptr, block); nhkey = xfs_btree_high_key_addr(cur, ptr, block); if (!force_all && - !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || - cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) + xfs_btree_keycmp_eq(cur, nlkey, lkey) && + xfs_btree_keycmp_eq(cur, nhkey, hkey)) break; xfs_btree_copy_keys(cur, nlkey, lkey, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); @@ -4716,7 +4715,6 @@ xfs_btree_simple_query_range( { union xfs_btree_rec *recp; union xfs_btree_key rec_key; - int64_t diff; int stat; bool firstrec = true; int error; @@ -4746,20 +4744,17 @@ xfs_btree_simple_query_range( if (error || !stat) break; - /* Skip if high_key(rec) < low_key. */ + /* Skip if low_key > high_key(rec). */ if (firstrec) { cur->bc_ops->init_high_key_from_rec(&rec_key, recp); firstrec = false; - diff = cur->bc_ops->diff_two_keys(cur, low_key, - &rec_key); - if (diff > 0) + if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) goto advloop; } - /* Stop if high_key < low_key(rec). */ + /* Stop if low_key(rec) > high_key. */ cur->bc_ops->init_key_from_rec(&rec_key, recp); - diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); - if (diff > 0) + if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) break; /* Callback */ @@ -4813,8 +4808,6 @@ xfs_btree_overlapped_query_range( union xfs_btree_key *hkp; union xfs_btree_rec *recp; struct xfs_btree_block *block; - int64_t ldiff; - int64_t hdiff; int level; struct xfs_buf *bp; int i; @@ -4854,25 +4847,23 @@ pop_up: block); cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); - ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, - low_key); - cur->bc_ops->init_key_from_rec(&rec_key, recp); - hdiff = cur->bc_ops->diff_two_keys(cur, high_key, - &rec_key); /* + * If (query's high key < record's low key), then there + * are no more interesting records in this block. 
Pop + * up to the leaf level to find more record blocks. + * * If (record's high key >= query's low key) and * (query's high key >= record's low key), then * this record overlaps the query range; callback. */ - if (ldiff >= 0 && hdiff >= 0) { + if (xfs_btree_keycmp_lt(cur, high_key, &rec_key)) + goto pop_up; + if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) { error = fn(cur, recp, priv); if (error) break; - } else if (hdiff < 0) { - /* Record is larger than high key; pop. */ - goto pop_up; } cur->bc_levels[level].ptr++; continue; @@ -4884,15 +4875,18 @@ pop_up: block); pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); - ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); - hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); - /* + * If (query's high key < pointer's low key), then there are no + * more interesting keys in this block. Pop up one leaf level + * to continue looking for records. + * * If (pointer's high key >= query's low key) and * (query's high key >= pointer's low key), then * this record overlaps the query range; follow pointer. */ - if (ldiff >= 0 && hdiff >= 0) { + if (xfs_btree_keycmp_lt(cur, high_key, lkp)) + goto pop_up; + if (xfs_btree_keycmp_ge(cur, hkp, low_key)) { level--; error = xfs_btree_lookup_get_block(cur, level, pp, &block); @@ -4907,9 +4901,6 @@ pop_up: #endif cur->bc_levels[level].ptr = 1; continue; - } else if (hdiff < 0) { - /* The low key is larger than the upper range; pop. */ - goto pop_up; } cur->bc_levels[level].ptr++; } @@ -4937,6 +4928,19 @@ out: return error; } +static inline void +xfs_btree_key_from_irec( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + const union xfs_btree_irec *irec) +{ + union xfs_btree_rec rec; + + cur->bc_rec = *irec; + cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(key, &rec); +} + /* * Query a btree for all records overlapping a given interval of keys. The * supplied function will be called with each record found; return one of the @@ -4951,21 +4955,15 @@ xfs_btree_query_range( xfs_btree_query_range_fn fn, void *priv) { - union xfs_btree_rec rec; union xfs_btree_key low_key; union xfs_btree_key high_key; /* Find the keys of both ends of the interval. */ - cur->bc_rec = *high_rec; - cur->bc_ops->init_rec_from_cur(cur, &rec); - cur->bc_ops->init_key_from_rec(&high_key, &rec); + xfs_btree_key_from_irec(cur, &high_key, high_rec); + xfs_btree_key_from_irec(cur, &low_key, low_rec); - cur->bc_rec = *low_rec; - cur->bc_ops->init_rec_from_cur(cur, &rec); - cur->bc_ops->init_key_from_rec(&low_key, &rec); - - /* Enforce low key < high key. */ - if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) + /* Enforce low key <= high key. */ + if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) return -EINVAL; if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) @@ -5027,34 +5025,132 @@ xfs_btree_diff_two_ptrs( return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } -/* If there's an extent, we're done. */ +struct xfs_btree_has_records { + /* Keys for the start and end of the range we want to know about. */ + union xfs_btree_key start_key; + union xfs_btree_key end_key; + + /* Mask for key comparisons, if desired. */ + const union xfs_btree_key *key_mask; + + /* Highest record key we've seen so far. 
*/ + union xfs_btree_key high_key; + + enum xbtree_recpacking outcome; +}; + STATIC int -xfs_btree_has_record_helper( +xfs_btree_has_records_helper( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { - return -ECANCELED; + union xfs_btree_key rec_key; + union xfs_btree_key rec_high_key; + struct xfs_btree_has_records *info = priv; + enum xbtree_key_contig key_contig; + + cur->bc_ops->init_key_from_rec(&rec_key, rec); + + if (info->outcome == XBTREE_RECPACKING_EMPTY) { + info->outcome = XBTREE_RECPACKING_SPARSE; + + /* + * If the first record we find does not overlap the start key, + * then there is a hole at the start of the search range. + * Classify this as sparse and stop immediately. + */ + if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key, + info->key_mask)) + return -ECANCELED; + } else { + /* + * If a subsequent record does not overlap with the any record + * we've seen so far, there is a hole in the middle of the + * search range. Classify this as sparse and stop. + * If the keys overlap and this btree does not allow overlap, + * signal corruption. + */ + key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, + &rec_key, info->key_mask); + if (key_contig == XBTREE_KEY_OVERLAP && + !(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return -EFSCORRUPTED; + if (key_contig == XBTREE_KEY_GAP) + return -ECANCELED; + } + + /* + * If high_key(rec) is larger than any other high key we've seen, + * remember it for later. + */ + cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); + if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, + info->key_mask)) + info->high_key = rec_high_key; /* struct copy */ + + return 0; } -/* Is there a record covering a given range of keys? */ +/* + * Scan part of the keyspace of a btree and tell us if that keyspace does not + * map to any records; is fully mapped to records; or is partially mapped to + * records. This is the btree record equivalent to determining if a file is + * sparse. + * + * For most btree types, the record scan should use all available btree key + * fields to compare the keys encountered. These callers should pass NULL for + * @mask. However, some callers (e.g. scanning physical space in the rmapbt) + * want to ignore some part of the btree record keyspace when performing the + * comparison. These callers should pass in a union xfs_btree_key object with + * the fields that *should* be a part of the comparison set to any nonzero + * value, and the rest zeroed. + */ int -xfs_btree_has_record( +xfs_btree_has_records( struct xfs_btree_cur *cur, const union xfs_btree_irec *low, const union xfs_btree_irec *high, - bool *exists) + const union xfs_btree_key *mask, + enum xbtree_recpacking *outcome) { + struct xfs_btree_has_records info = { + .outcome = XBTREE_RECPACKING_EMPTY, + .key_mask = mask, + }; int error; - error = xfs_btree_query_range(cur, low, high, - &xfs_btree_has_record_helper, NULL); - if (error == -ECANCELED) { - *exists = true; - return 0; + /* Not all btrees support this operation. 
*/ + if (!cur->bc_ops->keys_contiguous) { + ASSERT(0); + return -EOPNOTSUPP; } - *exists = false; - return error; + + xfs_btree_key_from_irec(cur, &info.start_key, low); + xfs_btree_key_from_irec(cur, &info.end_key, high); + + error = xfs_btree_query_range(cur, low, high, + xfs_btree_has_records_helper, &info); + if (error == -ECANCELED) + goto out; + if (error) + return error; + + if (info.outcome == XBTREE_RECPACKING_EMPTY) + goto out; + + /* + * If the largest high_key(rec) we saw during the walk is greater than + * the end of the search range, classify this as full. Otherwise, + * there is a hole at the end of the search range. + */ + if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, + mask)) + info.outcome = XBTREE_RECPACKING_FULL; + +out: + *outcome = info.outcome; + return 0; } /* Are there more records in this btree? */ diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 29c4b4ccb909..a2aa36b23e25 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -90,6 +90,27 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) +enum xbtree_key_contig { + XBTREE_KEY_GAP = 0, + XBTREE_KEY_CONTIGUOUS, + XBTREE_KEY_OVERLAP, +}; + +/* + * Decide if these two numeric btree key fields are contiguous, overlapping, + * or if there's a gap between them. @x should be the field from the high + * key and @y should be the field from the low key. + */ +static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) +{ + x++; + if (x < y) + return XBTREE_KEY_GAP; + if (x == y) + return XBTREE_KEY_CONTIGUOUS; + return XBTREE_KEY_OVERLAP; +} + struct xfs_btree_ops { /* size of the key and record structures */ size_t key_len; @@ -140,11 +161,14 @@ struct xfs_btree_ops { /* * Difference between key2 and key1 -- positive if key1 > key2, - * negative if key1 < key2, and zero if equal. + * negative if key1 < key2, and zero if equal. If the @mask parameter + * is non NULL, each key field to be used in the comparison must + * contain a nonzero value. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, - const union xfs_btree_key *key2); + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; @@ -157,6 +181,22 @@ struct xfs_btree_ops { int (*recs_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2); + + /* + * Are these two btree keys immediately adjacent? + * + * Given two btree keys @key1 and @key2, decide if it is impossible for + * there to be a third btree key K satisfying the relationship + * @key1 < K < @key2. To determine if two btree records are + * immediately adjacent, @key1 should be the high key of the first + * record and @key2 should be the low key of the second record. + * If the @mask parameter is non NULL, each key field to be used in the + * comparison must contain a nonzero value. 
+ */ + enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); }; /* @@ -540,12 +580,105 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); -int xfs_btree_has_record(struct xfs_btree_cur *cur, +typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2); + +int xfs_btree_has_records(struct xfs_btree_cur *cur, const union xfs_btree_irec *low, - const union xfs_btree_irec *high, bool *exists); + const union xfs_btree_irec *high, + const union xfs_btree_key *mask, + enum xbtree_recpacking *outcome); + bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); +/* Key comparison helpers */ +static inline bool +xfs_btree_keycmp_lt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0; +} + +static inline bool +xfs_btree_keycmp_gt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0; +} + +static inline bool +xfs_btree_keycmp_eq( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0; +} + +static inline bool +xfs_btree_keycmp_le( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_gt(cur, key1, key2); +} + +static inline bool +xfs_btree_keycmp_ge( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_lt(cur, key1, key2); +} + +static inline bool +xfs_btree_keycmp_ne( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_eq(cur, key1, key2); +} + +/* Masked key comparison helpers */ +static inline bool +xfs_btree_masked_keycmp_lt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0; +} + +static inline bool +xfs_btree_masked_keycmp_gt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0; +} + +static inline bool +xfs_btree_masked_keycmp_ge( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask); +} + /* Does this cursor point to the last block in the given level? 
*/ static inline bool xfs_btree_islastblock( diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5a321b783398..bcfb6a4203cd 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -397,6 +397,7 @@ xfs_defer_cancel_list( list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); @@ -476,6 +477,7 @@ xfs_defer_finish_one( list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; + trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { int ret; @@ -623,7 +625,7 @@ xfs_defer_add( struct list_head *li) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops; + const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); @@ -636,7 +638,6 @@ xfs_defer_add( if (!list_empty(&tp->t_dfops)) { dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; if (dfp->dfp_type != type || (ops->max_items && dfp->dfp_count >= ops->max_items)) dfp = NULL; @@ -653,6 +654,7 @@ xfs_defer_add( } list_add_tail(li, &dfp->dfp_work); + trace_xfs_defer_add_item(tp->t_mountp, dfp, li); dfp->dfp_count++; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 92bac3373f1f..f5462fd582d5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -64,7 +64,7 @@ xfs_ascii_ci_hashname( int i; for (i = 0, hash = 0; i < name->len; i++) - hash = tolower(name->name[i]) ^ rol32(hash, 7); + hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7); return hash; } @@ -85,7 +85,8 @@ xfs_ascii_ci_compname( for (i = 0; i < len; i++) { if (args->name[i] == name[i]) continue; - if (tolower(args->name[i]) != tolower(name[i])) + if (xfs_ascii_ci_xfrm(args->name[i]) != + xfs_ascii_ci_xfrm(name[i])) return XFS_CMP_DIFFERENT; result = XFS_CMP_CASE; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index dd39f17dd9a9..19af22a16c41 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -248,4 +248,35 @@ unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); bool xfs_dir2_namecheck(const void *name, size_t length); +/* + * The "ascii-ci" feature was created to speed up case-insensitive lookups for + * a Samba product. Because of the inherent problems with CI and UTF-8 + * encoding, etc, it was decided that Samba would be configured to export + * latin1/iso 8859-1 encodings as that covered >90% of the target markets for + * the product. Hence the "ascii-ci" casefolding code could be encoded into + * the XFS directory operations and remove all the overhead of casefolding from + * Samba. + * + * To provide consistent hashing behavior between the userspace and kernel, + * these functions prepare names for hashing by transforming specific bytes + * to other bytes. Robustness with other encodings is not guaranteed. 
+ */ +static inline bool xfs_ascii_ci_need_xfrm(unsigned char c) +{ + if (c >= 0x41 && c <= 0x5a) /* A-Z */ + return true; + if (c >= 0xc0 && c <= 0xd6) /* latin A-O with accents */ + return true; + if (c >= 0xd8 && c <= 0xde) /* latin O-Y with accents */ + return true; + return false; +} + +static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c) +{ + if (xfs_ascii_ci_need_xfrm(c)) + c -= 'A' - 'a'; + return c; +} + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 7ee292aecbeb..a16d5de16933 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,33 +95,25 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_inobt_get_rec( - struct xfs_btree_cur *cur, - struct xfs_inobt_rec_incore *irec, - int *stat) +/* Simple checks for inode records. */ +xfs_failaddr_t +xfs_inobt_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_inobt_rec_incore *irec) { - struct xfs_mount *mp = cur->bc_mp; - union xfs_btree_rec *rec; - int error; uint64_t realfree; - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || *stat == 0) - return error; - - xfs_inobt_btrec_to_irec(mp, rec, irec); - + /* Record has to be properly aligned within the AG. */ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) - goto out_bad_rec; + return __this_address; + if (!xfs_verify_agino(cur->bc_ag.pag, + irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) + return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) - goto out_bad_rec; + return __this_address; if (irec->ir_freecount > XFS_INODES_PER_CHUNK) - goto out_bad_rec; + return __this_address; /* if there are no holes, return the first available offset */ if (!xfs_inobt_issparse(irec->ir_holemask)) @@ -129,15 +121,23 @@ xfs_inobt_get_rec( else realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); if (hweight64(realfree) != irec->ir_freecount) - goto out_bad_rec; + return __this_address; - return 0; + return NULL; +} + +static inline int +xfs_inobt_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; -out_bad_rec: xfs_warn(mp, - "%s Inode BTree record corruption in AG %d detected!", + "%s Inode BTree record corruption in AG %d detected at %pS!", cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", - cur->bc_ag.pag->pag_agno); + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -146,6 +146,32 @@ out_bad_rec: } /* + * Get the data from the pointed-to record. + */ +int +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *irec, + int *stat) +{ + struct xfs_mount *mp = cur->bc_mp; + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || *stat == 0) + return error; + + xfs_inobt_btrec_to_irec(mp, rec, irec); + fa = xfs_inobt_check_irec(cur, irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, irec); + + return 0; +} + +/* * Insert a single inobt record. Cursor must already point to desired location. 
*/ int @@ -1952,8 +1978,6 @@ xfs_difree_inobt( */ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { - struct xfs_perag *pag = agbp->b_pag; - xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino); @@ -2617,44 +2641,50 @@ xfs_ialloc_read_agi( return 0; } -/* Is there an inode record covering a given range of inode numbers? */ -int -xfs_ialloc_has_inode_record( - struct xfs_btree_cur *cur, - xfs_agino_t low, - xfs_agino_t high, - bool *exists) +/* How many inodes are backed by inode clusters ondisk? */ +STATIC int +xfs_ialloc_count_ondisk( + struct xfs_btree_cur *cur, + xfs_agino_t low, + xfs_agino_t high, + unsigned int *allocated) { struct xfs_inobt_rec_incore irec; - xfs_agino_t agino; - uint16_t holemask; - int has_record; - int i; - int error; + unsigned int ret = 0; + int has_record; + int error; - *exists = false; error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); - while (error == 0 && has_record) { + if (error) + return error; + + while (has_record) { + unsigned int i, hole_idx; + error = xfs_inobt_get_rec(cur, &irec, &has_record); - if (error || irec.ir_startino > high) + if (error) + return error; + if (irec.ir_startino > high) break; - agino = irec.ir_startino; - holemask = irec.ir_holemask; - for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, - i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { - if (holemask & 1) + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (irec.ir_startino + i < low) continue; - if (agino + XFS_INODES_PER_HOLEMASK_BIT > low && - agino <= high) { - *exists = true; - return 0; - } + if (irec.ir_startino + i > high) + break; + + hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT; + if (!(irec.ir_holemask & (1U << hole_idx))) + ret++; } error = xfs_btree_increment(cur, 0, &has_record); + if (error) + return error; } - return error; + + *allocated = ret; + return 0; } /* Is there an inode record covering a given extent? 
*/ @@ -2663,15 +2693,27 @@ xfs_ialloc_has_inodes_at_extent( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { - xfs_agino_t low; - xfs_agino_t high; + xfs_agino_t agino; + xfs_agino_t last_agino; + unsigned int allocated; + int error; + + agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno); + last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; - low = XFS_AGB_TO_AGINO(cur->bc_mp, bno); - high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; + error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated); + if (error) + return error; - return xfs_ialloc_has_inode_record(cur, low, high, exists); + if (allocated == 0) + *outcome = XBTREE_RECPACKING_EMPTY; + else if (allocated == last_agino - agino + 1) + *outcome = XBTREE_RECPACKING_FULL; + else + *outcome = XBTREE_RECPACKING_SPARSE; + return 0; } struct xfs_ialloc_count_inodes { @@ -2688,8 +2730,13 @@ xfs_ialloc_count_inodes_rec( { struct xfs_inobt_rec_incore irec; struct xfs_ialloc_count_inodes *ci = priv; + xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); + fa = xfs_inobt_check_irec(cur, &irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, &irec); + ci->count += irec.ir_count; ci->freecount += irec.ir_freecount; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index ab8c30b4ec22..fe824bb04a09 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -93,10 +93,11 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, + const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); -int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, - xfs_agino_t high, bool *exists); + xfs_agblock_t bno, xfs_extlen_t len, + enum xbtree_recpacking *outcome); int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount); int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b28211d5a4c..5a945ae21b5d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -156,9 +156,12 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_fsblock_t fsbno; + xfs_inobt_mod_blockcount(cur, -1); - return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1, + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, &XFS_RMAP_OINFO_INOBT, resv); } @@ -266,10 +269,13 @@ STATIC int64_t xfs_inobt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->inobt.ir_startino); + return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - - be32_to_cpu(k2->inobt.ir_startino); + be32_to_cpu(k2->inobt.ir_startino); } static xfs_failaddr_t @@ -380,6 +386,19 @@ xfs_inobt_recs_inorder( be32_to_cpu(r2->inobt.ir_startino); } +STATIC enum xbtree_key_contig +xfs_inobt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || 
mask->inobt.ir_startino); + + return xbtree_key_contig(be32_to_cpu(key1->inobt.ir_startino), + be32_to_cpu(key2->inobt.ir_startino)); +} + static const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), @@ -399,6 +418,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, + .keys_contiguous = xfs_inobt_keys_contiguous, }; static const struct xfs_btree_ops xfs_finobt_ops = { @@ -420,6 +440,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, + .keys_contiguous = xfs_inobt_keys_contiguous, }; /* @@ -447,9 +468,7 @@ xfs_inobt_init_common( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; + cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } @@ -607,7 +626,7 @@ xfs_iallocbt_maxlevels_ondisk(void) */ uint64_t xfs_inobt_irec_to_allocmask( - struct xfs_inobt_rec_incore *rec) + const struct xfs_inobt_rec_incore *rec) { uint64_t bitmap = 0; uint64_t inodespbit; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index e859a6e05230..3262c3fe5ebe 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -53,7 +53,7 @@ struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ -uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); +uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec); #if defined(DEBUG) || defined(XFS_WARN) int xfs_inobt_rec_check_count(struct xfs_mount *, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 6b21760184d9..5a2e7ddfa76d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -140,7 +140,8 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(2)", dp, sizeof(*dp), fa); - return -EFSCORRUPTED; + return xfs_bmap_complain_bad_rec(ip, whichfork, + fa, &new); } xfs_iext_insert(ip, &icur, &new, state); @@ -226,10 +227,15 @@ xfs_iformat_data_fork( /* * Initialize the extent count early, as the per-format routines may - * depend on it. + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. */ ip->i_df.if_format = dip->di_format; ip->i_df.if_nextents = xfs_dfork_data_extents(dip); + smp_store_release(&ip->i_df.if_needextents, + ip->i_df.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -282,8 +288,17 @@ xfs_ifork_init_attr( enum xfs_dinode_fmt format, xfs_extnum_t nextents) { + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. 
+ */ ip->i_af.if_format = format; ip->i_af.if_nextents = nextents; + smp_store_release(&ip->i_af.if_needextents, + ip->i_af.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); } void diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index d3943d6ad0b9..96d307784c85 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -24,6 +24,7 @@ struct xfs_ifork { xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ + uint8_t if_needextents; /* extents have not been read */ }; /* @@ -260,9 +261,10 @@ int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, uint nr_to_add); /* returns true if the fork has extents but they are not read in yet. */ -static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) +static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp) { - return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0; + /* see xfs_iformat_{data,attr}_fork() for needextents semantics */ + return smp_load_acquire(&ifp->if_needextents) != 0; } #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bcf46aa0d08b..c1c65774dcc2 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -120,45 +120,41 @@ xfs_refcount_btrec_to_irec( irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_refcount_get_rec( +/* Simple checks for refcount records. */ +xfs_failaddr_t +xfs_refcount_check_irec( struct xfs_btree_cur *cur, - struct xfs_refcount_irec *irec, - int *stat) + const struct xfs_refcount_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || !*stat) - return error; - xfs_refcount_btrec_to_irec(rec, irec); if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) - goto out_bad_rec; + return __this_address; if (!xfs_refcount_check_domain(irec)) - goto out_bad_rec; + return __this_address; /* check for valid extent range, including overflow */ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) - goto out_bad_rec; + return __this_address; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) - goto out_bad_rec; + return __this_address; - trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec); - return 0; + return NULL; +} + +static inline int +xfs_refcount_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_refcount_irec *irec) +{ + struct xfs_mount *mp = cur->bc_mp; -out_bad_rec: xfs_warn(mp, - "Refcount BTree record corruption in AG %d detected!", - pag->pag_agno); + "Refcount BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -166,6 +162,32 @@ out_bad_rec: } /* + * Get the data from the pointed-to record. 
+ */ +int +xfs_refcount_get_rec( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + xfs_refcount_btrec_to_irec(rec, irec); + fa = xfs_refcount_check_irec(cur, irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, irec); + + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + return 0; +} + +/* * Update the record referred to by cur to the value given * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. @@ -1332,26 +1354,22 @@ xfs_refcount_finish_one( xfs_agblock_t bno; unsigned long nr_ops = 0; int shape_changes = 0; - struct xfs_perag *pag; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), ri->ri_blockcount); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { - error = -EIO; - goto out_drop; - } + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != pag) { + if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { nr_ops = rcur->bc_ag.refc.nr_ops; shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); @@ -1359,12 +1377,12 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, - &agbp); + error = xfs_alloc_read_agf(ri->ri_pag, tp, + XFS_ALLOC_FLAG_FREEING, &agbp); if (error) - goto out_drop; + return error; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); rcur->bc_ag.refc.nr_ops = nr_ops; rcur->bc_ag.refc.shape_changes = shape_changes; } @@ -1375,7 +1393,7 @@ xfs_refcount_finish_one( error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, XFS_REFCOUNT_ADJUST_INCREASE); if (error) - goto out_drop; + return error; if (ri->ri_blockcount > 0) error = xfs_refcount_continue_op(rcur, ri, bno); break; @@ -1383,31 +1401,29 @@ xfs_refcount_finish_one( error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, XFS_REFCOUNT_ADJUST_DECREASE); if (error) - goto out_drop; + return error; if (ri->ri_blockcount > 0) error = xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_ALLOC_COW: error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); if (error) - goto out_drop; + return error; ri->ri_blockcount = 0; break; case XFS_REFCOUNT_FREE_COW: error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); if (error) - goto out_drop; + return error; ri->ri_blockcount = 0; break; default: ASSERT(0); - error = -EFSCORRUPTED; + return -EFSCORRUPTED; } if (!error && ri->ri_blockcount > 0) - trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, + trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, ri->ri_blockcount); -out_drop: - xfs_perag_put(pag); return error; } @@ -1435,6 +1451,7 @@ __xfs_refcount_add( ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + xfs_refcount_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); } @@ -1876,7 +1893,8 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); 
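
A theme of the hunks above is lifetime management: the deferred refcount intent now carries its own perag reference (ri_pag), taken when the intent is created, so xfs_refcount_finish_one() no longer performs a get/put pair on every call — which is why the out_drop: error paths disappear and the function can return directly. A minimal sketch of that ownership transfer, with illustrative types rather than the kernel structures:

    #include <assert.h>

    struct group  { int refcount; };          /* stands in for a perag */
    struct intent { struct group *pag; };     /* stands in for the intent */

    static void intent_get_group(struct intent *ri, struct group *g)
    {
        g->refcount++;        /* pin the AG when the intent is created */
        ri->pag = g;
    }

    static void intent_finish(struct intent *ri)
    {
        /* borrow ri->pag; no per-call lookup, get, or put is needed */
        assert(ri->pag->refcount > 0);
    }

    static void intent_put_group(struct intent *ri)
    {
        ri->pag->refcount--;  /* drop the pin when the intent is freed */
        ri->pag = NULL;
    }

    int main(void)
    {
        struct group ag = { .refcount = 1 };
        struct intent ri;

        intent_get_group(&ri, &ag);
        intent_finish(&ri);
        intent_put_group(&ri);
        assert(ag.refcount == 1);
        return 0;
    }
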
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (XFS_IS_CORRUPT(cur->bc_mp, + if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); return -EFSCORRUPTED; @@ -1980,14 +1998,17 @@ out_free: return error; } -/* Is there a record covering a given extent? */ +/* + * Scan part of the keyspace of the refcount records and tell us if the area + * has no records, is fully mapped by records, or is partially filled. + */ int -xfs_refcount_has_record( +xfs_refcount_has_records( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { union xfs_btree_irec low; union xfs_btree_irec high; @@ -1998,7 +2019,7 @@ xfs_refcount_has_record( high.rc.rc_startblock = bno + len - 1; low.rc.rc_domain = high.rc.rc_domain = domain; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } int __init diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index c633477ce3ce..783cd89ca195 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -50,6 +50,7 @@ enum xfs_refcount_intent_type { struct xfs_refcount_intent { struct list_head ri_list; + struct xfs_perag *ri_pag; enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; @@ -67,6 +68,9 @@ xfs_refcount_check_domain( return true; } +void xfs_refcount_update_get_group(struct xfs_mount *mp, + struct xfs_refcount_intent *ri); + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -107,12 +111,14 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, +extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, - xfs_extlen_t len, bool *exists); + xfs_extlen_t len, enum xbtree_recpacking *outcome); union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index f3b860970b26..d4afc5f4e6a5 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,8 +112,9 @@ xfs_refcountbt_free_block( XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, - XFS_AG_RESV_METADATA); + error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); if (error) return error; @@ -201,10 +202,13 @@ STATIC int64_t xfs_refcountbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->refc.rc_startblock); + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - - be32_to_cpu(k2->refc.rc_startblock); + be32_to_cpu(k2->refc.rc_startblock); } STATIC 
xfs_failaddr_t @@ -299,6 +303,19 @@ xfs_refcountbt_recs_inorder( be32_to_cpu(r2->refc.rc_startblock); } +STATIC enum xbtree_key_contig +xfs_refcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + static const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), @@ -318,6 +335,7 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { .diff_two_keys = xfs_refcountbt_diff_two_keys, .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, + .keys_contiguous = xfs_refcountbt_keys_contiguous, }; /* @@ -339,10 +357,7 @@ xfs_refcountbt_init_common( cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; - + cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; cur->bc_ops = &xfs_refcountbt_ops; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index df720041cd3d..f4dc23b3b837 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -193,7 +193,7 @@ done: } /* Convert an internal btree record to an rmap record. */ -int +xfs_failaddr_t xfs_rmap_btrec_to_irec( const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) @@ -205,51 +205,74 @@ xfs_rmap_btrec_to_irec( irec); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_rmap_get_rec( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *irec, - int *stat) +/* Simple checks for rmap records. */ +xfs_failaddr_t +xfs_rmap_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || !*stat) - return error; - - if (xfs_rmap_btrec_to_irec(rec, irec)) - goto out_bad_rec; + struct xfs_mount *mp = cur->bc_mp; + bool is_inode; + bool is_unwritten; + bool is_bmbt; + bool is_attr; if (irec->rm_blockcount == 0) - goto out_bad_rec; + return __this_address; if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) { if (irec->rm_owner != XFS_RMAP_OWN_FS) - goto out_bad_rec; + return __this_address; if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) - goto out_bad_rec; + return __this_address; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(pag, irec->rm_startblock, - irec->rm_blockcount)) - goto out_bad_rec; + if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; } if (!(xfs_verify_ino(mp, irec->rm_owner) || (irec->rm_owner <= XFS_RMAP_OWN_FS && irec->rm_owner >= XFS_RMAP_OWN_MIN))) - goto out_bad_rec; + return __this_address; + + /* Check flags. 
*/ + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + if (is_bmbt && irec->rm_offset != 0) + return __this_address; + + if (!is_inode && irec->rm_offset != 0) + return __this_address; + + if (is_unwritten && (is_bmbt || !is_inode || is_attr)) + return __this_address; + + if (!is_inode && (is_bmbt || is_unwritten || is_attr)) + return __this_address; + + /* Check for a valid fork offset, if applicable. */ + if (is_inode && !is_bmbt && + !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) + return __this_address; + + return NULL; +} + +static inline int +xfs_rmap_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = cur->bc_mp; - return 0; -out_bad_rec: xfs_warn(mp, - "Reverse Mapping BTree record corruption in AG %d detected!", - pag->pag_agno); + "Reverse Mapping BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -257,6 +280,32 @@ out_bad_rec: return -EFSCORRUPTED; } +/* + * Get the data from the pointed-to record. + */ +int +xfs_rmap_get_rec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + fa = xfs_rmap_btrec_to_irec(rec, irec); + if (!fa) + fa = xfs_rmap_check_irec(cur, irec); + if (fa) + return xfs_rmap_complain_bad_rec(cur, fa, irec); + + return 0; +} + struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; @@ -2320,11 +2369,14 @@ xfs_rmap_query_range_helper( { struct xfs_rmap_query_range_info *query = priv; struct xfs_rmap_irec irec; - int error; + xfs_failaddr_t fa; + + fa = xfs_rmap_btrec_to_irec(rec, &irec); + if (!fa) + fa = xfs_rmap_check_irec(cur, &irec); + if (fa) + return xfs_rmap_complain_bad_rec(cur, fa, &irec); - error = xfs_rmap_btrec_to_irec(rec, &irec); - if (error) - return error; return query->fn(cur, &irec, query->priv); } @@ -2394,7 +2446,6 @@ xfs_rmap_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; @@ -2402,26 +2453,22 @@ xfs_rmap_finish_one( xfs_agblock_t bno; bool unwritten; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, + trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, ri->ri_bmap.br_state); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { - error = -EIO; - goto out_drop; - } - + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + return -EIO; /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != pag) { + if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2432,15 +2479,13 @@ xfs_rmap_finish_one( * rmapbt, because a shape change could cause us to * allocate blocks. 
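
The flag cross-checks added to xfs_rmap_check_irec() above amount to a small consistency table: only inode owners may carry an offset or the bmbt/attr/unwritten state, bmbt records have no offset, and unwritten never combines with bmbt or attr. A standalone mirror of those branches — same logic as the diff, standalone types for illustration:

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* returns true when the flag combination is self-consistent */
    static bool rmap_flags_consistent(bool is_inode, bool is_bmbt,
                                      bool is_attr, bool is_unwritten,
                                      uint64_t offset)
    {
        if (is_bmbt && offset != 0)
            return false;
        if (!is_inode && offset != 0)
            return false;
        if (is_unwritten && (is_bmbt || !is_inode || is_attr))
            return false;
        if (!is_inode && (is_bmbt || is_unwritten || is_attr))
            return false;
        return true;
    }

    int main(void)
    {
        /* AG metadata owner with the unwritten flag: inconsistent (0) */
        printf("%d\n", rmap_flags_consistent(false, false, false, true, 0));
        /* inode data-fork extent at file offset 100: fine (1) */
        printf("%d\n", rmap_flags_consistent(true, false, false, false, 100));
        return 0;
    }
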
*/ - error = xfs_free_extent_fix_freelist(tp, pag, &agbp); + error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); if (error) - goto out_drop; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { - error = -EFSCORRUPTED; - goto out_drop; - } + return error; + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) + return -EFSCORRUPTED; - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } *pcur = rcur; @@ -2480,8 +2525,7 @@ xfs_rmap_finish_one( ASSERT(0); error = -EFSCORRUPTED; } -out_drop: - xfs_perag_put(pag); + return error; } @@ -2526,6 +2570,7 @@ __xfs_rmap_add( ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + xfs_rmap_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); } @@ -2664,14 +2709,21 @@ xfs_rmap_compare( return 0; } -/* Is there a record covering a given extent? */ +/* + * Scan the physical storage part of the keyspace of the reverse mapping index + * and tell us if the area has no records, is fully mapped by records, or is + * partially filled. + */ int -xfs_rmap_has_record( +xfs_rmap_has_records( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { + union xfs_btree_key mask = { + .rmap.rm_startblock = cpu_to_be32(-1U), + }; union xfs_btree_irec low; union xfs_btree_irec high; @@ -2680,68 +2732,144 @@ xfs_rmap_has_record( memset(&high, 0xFF, sizeof(high)); high.r.rm_startblock = bno + len - 1; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, &mask, outcome); } -/* - * Is there a record for this owner completely covering a given physical - * extent? If so, *has_rmap will be set to true. If there is no record - * or the record only covers part of the range, we set *has_rmap to false. - * This function doesn't perform range lookups or offset checks, so it is - * not suitable for checking data fork blocks. - */ -int -xfs_rmap_record_exists( - struct xfs_btree_cur *cur, +struct xfs_rmap_ownercount { + /* Owner that we're looking for. */ + struct xfs_rmap_irec good; + + /* rmap search keys */ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + + struct xfs_rmap_matches *results; + + /* Stop early if we find a nonmatch? */ + bool stop_on_nonmatch; +}; + +/* Does this rmap represent space that can have multiple owners? 
*/ +static inline bool +xfs_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + if (!xfs_has_reflink(mp)) + return false; + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK)) + return false; + return true; +} + +static inline void +xfs_rmap_ownercount_init( + struct xfs_rmap_ownercount *roc, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap) + struct xfs_rmap_matches *results) { - uint64_t owner; - uint64_t offset; - unsigned int flags; - int has_record; - struct xfs_rmap_irec irec; - int error; + memset(roc, 0, sizeof(*roc)); + roc->results = results; + + roc->low.rm_startblock = bno; + memset(&roc->high, 0xFF, sizeof(roc->high)); + roc->high.rm_startblock = bno + len - 1; + + memset(results, 0, sizeof(*results)); + roc->good.rm_startblock = bno; + roc->good.rm_blockcount = len; + roc->good.rm_owner = oinfo->oi_owner; + roc->good.rm_offset = oinfo->oi_offset; + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + roc->good.rm_flags |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + roc->good.rm_flags |= XFS_RMAP_BMBT_BLOCK; +} - xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); - ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || - (flags & XFS_RMAP_BMBT_BLOCK)); +/* Figure out if this is a match for the owner. */ +STATIC int +xfs_rmap_count_owners_helper( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_rmap_ownercount *roc = priv; + struct xfs_rmap_irec check = *rec; + unsigned int keyflags; + bool filedata; + int64_t delta; + + filedata = !XFS_RMAP_NON_INODE_OWNER(check.rm_owner) && + !(check.rm_flags & XFS_RMAP_BMBT_BLOCK); + + /* Trim the part of check that comes before the comparison range. */ + delta = (int64_t)roc->good.rm_startblock - check.rm_startblock; + if (delta > 0) { + check.rm_startblock += delta; + check.rm_blockcount -= delta; + if (filedata) + check.rm_offset += delta; + } - error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, - &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; + /* Trim the part of check that comes after the comparison range. */ + delta = (check.rm_startblock + check.rm_blockcount) - + (roc->good.rm_startblock + roc->good.rm_blockcount); + if (delta > 0) + check.rm_blockcount -= delta; + + /* Don't care about unwritten status for establishing ownership. */ + keyflags = check.rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK); + + if (check.rm_startblock == roc->good.rm_startblock && + check.rm_blockcount == roc->good.rm_blockcount && + check.rm_owner == roc->good.rm_owner && + check.rm_offset == roc->good.rm_offset && + keyflags == roc->good.rm_flags) { + roc->results->matches++; + } else { + roc->results->non_owner_matches++; + if (xfs_rmap_shareable(cur->bc_mp, &roc->good) ^ + xfs_rmap_shareable(cur->bc_mp, &check)) + roc->results->bad_non_owner_matches++; } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && - irec.rm_startblock + irec.rm_blockcount >= bno + len); + if (roc->results->non_owner_matches && roc->stop_on_nonmatch) + return -ECANCELED; + return 0; } -struct xfs_rmap_key_state { - uint64_t owner; - uint64_t offset; - unsigned int flags; -}; - -/* For each rmap given, figure out if it doesn't match the key we want. */ -STATIC int -xfs_rmap_has_other_keys_helper( +/* Count the number of owners and non-owners of this range of blocks. 
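
xfs_rmap_count_owners_helper() above trims each candidate record to the query window before comparing it field-by-field against the expected owner (for file data, rm_offset is bumped in step with the left trim). A worked example of the delta arithmetic with made-up numbers — record [95, 115) trimmed to window [100, 110):

    #include <stdio.h>
    #include <stdint.h>

    struct span { uint64_t start, count; };

    /* trim rec to the query window, mirroring the two delta computations */
    static void trim_to_window(struct span *rec, const struct span *win)
    {
        int64_t delta;

        delta = (int64_t)win->start - (int64_t)rec->start;
        if (delta > 0) {              /* cut the part before the window */
            rec->start += delta;
            rec->count -= delta;
        }
        delta = (int64_t)(rec->start + rec->count) -
                (int64_t)(win->start + win->count);
        if (delta > 0)                /* cut the part after the window */
            rec->count -= delta;
    }

    int main(void)
    {
        struct span rec = { 95, 20 }, win = { 100, 10 };

        trim_to_window(&rec, &win);
        /* prints start=100 count=10: exactly the window */
        printf("start=%llu count=%llu\n", (unsigned long long)rec.start,
               (unsigned long long)rec.count);
        return 0;
    }
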
*/ +int +xfs_rmap_count_owners( struct xfs_btree_cur *cur, - const struct xfs_rmap_irec *rec, - void *priv) + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + struct xfs_rmap_matches *results) { - struct xfs_rmap_key_state *rks = priv; + struct xfs_rmap_ownercount roc; + int error; - if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && - ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) - return 0; - return -ECANCELED; + xfs_rmap_ownercount_init(&roc, bno, len, oinfo, results); + error = xfs_rmap_query_range(cur, &roc.low, &roc.high, + xfs_rmap_count_owners_helper, &roc); + if (error) + return error; + + /* + * There can't be any non-owner rmaps that conflict with the given + * owner if we didn't find any rmaps matching the owner. + */ + if (!results->matches) + results->bad_non_owner_matches = 0; + + return 0; } /* @@ -2754,28 +2882,26 @@ xfs_rmap_has_other_keys( xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap) + bool *has_other) { - struct xfs_rmap_irec low = {0}; - struct xfs_rmap_irec high; - struct xfs_rmap_key_state rks; + struct xfs_rmap_matches res; + struct xfs_rmap_ownercount roc; int error; - xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - *has_rmap = false; - - low.rm_startblock = bno; - memset(&high, 0xFF, sizeof(high)); - high.rm_startblock = bno + len - 1; + xfs_rmap_ownercount_init(&roc, bno, len, oinfo, &res); + roc.stop_on_nonmatch = true; - error = xfs_rmap_query_range(cur, &low, &high, - xfs_rmap_has_other_keys_helper, &rks); + error = xfs_rmap_query_range(cur, &roc.low, &roc.high, + xfs_rmap_count_owners_helper, &roc); if (error == -ECANCELED) { - *has_rmap = true; + *has_other = true; return 0; } + if (error) + return error; - return error; + *has_other = false; + return 0; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 2dac88cea28d..3c98d9d50afb 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -62,13 +62,14 @@ xfs_rmap_irec_offset_pack( return x; } -static inline int +static inline xfs_failaddr_t xfs_rmap_irec_offset_unpack( __u64 offset, struct xfs_rmap_irec *irec) { if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) - return -EFSCORRUPTED; + return __this_address; + irec->rm_offset = XFS_RMAP_OFF(offset); irec->rm_flags = 0; if (offset & XFS_RMAP_OFF_ATTR_FORK) @@ -77,7 +78,7 @@ xfs_rmap_irec_offset_unpack( irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; if (offset & XFS_RMAP_OFF_UNWRITTEN) irec->rm_flags |= XFS_RMAP_UNWRITTEN; - return 0; + return NULL; } static inline void @@ -162,8 +163,12 @@ struct xfs_rmap_intent { int ri_whichfork; uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; + struct xfs_perag *ri_pag; }; +void xfs_rmap_update_get_group(struct xfs_mount *mp, + struct xfs_rmap_intent *ri); + /* functions for updating the rmapbt based on bmbt map/unmap operations */ void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); @@ -188,16 +193,31 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); union xfs_btree_rec; -int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, +xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); -int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, bool 
*exists); -int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, +xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec); + +int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, enum xbtree_recpacking *outcome); + +struct xfs_rmap_matches { + /* Number of owner matches. */ + unsigned long long matches; + + /* Number of non-owner matches. */ + unsigned long long non_owner_matches; + + /* Number of non-owner matches that conflict with the owner matches. */ + unsigned long long bad_non_owner_matches; +}; + +int xfs_rmap_count_owners(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap); + struct xfs_rmap_matches *rmatch); int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap); + bool *has_other); int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index d3285684bb5e..6c81b20e97d2 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -156,6 +156,16 @@ xfs_rmapbt_get_maxrecs( return cur->bc_mp->m_rmap_mxr[level != 0]; } +/* + * Convert the ondisk record's offset field into the ondisk key's offset field. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) +{ + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); +} + STATIC void xfs_rmapbt_init_key_from_rec( union xfs_btree_key *key, @@ -163,7 +173,7 @@ xfs_rmapbt_init_key_from_rec( { key->rmap.rm_startblock = rec->rmap.rm_startblock; key->rmap.rm_owner = rec->rmap.rm_owner; - key->rmap.rm_offset = rec->rmap.rm_offset; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); } /* @@ -186,7 +196,7 @@ xfs_rmapbt_init_high_key_from_rec( key->rmap.rm_startblock = rec->rmap.rm_startblock; be32_add_cpu(&key->rmap.rm_startblock, adj); key->rmap.rm_owner = rec->rmap.rm_owner; - key->rmap.rm_offset = rec->rmap.rm_offset; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) return; @@ -219,6 +229,16 @@ xfs_rmapbt_init_ptr_from_cur( ptr->s = agf->agf_roots[cur->bc_btnum]; } +/* + * Mask the appropriate parts of the ondisk key field for a key comparison. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. 
+ */ +static inline uint64_t offset_keymask(uint64_t offset) +{ + return offset & ~XFS_RMAP_OFF_UNWRITTEN; +} + STATIC int64_t xfs_rmapbt_key_diff( struct xfs_btree_cur *cur, @@ -240,8 +260,8 @@ xfs_rmapbt_key_diff( else if (y > x) return -1; - x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); - y = rec->rm_offset; + x = offset_keymask(be64_to_cpu(kp->rm_offset)); + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); if (x > y) return 1; else if (y > x) @@ -253,31 +273,43 @@ STATIC int64_t xfs_rmapbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { const struct xfs_rmap_key *kp1 = &k1->rmap; const struct xfs_rmap_key *kp2 = &k2->rmap; int64_t d; __u64 x, y; + /* Doesn't make sense to mask off the physical space part */ + ASSERT(!mask || mask->rmap.rm_startblock); + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - - be32_to_cpu(kp2->rm_startblock); + be32_to_cpu(kp2->rm_startblock); if (d) return d; - x = be64_to_cpu(kp1->rm_owner); - y = be64_to_cpu(kp2->rm_owner); - if (x > y) - return 1; - else if (y > x) - return -1; + if (!mask || mask->rmap.rm_owner) { + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + if (!mask || mask->rmap.rm_offset) { + /* Doesn't make sense to allow offset but not owner */ + ASSERT(!mask || mask->rmap.rm_owner); + + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + } - x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); - y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); - if (x > y) - return 1; - else if (y > x) - return -1; return 0; } @@ -387,8 +419,8 @@ xfs_rmapbt_keys_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); if (a <= b) return 1; return 0; @@ -417,13 +449,33 @@ xfs_rmapbt_recs_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); if (a <= b) return 1; return 0; } +STATIC enum xbtree_key_contig +xfs_rmapbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->rmap.rm_startblock); + + /* + * We only support checking contiguity of the physical space component. + * If any callers ever need more specificity than that, they'll have to + * implement it here. 
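
offset_keymask() above strips the unwritten bit because written/unwritten state is a record attribute, not part of the btree key: two keys differing only in that bit must compare equal. A sketch with an illustrative flag bit position (the real XFS_RMAP_OFF_UNWRITTEN value lives in the on-disk format headers):

    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_OFF_UNWRITTEN	(1ULL << 61)	/* illustrative position */

    static uint64_t offset_keymask(uint64_t off)
    {
        return off & ~DEMO_OFF_UNWRITTEN;	/* not a key field */
    }

    int main(void)
    {
        uint64_t a = 0x1000;
        uint64_t b = 0x1000 | DEMO_OFF_UNWRITTEN;

        /* raw compare differs; masked compare treats them as one key */
        printf("raw=%d masked=%d\n", a == b,
               offset_keymask(a) == offset_keymask(b));
        return 0;
    }
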
+ */ + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); + + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), + be32_to_cpu(key2->rmap.rm_startblock)); +} + static const struct xfs_btree_ops xfs_rmapbt_ops = { .rec_len = sizeof(struct xfs_rmap_rec), .key_len = 2 * sizeof(struct xfs_rmap_key), @@ -443,6 +495,7 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .diff_two_keys = xfs_rmapbt_diff_two_keys, .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, + .keys_contiguous = xfs_rmapbt_keys_contiguous, }; static struct xfs_btree_cur * @@ -460,10 +513,7 @@ xfs_rmapbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ops = &xfs_rmapbt_ops; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; - + cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 99cc03a298e2..ba0f17bc1dc0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -72,7 +72,8 @@ xfs_sb_validate_v5_features( } /* - * We support all XFS versions newer than a v4 superblock with V2 directories. + * We current support XFS v5 formats with known features and v4 superblocks with + * at least V2 directories. */ bool xfs_sb_good_version( @@ -86,16 +87,16 @@ xfs_sb_good_version( if (xfs_sb_is_v5(sbp)) return xfs_sb_validate_v5_features(sbp); + /* versions prior to v4 are not supported */ + if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4) + return false; + /* We must not have any unknown v4 feature bits set */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) return false; - /* versions prior to v4 are not supported */ - if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) - return false; - /* V4 filesystems need v2 directories and unwritten extents */ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 5ebdda7e1078..851220021484 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -204,6 +204,18 @@ enum xfs_ag_resv_type { XFS_AG_RESV_RMAPBT, }; +/* Results of scanning a btree keyspace to check occupancy. */ +enum xbtree_recpacking { + /* None of the keyspace maps to records. */ + XBTREE_RECPACKING_EMPTY = 0, + + /* Some, but not all, of the keyspace maps to records. */ + XBTREE_RECPACKING_SPARSE, + + /* The entire keyspace maps to records. */ + XBTREE_RECPACKING_FULL, +}; + /* * Type verifier functions */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 4dd52b15f09c..6c6e5eba42c8 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -18,6 +18,15 @@ #include "scrub/scrub.h" #include "scrub/common.h" +int +xchk_setup_agheader( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_fs(sc); +} + /* Superblock */ /* Cross-reference with the other btrees. 
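
The new enum xbtree_recpacking above turns the old boolean "is there a record?" query into a tri-state keyspace scan. How scrub consumes the three outcomes, as a sketch with illustrative helper names — the cross-referencing hunks that follow apply exactly this kind of logic:

    #include <stdio.h>

    /* mirrors enum xbtree_recpacking; helper names are illustrative */
    enum recpacking { RECPACKING_EMPTY, RECPACKING_SPARSE, RECPACKING_FULL };

    /* space believed allocated must be absent from the free-space btree:
     * anything other than "empty" is a corruption signal */
    static int used_space_ok(enum recpacking outcome)
    {
        return outcome == RECPACKING_EMPTY;
    }

    /* space that must be covered end to end (e.g. by owner records) is
     * only clean when the scan reports full packing */
    static int fully_mapped_ok(enum recpacking outcome)
    {
        return outcome == RECPACKING_FULL;
    }

    int main(void)
    {
        printf("%d %d\n", used_space_ok(RECPACKING_SPARSE),
               fully_mapped_ok(RECPACKING_FULL));	/* prints: 0 1 */
        return 0;
    }
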
*/ @@ -42,8 +51,9 @@ xchk_superblock_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); /* scrub teardown will take care of sc->sa for us */ } @@ -505,9 +515,10 @@ xchk_agf_xref( xchk_agf_xref_freeblks(sc); xchk_agf_xref_cntbt(sc); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_agf_xref_btreeblks(sc); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_agf_xref_refcblks(sc); /* scrub teardown will take care of sc->sa for us */ @@ -633,8 +644,9 @@ xchk_agfl_block_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); } /* Scrub an AGFL block. */ @@ -689,8 +701,9 @@ xchk_agfl_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); /* * Scrub teardown will take care of sc->sa for us. Leave sc->sa @@ -844,8 +857,9 @@ xchk_agi_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); xchk_agi_xref_icounts(sc); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_agi_xref_fiblocks(sc); /* scrub teardown will take care of sc->sa for us */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index c37e6d72760b..bbaa65422c4f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -487,10 +487,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - struct xrep_agfl *ra, uint64_t start, - uint64_t len) + uint64_t len, + void *priv) { + struct xrep_agfl *ra = priv; xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -538,7 +539,6 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; - struct xbitmap_range *br, *n; int error; ra.sc = sc; @@ -579,11 +579,7 @@ xrep_agfl_collect_blocks( /* Strike out the blocks that are cross-linked. 
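
xrep_agfl_check_extent() is reshaped above from the body of a foreach macro into an xbitmap_walk() callback taking (start, len, priv); the walk call that drives it appears in the next hunk. The inversion in miniature, with a toy extent store standing in for the interval tree:

    #include <stdint.h>
    #include <stdio.h>

    typedef int (*walk_fn)(uint64_t start, uint64_t len, void *priv);

    /* toy extent store; callers no longer see the bitmap's internals */
    struct bitmap {
        struct { uint64_t start, len; } ext[4];
        int nr;
    };

    static int bitmap_walk(struct bitmap *b, walk_fn fn, void *priv)
    {
        int i, ret;

        for (i = 0; i < b->nr; i++) {
            ret = fn(b->ext[i].start, b->ext[i].len, priv);
            if (ret)
                return ret;	/* callbacks can abort the walk */
        }
        return 0;
    }

    /* the old foreach body becomes a named callback with a priv cursor */
    static int check_extent(uint64_t start, uint64_t len, void *priv)
    {
        int *seen = priv;

        (*seen)++;
        printf("extent [%llu, +%llu)\n", (unsigned long long)start,
               (unsigned long long)len);
        return 0;
    }

    int main(void)
    {
        struct bitmap b = { .ext = { { 10, 5 }, { 32, 8 } }, .nr = 2 };
        int seen = 0;

        return bitmap_walk(&b, check_extent, &seen);
    }
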
*/ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); - for_each_xbitmap_extent(br, n, agfl_extents) { - error = xrep_agfl_check_extent(&ra, br->start, br->len); - if (error) - break; - } + error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); xfs_btree_del_cursor(ra.rmap_cur, error); if (error) goto out_bmp; @@ -629,21 +625,58 @@ xrep_agfl_update_agf( XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); } +struct xrep_agfl_fill { + struct xbitmap used_extents; + struct xfs_scrub *sc; + __be32 *agfl_bno; + xfs_agblock_t flcount; + unsigned int fl_off; +}; + +/* Fill the AGFL with whatever blocks are in this extent. */ +static int +xrep_agfl_fill( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xrep_agfl_fill *af = priv; + struct xfs_scrub *sc = af->sc; + xfs_fsblock_t fsbno = start; + int error; + + while (fsbno < start + len && af->fl_off < af->flcount) + af->agfl_bno[af->fl_off++] = + cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++)); + + trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno, + XFS_FSB_TO_AGBNO(sc->mp, start), len); + + error = xbitmap_set(&af->used_extents, start, fsbno - 1); + if (error) + return error; + + if (af->fl_off == af->flcount) + return -ECANCELED; + + return 0; +} + /* Write out a totally new AGFL. */ -STATIC void +STATIC int xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, struct xbitmap *agfl_extents, xfs_agblock_t flcount) { + struct xrep_agfl_fill af = { + .sc = sc, + .flcount = flcount, + }; struct xfs_mount *mp = sc->mp; - __be32 *agfl_bno; - struct xbitmap_range *br; - struct xbitmap_range *n; struct xfs_agfl *agfl; - xfs_agblock_t agbno; - unsigned int fl_off; + int error; ASSERT(flcount <= xfs_agfl_size(mp)); @@ -662,36 +695,18 @@ xrep_agfl_init_header( * blocks than fit in the AGFL, they will be freed in a subsequent * step. */ - fl_off = 0; - agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); - for_each_xbitmap_extent(br, n, agfl_extents) { - agbno = XFS_FSB_TO_AGBNO(mp, br->start); - - trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno, - br->len); - - while (br->len > 0 && fl_off < flcount) { - agfl_bno[fl_off] = cpu_to_be32(agbno); - fl_off++; - agbno++; - - /* - * We've now used br->start by putting it in the AGFL, - * so bump br so that we don't reap the block later. - */ - br->start++; - br->len--; - } - - if (br->len) - break; - list_del(&br->list); - kfree(br); - } + xbitmap_init(&af.used_extents); + af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), + xbitmap_walk(agfl_extents, xrep_agfl_fill, &af); + error = xbitmap_disunion(agfl_extents, &af.used_extents); + if (error) + return error; /* Write new AGFL to disk. */ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF); xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1); + xbitmap_destroy(&af.used_extents); + return 0; } /* Repair the AGFL. */ @@ -744,7 +759,9 @@ xrep_agfl( * buffers until we know that part works. */ xrep_agfl_update_agf(sc, agf_bp, flcount); - xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount); + error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount); + if (error) + goto err; /* * Ok, the AGFL should be ready to go now. Roll the transaction to diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 3b38f4e2a537..279af72b1671 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -24,10 +24,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } /* Free space btree scrubber. */ + +struct xchk_alloc { + /* Previous free space extent. */ + struct xfs_alloc_rec_incore prev; +}; + /* * Ensure there's a corresponding cntbt/bnobt record matching this * bnobt/cntbt record, respectively. @@ -75,9 +84,11 @@ xchk_allocbt_xref_other( STATIC void xchk_allocbt_xref( struct xfs_scrub *sc, - xfs_agblock_t agbno, - xfs_extlen_t len) + const struct xfs_alloc_rec_incore *irec) { + xfs_agblock_t agbno = irec->ar_startblock; + xfs_extlen_t len = irec->ar_blockcount; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; @@ -85,25 +96,44 @@ xchk_allocbt_xref( xchk_xref_is_not_inode_chunk(sc, agbno, len); xchk_xref_has_no_owner(sc, agbno, len); xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_allocbt_mergeable( + struct xchk_btree *bs, + struct xchk_alloc *ca, + const struct xfs_alloc_rec_incore *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (ca->prev.ar_blockcount > 0 && + ca->prev.ar_startblock + ca->prev.ar_blockcount == irec->ar_startblock && + ca->prev.ar_blockcount + irec->ar_blockcount < (uint32_t)~0U) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&ca->prev, irec, sizeof(*irec)); } /* Scrub a bnobt/cntbt record. */ STATIC int xchk_allocbt_rec( - struct xchk_btree *bs, - const union xfs_btree_rec *rec) + struct xchk_btree *bs, + const union xfs_btree_rec *rec) { - struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; - xfs_extlen_t len; + struct xfs_alloc_rec_incore irec; + struct xchk_alloc *ca = bs->private; - bno = be32_to_cpu(rec->alloc.ar_startblock); - len = be32_to_cpu(rec->alloc.ar_blockcount); - - if (!xfs_verify_agbext(pag, bno, len)) + xfs_alloc_btrec_to_irec(rec, &irec); + if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } - xchk_allocbt_xref(bs->sc, bno, len); + xchk_allocbt_mergeable(bs, ca, &irec); + xchk_allocbt_xref(bs->sc, &irec); return 0; } @@ -114,10 +144,11 @@ xchk_allocbt( struct xfs_scrub *sc, xfs_btnum_t which) { + struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; cur = which == XFS_BTNUM_BNO ? 
sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } int @@ -141,15 +172,15 @@ xchk_xref_is_used_space( xfs_agblock_t agbno, xfs_extlen_t len) { - bool is_freesp; + enum xbtree_recpacking outcome; int error; if (!sc->sa.bno_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur)) return; - if (is_freesp) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0); } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 31529b9bf389..6c16d9530cca 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -15,11 +15,51 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/attr.h" +/* Free the buffers linked from the xattr buffer. */ +static void +xchk_xattr_buf_cleanup( + void *priv) +{ + struct xchk_xattr_buf *ab = priv; + + kvfree(ab->freemap); + ab->freemap = NULL; + kvfree(ab->usedmap); + ab->usedmap = NULL; + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; +} + +/* + * Allocate the free space bitmap if we're trying harder; there are leaf blocks + * in the attr fork; or we can't tell if there are leaf blocks. + */ +static inline bool +xchk_xattr_want_freemap( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + if (sc->flags & XCHK_TRY_HARDER) + return true; + + if (!sc->ip) + return true; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (!ifp) + return false; + + return xfs_ifork_has_extents(ifp); +} + /* * Allocate enough memory to hold an attr value and attr block bitmaps, * reallocating the buffer if necessary. Buffer contents are not preserved @@ -28,41 +68,49 @@ static int xchk_setup_xattr_buf( struct xfs_scrub *sc, - size_t value_size, - gfp_t flags) + size_t value_size) { - size_t sz; + size_t bmp_sz; struct xchk_xattr_buf *ab = sc->buf; + void *new_val; - /* - * We need enough space to read an xattr value from the file or enough - * space to hold three copies of the xattr free space bitmap. We don't - * need the buffer space for both purposes at the same time. - */ - sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); - sz = max_t(size_t, sz, value_size); + bmp_sz = sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); - /* - * If there's already a buffer, figure out if we need to reallocate it - * to accommodate a larger size. - */ - if (ab) { - if (sz <= ab->sz) - return 0; - kvfree(ab); - sc->buf = NULL; - } + if (ab) + goto resize_value; - /* - * Don't zero the buffer upon allocation to avoid runtime overhead. - * All users must be careful never to read uninitialized contents. 
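
xchk_xattr_buf_cleanup() above is registered as sc->buf_cleanup so teardown can free the separately allocated usedmap/freemap/value buffers linked from the scratch buffer, instead of assuming one flat allocation. The hook pattern, sketched with illustrative types (not the kernel's struct xfs_scrub):

    #include <stdlib.h>

    struct scrub_ctx {
        void  *buf;                     /* scratch buffer, if any */
        void (*buf_cleanup)(void *buf); /* frees buffers linked from buf */
    };

    struct xattr_buf { void *usedmap; void *freemap; void *value; };

    static void xattr_buf_cleanup(void *priv)
    {
        struct xattr_buf *ab = priv;

        free(ab->freemap);
        free(ab->usedmap);
        free(ab->value);
    }

    static void scrub_teardown(struct scrub_ctx *sc)
    {
        if (sc->buf) {
            if (sc->buf_cleanup)
                sc->buf_cleanup(sc->buf);	/* linked pieces first */
            free(sc->buf);
            sc->buf = NULL;
            sc->buf_cleanup = NULL;
        }
    }

    int main(void)
    {
        struct scrub_ctx sc = {
            .buf = calloc(1, sizeof(struct xattr_buf)),
            .buf_cleanup = xattr_buf_cleanup,
        };

        scrub_teardown(&sc);
        return 0;
    }
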
- */ - ab = kvmalloc(sizeof(*ab) + sz, flags); + ab = kvzalloc(sizeof(struct xchk_xattr_buf), XCHK_GFP_FLAGS); if (!ab) return -ENOMEM; - - ab->sz = sz; sc->buf = ab; + sc->buf_cleanup = xchk_xattr_buf_cleanup; + + ab->usedmap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->usedmap) + return -ENOMEM; + + if (xchk_xattr_want_freemap(sc)) { + ab->freemap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->freemap) + return -ENOMEM; + } + +resize_value: + if (ab->value_sz >= value_size) + return 0; + + if (ab->value) { + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; + } + + new_val = kvmalloc(value_size, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + ab->value = new_val; + ab->value_sz = value_size; return 0; } @@ -79,8 +127,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, - XCHK_GFP_FLAGS); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX); if (error) return error; } @@ -111,11 +158,24 @@ xchk_xattr_listent( int namelen, int valuelen) { + struct xfs_da_args args = { + .op_flags = XFS_DA_OP_NOTIME, + .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = context->dp->i_mount->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = context->dp, + .name = name, + .namelen = namelen, + .hashval = xfs_da_hashname(name, namelen), + .trans = context->tp, + .valuelen = valuelen, + }; + struct xchk_xattr_buf *ab; struct xchk_xattr *sx; - struct xfs_da_args args = { NULL }; int error = 0; sx = container_of(context, struct xchk_xattr, context); + ab = sx->sc->buf; if (xchk_should_terminate(sx->sc, &error)) { context->seen_enough = error; @@ -128,18 +188,32 @@ xchk_xattr_listent( return; } + /* Only one namespace bit allowed. */ + if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + goto fail_xref; + } + /* Does this name make sense? */ if (!xfs_attr_namecheck(name, namelen)) { xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - return; + goto fail_xref; } /* + * Local xattr values are stored in the attr leaf block, so we don't + * need to retrieve the value from a remote block to detect corruption + * problems. + */ + if (flags & XFS_ATTR_LOCAL) + goto fail_xref; + + /* * Try to allocate enough memory to extrat the attr value. If that * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. 
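
The resize_value path above allocates the bitmaps once and grows only the value buffer on demand, freeing the old buffer before allocating the new one so that at most one copy exists at a time; an -ENOMEM here is mapped to -EDEADLOCK below so scrub retries with XCHK_TRY_HARDER, which preallocates XATTR_SIZE_MAX up front. The grow-only core as a sketch with an illustrative struct:

    #include <stdlib.h>

    struct value_buf { void *value; size_t value_sz; };

    /* grow-only; contents need not survive the resize */
    static int resize_value(struct value_buf *ab, size_t want)
    {
        void *p;

        if (ab->value_sz >= want)
            return 0;		/* big enough already: reuse */

        /* free first so peak memory stays at one buffer */
        free(ab->value);
        ab->value = NULL;
        ab->value_sz = 0;

        p = malloc(want);
        if (!p)
            return -1;		/* caller may retry or back off */
        ab->value = p;
        ab->value_sz = want;
        return 0;
    }

    int main(void)
    {
        struct value_buf ab = { 0 };

        if (resize_value(&ab, 64) || resize_value(&ab, 32))
            return 1;		/* second call is a no-op */
        free(ab.value);
        return 0;
    }
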
*/ - error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); + error = xchk_setup_xattr_buf(sx->sc, valuelen); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -147,17 +221,7 @@ xchk_xattr_listent( return; } - args.op_flags = XFS_DA_OP_NOTIME; - args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK; - args.geo = context->dp->i_mount->m_attr_geo; - args.whichfork = XFS_ATTR_FORK; - args.dp = context->dp; - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.trans = context->tp; - args.value = xchk_xattr_valuebuf(sx->sc); - args.valuelen = valuelen; + args.value = ab->value; error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ @@ -213,25 +277,23 @@ xchk_xattr_set_map( STATIC bool xchk_xattr_check_freemap( struct xfs_scrub *sc, - unsigned long *map, struct xfs_attr3_icleaf_hdr *leafhdr) { - unsigned long *freemap = xchk_xattr_freemap(sc); - unsigned long *dstmap = xchk_xattr_dstmap(sc); + struct xchk_xattr_buf *ab = sc->buf; unsigned int mapsize = sc->mp->m_attr_geo->blksize; int i; /* Construct bitmap of freemap contents. */ - bitmap_zero(freemap, mapsize); + bitmap_zero(ab->freemap, mapsize); for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { - if (!xchk_xattr_set_map(sc, freemap, + if (!xchk_xattr_set_map(sc, ab->freemap, leafhdr->freemap[i].base, leafhdr->freemap[i].size)) return false; } /* Look for bits that are set in freemap and are marked in use. */ - return bitmap_and(dstmap, freemap, map, mapsize) == 0; + return !bitmap_intersects(ab->freemap, ab->usedmap, mapsize); } /* @@ -251,7 +313,7 @@ xchk_xattr_entry( __u32 *last_hashval) { struct xfs_mount *mp = ds->state->mp; - unsigned long *usedmap = xchk_xattr_usedmap(ds->sc); + struct xchk_xattr_buf *ab = ds->sc->buf; char *name_end; struct xfs_attr_leaf_name_local *lentry; struct xfs_attr_leaf_name_remote *rentry; @@ -291,7 +353,7 @@ xchk_xattr_entry( if (name_end > buf_end) xchk_da_set_corrupt(ds, level); - if (!xchk_xattr_set_map(ds->sc, usedmap, nameidx, namesize)) + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, nameidx, namesize)) xchk_da_set_corrupt(ds, level); if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) *usedbytes += namesize; @@ -311,35 +373,26 @@ xchk_xattr_block( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *ent; struct xfs_attr_leaf_entry *entries; - unsigned long *usedmap; + struct xchk_xattr_buf *ab = ds->sc->buf; char *buf_end; size_t off; __u32 last_hashval = 0; unsigned int usedbytes = 0; unsigned int hdrsize; int i; - int error; if (*last_checked == blk->blkno) return 0; - /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); - if (error == -ENOMEM) - return -EDEADLOCK; - if (error) - return error; - usedmap = xchk_xattr_usedmap(ds->sc); - *last_checked = blk->blkno; - bitmap_zero(usedmap, mp->m_attr_geo->blksize); + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); /* Check all the padding. 
*/ if (xfs_has_crc(ds->sc->mp)) { - struct xfs_attr3_leafblock *leaf = bp->b_addr; + struct xfs_attr3_leafblock *leaf3 = bp->b_addr; - if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 || - leaf->hdr.info.hdr.pad != 0) + if (leaf3->hdr.pad1 != 0 || leaf3->hdr.pad2 != 0 || + leaf3->hdr.info.hdr.pad != 0) xchk_da_set_corrupt(ds, level); } else { if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0) @@ -356,7 +409,7 @@ xchk_xattr_block( xchk_da_set_corrupt(ds, level); if (leafhdr.firstused < hdrsize) xchk_da_set_corrupt(ds, level); - if (!xchk_xattr_set_map(ds->sc, usedmap, 0, hdrsize)) + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) xchk_da_set_corrupt(ds, level); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -370,7 +423,7 @@ xchk_xattr_block( for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { /* Mark the leaf entry itself. */ off = (char *)ent - (char *)leaf; - if (!xchk_xattr_set_map(ds->sc, usedmap, off, + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, off, sizeof(xfs_attr_leaf_entry_t))) { xchk_da_set_corrupt(ds, level); goto out; @@ -384,7 +437,7 @@ xchk_xattr_block( goto out; } - if (!xchk_xattr_check_freemap(ds->sc, usedmap, &leafhdr)) + if (!xchk_xattr_check_freemap(ds->sc, &leafhdr)) xchk_da_set_corrupt(ds, level); if (leafhdr.usedbytes != usedbytes) @@ -468,38 +521,115 @@ out: return error; } +/* Check space usage of shortform attrs. */ +STATIC int +xchk_xattr_check_sf( + struct xfs_scrub *sc) +{ + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + + bitmap_zero(ab->usedmap, ifp->if_bytes); + sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; + end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + + sfe = &sf->list[0]; + if ((unsigned char *)sfe > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + for (i = 0; i < sf->hdr.count; i++) { + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)sf, + sizeof(struct xfs_attr_sf_entry))) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)name - (char *)sf, + sfe->namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)value - (char *)sf, + sfe->valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + sfe = next; + } + + return 0; +} + /* Scrub the extended attribute metadata. */ int xchk_xattr( struct xfs_scrub *sc) { - struct xchk_xattr sx; + struct xchk_xattr sx = { + .sc = sc, + .context = { + .dp = sc->ip, + .tp = sc->tp, + .resynch = 1, + .put_listent = xchk_xattr_listent, + .allow_incomplete = true, + }, + }; xfs_dablk_t last_checked = -1U; int error = 0; if (!xfs_inode_hasattr(sc->ip)) return -ENOENT; - memset(&sx, 0, sizeof(sx)); - /* Check attribute tree structure */ - error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec, - &last_checked); + /* Allocate memory for xattr checking. 
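
Both the leaf-block checks above and the new shortform walk below lean on the same trick: mark every region of the block or fork (header, entries, names, values) in a used-space map, and treat a doubly marked byte as two structures claiming the same space. A simplified byte-map version — the kernel uses real bitmaps via xchk_xattr_set_map():

    #include <stdio.h>
    #include <string.h>

    #define MAPSIZE 64

    /* mark [start, start+len) as used; refuse if out of bounds or if
     * any byte in the range is already claimed */
    static int set_map(unsigned char *map, int start, int len)
    {
        int i;

        if (start < 0 || len < 0 || start + len > MAPSIZE)
            return 0;			/* out of bounds */
        for (i = start; i < start + len; i++) {
            if (map[i])
                return 0;		/* overlap: corruption */
            map[i] = 1;
        }
        return 1;
    }

    int main(void)
    {
        unsigned char map[MAPSIZE];

        memset(map, 0, sizeof(map));
        printf("header ok=%d\n", set_map(map, 0, 16));
        printf("entry  ok=%d\n", set_map(map, 16, 8));
        printf("reuse  ok=%d\n", set_map(map, 20, 8)); /* 20..23 claimed twice */
        return 0;
    }
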
*/ + error = xchk_setup_xattr_buf(sc, 0); + if (error == -ENOMEM) + return -EDEADLOCK; if (error) - goto out; + return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + /* Check the physical structure of the xattr. */ + if (sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + error = xchk_xattr_check_sf(sc); + else + error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec, + &last_checked); + if (error) + return error; - /* Check that every attr key can also be looked up by hash. */ - sx.context.dp = sc->ip; - sx.context.resynch = 1; - sx.context.put_listent = xchk_xattr_listent; - sx.context.tp = sc->tp; - sx.context.allow_incomplete = true; - sx.sc = sc; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; /* - * Look up every xattr in this file by name. + * Look up every xattr in this file by name and hash. * * Use the backend implementation of xfs_attr_list to call * xchk_xattr_listent on every attribute key in this inode. @@ -516,11 +646,11 @@ xchk_xattr( */ error = xfs_attr_list_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) - goto out; + return error; /* Did our listent function try to return any errors? */ if (sx.context.seen_enough < 0) - error = sx.context.seen_enough; -out: - return error; + return sx.context.seen_enough; + + return 0; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 3590e10e3e62..48fd9402c432 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_ATTR_H__ #define __XFS_SCRUB_ATTR_H__ @@ -10,59 +10,15 @@ * Temporary storage for online scrub and repair of extended attributes. */ struct xchk_xattr_buf { - /* Size of @buf, in bytes. */ - size_t sz; + /* Bitmap of used space in xattr leaf blocks and shortform forks. */ + unsigned long *usedmap; - /* - * Memory buffer -- either used for extracting attr values while - * walking the attributes; or for computing attr block bitmaps when - * checking the attribute tree. - * - * Each bitmap contains enough bits to track every byte in an attr - * block (rounded up to the size of an unsigned long). The attr block - * used space bitmap starts at the beginning of the buffer; the free - * space bitmap follows immediately after; and we have a third buffer - * for storing intermediate bitmap results. - */ - uint8_t buf[]; -}; - -/* A place to store attribute values. */ -static inline uint8_t * -xchk_xattr_valuebuf( - struct xfs_scrub *sc) -{ - struct xchk_xattr_buf *ab = sc->buf; - - return ab->buf; -} - -/* A bitmap of space usage computed by walking an attr leaf block. */ -static inline unsigned long * -xchk_xattr_usedmap( - struct xfs_scrub *sc) -{ - struct xchk_xattr_buf *ab = sc->buf; + /* Bitmap of free space in xattr leaf blocks. */ + unsigned long *freemap; - return (unsigned long *)ab->buf; -} - -/* A bitmap of free space computed by walking attr leaf block free info. */ -static inline unsigned long * -xchk_xattr_freemap( - struct xfs_scrub *sc) -{ - return xchk_xattr_usedmap(sc) + - BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); -} - -/* A bitmap used to hold temporary results. 
*/ -static inline unsigned long * -xchk_xattr_dstmap( - struct xfs_scrub *sc) -{ - return xchk_xattr_freemap(sc) + - BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); -} + /* Memory buffer used to extract xattr values. */ + void *value; + size_t value_sz; +}; #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index a255f09e9f0a..0c959be396ea 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -1,11 +1,12 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -13,27 +14,160 @@ #include "scrub/scrub.h" #include "scrub/bitmap.h" +#include <linux/interval_tree_generic.h> + +struct xbitmap_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint64_t bn_start; + + /* Last set bit of this interval. */ + uint64_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint64_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint64_t parameters. */ + +#define START(node) ((node)->bn_start) +#define LAST(node) ((node)->bn_last) + /* - * Set a range of this bitmap. Caller must ensure the range is not set. - * - * This is the logical equivalent of bitmap |= mask(start, len). + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline void +xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline void +xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline struct xbitmap_node * +xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, + uint64_t last); + +static inline struct xbitmap_node * +xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, + uint64_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap_node, bn_rbnode)) + +/* Clear a range of this bitmap. 
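+ *
+ * This is the logical equivalent of bitmap &= ~mask(start, len).
+ * Clearing the middle of an existing extent splits it in two.  A small
+ * sketch with made-up values, purely for illustration:
+ *
+ *	xbitmap_set(bitmap, 0, 10);	=> extents: [0, 9]
+ *	xbitmap_clear(bitmap, 3, 4);	=> extents: [0, 2], [7, 9]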
*/ +int +xbitmap_clear( + struct xbitmap *bitmap, + uint64_t start, + uint64_t len) +{ + struct xbitmap_node *bn; + struct xbitmap_node *new_bn; + uint64_t last = start + len - 1; + + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint64_t old_last = bn->bn_last; + + /* overlaps with the entire clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; +} + +/* Set a range of this bitmap. */ int xbitmap_set( struct xbitmap *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_range *bmr; + struct xbitmap_node *left; + struct xbitmap_node *right; + uint64_t last = start + len - 1; + int error; - bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); - if (!bmr) - return -ENOMEM; + /* Is this whole range already set? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? 
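+	 * (Abutting extents are always coalesced below, so the tree never
+	 * holds two adjacent intervals; e.g., with made-up values, setting
+	 * [0, 4] and then [5, 9] leaves the single extent [0, 9].)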
*/ + right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); - INIT_LIST_HEAD(&bmr->list); - bmr->start = start; - bmr->len = len; - list_add_tail(&bmr->list, &bitmap->list); + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } return 0; } @@ -43,12 +177,11 @@ void xbitmap_destroy( struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; + struct xbitmap_node *bn; - for_each_xbitmap_extent(bmr, n, bitmap) { - list_del(&bmr->list); - kfree(bmr); + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); } } @@ -57,27 +190,7 @@ void xbitmap_init( struct xbitmap *bitmap) { - INIT_LIST_HEAD(&bitmap->list); -} - -/* Compare two btree extents. */ -static int -xbitmap_range_cmp( - void *priv, - const struct list_head *a, - const struct list_head *b) -{ - struct xbitmap_range *ap; - struct xbitmap_range *bp; - - ap = container_of(a, struct xbitmap_range, list); - bp = container_of(b, struct xbitmap_range, list); - - if (ap->start > bp->start) - return 1; - if (ap->start < bp->start) - return -1; - return 0; + bitmap->xb_root = RB_ROOT_CACHED; } /* @@ -94,118 +207,26 @@ xbitmap_range_cmp( * * This is the logical equivalent of bitmap &= ~sub. */ -#define LEFT_ALIGNED (1 << 0) -#define RIGHT_ALIGNED (1 << 1) int xbitmap_disunion( struct xbitmap *bitmap, struct xbitmap *sub) { - struct list_head *lp; - struct xbitmap_range *br; - struct xbitmap_range *new_br; - struct xbitmap_range *sub_br; - uint64_t sub_start; - uint64_t sub_len; - int state; - int error = 0; + struct xbitmap_node *bn; + int error; - if (list_empty(&bitmap->list) || list_empty(&sub->list)) + if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) return 0; - ASSERT(!list_empty(&sub->list)); - - list_sort(NULL, &bitmap->list, xbitmap_range_cmp); - list_sort(NULL, &sub->list, xbitmap_range_cmp); - - /* - * Now that we've sorted both lists, we iterate bitmap once, rolling - * forward through sub and/or bitmap as necessary until we find an - * overlap or reach the end of either list. We do not reset lp to the - * head of bitmap nor do we reset sub_br to the head of sub. The - * list traversal is similar to merge sort, but we're deleting - * instead. In this manner we avoid O(n^2) operations. - */ - sub_br = list_first_entry(&sub->list, struct xbitmap_range, - list); - lp = bitmap->list.next; - while (lp != &bitmap->list) { - br = list_entry(lp, struct xbitmap_range, list); - - /* - * Advance sub_br and/or br until we find a pair that - * intersect or we run out of extents. 
- */ - while (sub_br->start + sub_br->len <= br->start) { - if (list_is_last(&sub_br->list, &sub->list)) - goto out; - sub_br = list_next_entry(sub_br, list); - } - if (sub_br->start >= br->start + br->len) { - lp = lp->next; - continue; - } - /* trim sub_br to fit the extent we have */ - sub_start = sub_br->start; - sub_len = sub_br->len; - if (sub_br->start < br->start) { - sub_len -= br->start - sub_br->start; - sub_start = br->start; - } - if (sub_len > br->len) - sub_len = br->len; - - state = 0; - if (sub_start == br->start) - state |= LEFT_ALIGNED; - if (sub_start + sub_len == br->start + br->len) - state |= RIGHT_ALIGNED; - switch (state) { - case LEFT_ALIGNED: - /* Coincides with only the left. */ - br->start += sub_len; - br->len -= sub_len; - break; - case RIGHT_ALIGNED: - /* Coincides with only the right. */ - br->len -= sub_len; - lp = lp->next; - break; - case LEFT_ALIGNED | RIGHT_ALIGNED: - /* Total overlap, just delete ex. */ - lp = lp->next; - list_del(&br->list); - kfree(br); - break; - case 0: - /* - * Deleting from the middle: add the new right extent - * and then shrink the left extent. - */ - new_br = kmalloc(sizeof(struct xbitmap_range), - XCHK_GFP_FLAGS); - if (!new_br) { - error = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&new_br->list); - new_br->start = sub_start + sub_len; - new_br->len = br->start + br->len - new_br->start; - list_add(&new_br->list, &br->list); - br->len = sub_start - br->start; - lp = lp->next; - break; - default: - ASSERT(0); - break; - } + for_each_xbitmap_extent(bn, sub) { + error = xbitmap_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); + if (error) + return error; } -out: - return error; + return 0; } -#undef LEFT_ALIGNED -#undef RIGHT_ALIGNED /* * Record all btree blocks seen while iterating all records of a btree. @@ -242,6 +263,38 @@ out: * For the 300th record we just exit, with the list being [1, 4, 2, 3]. */ +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + /* * Record all the buffers pointed to by the btree cursor. Callers already * engaged in a btree walk should call this function to capture the list of @@ -304,12 +357,97 @@ uint64_t xbitmap_hweight( struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; + struct xbitmap_node *bn; uint64_t ret = 0; - for_each_xbitmap_extent(bmr, n, bitmap) - ret += bmr->len; + for_each_xbitmap_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; return ret; } + +/* Call a function for every run of set bits in this bitmap. 
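+ *
+ * The callback gets each run as a (start, len) pair.  A minimal sketch
+ * of a caller, with a hypothetical helper shown only for illustration:
+ *
+ *	static int count_runs(uint64_t start, uint64_t len, void *priv)
+ *	{
+ *		unsigned int	*nr = priv;
+ *
+ *		(*nr)++;
+ *		return 0;
+ *	}
+ *
+ *	unsigned int	nr = 0;
+ *
+ *	error = xbitmap_walk(bitmap, count_runs, &nr);
+ *
+ * A nonzero return value from the callback stops the walk and is passed
+ * straight back to the caller of xbitmap_walk.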
*/ +int +xbitmap_walk( + struct xbitmap *bitmap, + xbitmap_walk_fn fn, + void *priv) +{ + struct xbitmap_node *bn; + int error = 0; + + for_each_xbitmap_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +struct xbitmap_walk_bits { + xbitmap_walk_bits_fn fn; + void *priv; +}; + +/* Walk all the bits in a run. */ +static int +xbitmap_walk_bits_in_run( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xbitmap_walk_bits *wb = priv; + uint64_t i; + int error = 0; + + for (i = start; i < start + len; i++) { + error = wb->fn(i, wb->priv); + if (error) + break; + } + + return error; +} + +/* Call a function for every set bit in this bitmap. */ +int +xbitmap_walk_bits( + struct xbitmap *bitmap, + xbitmap_walk_bits_fn fn, + void *priv) +{ + struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv}; + + return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb); +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap_empty( + struct xbitmap *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap_test( + struct xbitmap *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 900646b72de1..84981724ecaf 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -1,31 +1,19 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap_range { - struct list_head list; - uint64_t start; - uint64_t len; -}; - struct xbitmap { - struct list_head list; + struct rb_root_cached xb_root; }; void xbitmap_init(struct xbitmap *bitmap); void xbitmap_destroy(struct xbitmap *bitmap); -#define for_each_xbitmap_extent(bex, n, bitmap) \ - list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) - -#define for_each_xbitmap_block(b, bex, n, bitmap) \ - list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \ - for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++) - +int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); int xbitmap_set_btcur_path(struct xbitmap *bitmap, @@ -34,4 +22,93 @@ int xbitmap_set_btblocks(struct xbitmap *bitmap, struct xfs_btree_cur *cur); uint64_t xbitmap_hweight(struct xbitmap *bitmap); +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. 
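+ *
+ * For example, a callback that is searching for something can return
+ * -ECANCELED as soon as it finds a match; the caller of the iterator
+ * then interprets -ECANCELED as "found" rather than as a real error.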
+ */ +typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, + void *priv); + +typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv); +int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn, + void *priv); + +bool xbitmap_empty(struct xbitmap *bitmap); +bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_set(&bitmap->agbitmap, start, len); +} + +static inline bool +xagb_bitmap_test( + struct xagb_bitmap *bitmap, + xfs_agblock_t start, + xfs_extlen_t *len) +{ + uint64_t biglen = *len; + bool ret; + + ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); + + if (start + biglen >= UINT_MAX) { + ASSERT(0); + biglen = UINT_MAX - start; + } + + *len = biglen; + return ret; +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap_walk_fn fn, void *priv) +{ + return xbitmap_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index dbbc7037074c..87ab9f95a487 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -31,12 +31,15 @@ xchk_setup_inode_bmap( { int error; - error = xchk_get_inode(sc); + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + error = xchk_iget_for_scrubbing(sc); if (error) goto out; - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, XFS_IOLOCK_EXCL); /* * We don't want any ephemeral data fork updates sitting around @@ -47,6 +50,9 @@ xchk_setup_inode_bmap( sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + sc->ilock_flags |= XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL); + inode_dio_wait(VFS_I(sc->ip)); /* @@ -90,11 +96,23 @@ out: struct xchk_bmap_info { struct xfs_scrub *sc; + + /* Incore extent tree cursor */ struct xfs_iext_cursor icur; - xfs_fileoff_t lastoff; + + /* Previous fork mapping that we examined */ + struct xfs_bmbt_irec prev_rec; + + /* Is this a realtime fork? */ bool is_rt; + + /* May mappings point to shared space? */ bool is_shared; + + /* Was the incore extent tree loaded? */ bool was_loaded; + + /* Which inode fork are we checking? */ int whichfork; }; @@ -147,49 +165,7 @@ xchk_bmap_get_rmap( return has_rmap; } -static inline bool -xchk_bmap_has_prev( - struct xchk_bmap_info *info, - struct xfs_bmbt_irec *irec) -{ - struct xfs_bmbt_irec got; - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); - - if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) - return false; - if (got.br_startoff + got.br_blockcount != irec->br_startoff) - return false; - if (got.br_startblock + got.br_blockcount != irec->br_startblock) - return false; - if (got.br_state != irec->br_state) - return false; - return true; -} - -static inline bool -xchk_bmap_has_next( - struct xchk_bmap_info *info, - struct xfs_bmbt_irec *irec) -{ - struct xfs_bmbt_irec got; - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); - - if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) - return false; - if (irec->br_startoff + irec->br_blockcount != got.br_startoff) - return false; - if (irec->br_startblock + irec->br_blockcount != got.br_startblock) - return false; - if (got.br_state != irec->br_state) - return false; - return true; -} - -/* Make sure that we have rmapbt records for this extent. */ +/* Make sure that we have rmapbt records for this data/attr fork extent. */ STATIC void xchk_bmap_xref_rmap( struct xchk_bmap_info *info, @@ -198,41 +174,39 @@ xchk_bmap_xref_rmap( { struct xfs_rmap_irec rmap; unsigned long long rmap_end; - uint64_t owner; + uint64_t owner = info->sc->ip->i_ino; if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) return; - if (info->whichfork == XFS_COW_FORK) - owner = XFS_RMAP_OWN_COW; - else - owner = info->sc->ip->i_ino; - /* Find the rmap record for this irec. */ if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) return; - /* Check the rmap. */ + /* + * The rmap must be an exact match for this incore file mapping record, + * which may have arisen from multiple ondisk records. 
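+	 * (The extent iterator in xchk_bmap combines logically and
+	 * physically contiguous incore mappings before cross-referencing,
+	 * so an rmap that covers several ondisk bmbt records still lines
+	 * up exactly with @irec.)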
+ */ + if (rmap.rm_startblock != agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap.rm_startblock > agbno || - agbno + irec->br_blockcount > rmap_end) + if (rmap_end != agbno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check the logical offsets if applicable. CoW staging extents - * don't track logical offsets since the mappings only exist in - * memory. - */ - if (info->whichfork != XFS_COW_FORK) { - rmap_end = (unsigned long long)rmap.rm_offset + - rmap.rm_blockcount; - if (rmap.rm_offset > irec->br_startoff || - irec->br_startoff + irec->br_blockcount > rmap_end) - xchk_fblock_xref_set_corrupt(info->sc, - info->whichfork, irec->br_startoff); - } + /* Check the logical offsets. */ + if (rmap.rm_offset != irec->br_startoff) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + rmap_end = (unsigned long long)rmap.rm_offset + rmap.rm_blockcount; + if (rmap_end != irec->br_startoff + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ if (rmap.rm_owner != owner) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -244,8 +218,7 @@ xchk_bmap_xref_rmap( * records because the blocks are owned (on-disk) by the refcountbt, * which doesn't track unwritten state. */ - if (owner != XFS_RMAP_OWN_COW && - !!(irec->br_state == XFS_EXT_UNWRITTEN) != + if (!!(irec->br_state == XFS_EXT_UNWRITTEN) != !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -257,34 +230,60 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); +} + +/* Make sure that we have rmapbt records for this COW fork extent. */ +STATIC void +xchk_bmap_xref_rmap_cow( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno) +{ + struct xfs_rmap_irec rmap; + unsigned long long rmap_end; + uint64_t owner = XFS_RMAP_OWN_COW; + + if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + return; + + /* Find the rmap record for this irec. */ + if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + return; /* - * If the rmap starts before this bmbt record, make sure there's a bmbt - * record for the previous offset that is contiguous with this mapping. - * Skip this for CoW fork extents because the refcount btree (and not - * the inode) is the ondisk owner for those extents. + * CoW staging extents are owned by the refcount btree, so the rmap + * can start before and end after the physical space allocated to this + * mapping. There are no offsets to check. 
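+	 * Hence the checks below only require that the rmap contain this
+	 * mapping (rm_startblock at or before agbno, and the rmap end at
+	 * or beyond the end of the mapping) instead of demanding an exact
+	 * match.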
*/ - if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && - !xchk_bmap_has_prev(info, irec)) { + if (rmap.rm_startblock > agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (rmap_end < agbno + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ + if (rmap.rm_owner != owner) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - return; - } /* - * If the rmap ends after this bmbt record, make sure there's a bmbt - * record for the next offset that is contiguous with this mapping. - * Skip this for CoW fork extents because the refcount btree (and not - * the inode) is the ondisk owner for those extents. + * No flags allowed. Note that the (in-memory) CoW fork distinguishes + * between unwritten and written extents, but we don't track that in + * the rmap records because the blocks are owned (on-disk) by the + * refcountbt, which doesn't track unwritten state. */ - rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (info->whichfork != XFS_COW_FORK && - rmap_end > agbno + irec->br_blockcount && - !xchk_bmap_has_next(info, irec)) { + if (rmap.rm_flags & XFS_RMAP_ATTR_FORK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_UNWRITTEN) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - return; - } } /* Cross-reference a single rtdev extent record. */ @@ -305,6 +304,7 @@ xchk_bmap_iextent_xref( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { + struct xfs_owner_info oinfo; struct xfs_mount *mp = info->sc->mp; xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -322,17 +322,35 @@ xchk_bmap_iextent_xref( xchk_xref_is_used_space(info->sc, agbno, len); xchk_xref_is_not_inode_chunk(info->sc, agbno, len); - xchk_bmap_xref_rmap(info, irec, agbno); switch (info->whichfork) { case XFS_DATA_FORK: - if (xfs_is_reflink_inode(info->sc->ip)) - break; - fallthrough; + xchk_bmap_xref_rmap(info, irec, agbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + } + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); + break; case XFS_ATTR_FORK: + xchk_bmap_xref_rmap(info, irec, agbno); + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &oinfo); xchk_xref_is_not_shared(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); break; case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, agbno); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &XFS_RMAP_OINFO_COW); xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); xchk_xref_is_not_shared(info->sc, agbno, @@ -382,7 +400,8 @@ xchk_bmap_iextent( * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. 
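	 * Because we compare against the end of the previous mapping, this
	 * also flags records that overlap their predecessor, not just
	 * records whose startoff moves backwards.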
*/ - if (irec->br_startoff < info->lastoff) + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -392,15 +411,7 @@ xchk_bmap_iextent( xchk_bmap_dirattr_extent(ip, info, irec); - /* There should never be a "hole" extent in either extent list. */ - if (irec->br_startblock == HOLESTARTBLOCK) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ - if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); if (info->is_rt && !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -468,6 +479,12 @@ xchk_bmapbt_rec( return 0; xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + if (xfs_bmap_validate_extent(ip, info->whichfork, &irec) != NULL) { + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; + } + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, &iext_irec) || irec.br_startoff != iext_irec.br_startoff || @@ -618,45 +635,57 @@ xchk_bmap_check_ag_rmaps( return error; } -/* Make sure each rmap has a corresponding bmbt entry. */ -STATIC int -xchk_bmap_check_rmaps( - struct xfs_scrub *sc, - int whichfork) +/* + * Decide if we want to walk every rmap btree in the fs to make sure that each + * rmap for this file fork has corresponding bmbt entries. + */ +static bool +xchk_bmap_want_check_rmaps( + struct xchk_bmap_info *info) { - struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); - struct xfs_perag *pag; - xfs_agnumber_t agno; - bool zero_size; - int error; + struct xfs_scrub *sc = info->sc; + struct xfs_ifork *ifp; - if (!xfs_has_rmapbt(sc->mp) || - whichfork == XFS_COW_FORK || - (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - return 0; + if (!xfs_has_rmapbt(sc->mp)) + return false; + if (info->whichfork == XFS_COW_FORK) + return false; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; /* Don't support realtime rmap checks yet. */ - if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) - return 0; - - ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); + if (info->is_rt) + return false; /* - * Only do this for complex maps that are in btree format, or for - * situations where we would seem to have a size but zero extents. - * The inode repair code can zap broken iforks, which means we have - * to flag this bmap as corrupt if there are rmaps that need to be - * reattached. + * The inode repair code zaps broken inode forks by resetting them back + * to EXTENTS format and zero extent records. If we encounter a fork + * in this state along with evidence that the fork isn't supposed to be + * empty, we need to scan the reverse mappings to decide if we're going + * to rebuild the fork. Data forks with nonzero file size are scanned. + * xattr forks are never empty of content, so they are always scanned. */ + ifp = xfs_ifork_ptr(sc->ip, info->whichfork); + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { + if (info->whichfork == XFS_DATA_FORK && + i_size_read(VFS_I(sc->ip)) == 0) + return false; - if (whichfork == XFS_DATA_FORK) - zero_size = i_size_read(VFS_I(sc->ip)) == 0; - else - zero_size = false; + return true; + } - if (ifp->if_format != XFS_DINODE_FMT_BTREE && - (zero_size || ifp->if_nextents > 0)) - return 0; + return false; +} + +/* Make sure each rmap has a corresponding bmbt entry. 
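+ *
+ * This walks the rmap btree of every AG in the filesystem, so it is
+ * expensive; callers gate it behind xchk_bmap_want_check_rmaps().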
*/ +STATIC int +xchk_bmap_check_rmaps( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); @@ -683,7 +712,8 @@ xchk_bmap_iextent_delalloc( * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. */ - if (irec->br_startoff < info->lastoff) + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -697,6 +727,101 @@ xchk_bmap_iextent_delalloc( irec->br_startoff); } +/* Decide if this individual fork mapping is ok. */ +static bool +xchk_bmap_iext_mapping( + struct xchk_bmap_info *info, + const struct xfs_bmbt_irec *irec) +{ + /* There should never be a "hole" extent in either extent list. */ + if (irec->br_startblock == HOLESTARTBLOCK) + return false; + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + return false; + return true; +} + +/* Are these two mappings contiguous with each other? */ +static inline bool +xchk_are_bmaps_contiguous( + const struct xfs_bmbt_irec *b1, + const struct xfs_bmbt_irec *b2) +{ + /* Don't try to combine unallocated mappings. */ + if (!xfs_bmap_is_real_extent(b1)) + return false; + if (!xfs_bmap_is_real_extent(b2)) + return false; + + /* Does b2 come right after b1 in the logical and physical range? */ + if (b1->br_startoff + b1->br_blockcount != b2->br_startoff) + return false; + if (b1->br_startblock + b1->br_blockcount != b2->br_startblock) + return false; + if (b1->br_state != b2->br_state) + return false; + return true; +} + +/* + * Walk the incore extent records, accumulating consecutive contiguous records + * into a single incore mapping. Returns true if @irec has been set to a + * mapping or false if there are no more mappings. Caller must ensure that + * @info.icur is zeroed before the first call. + */ +static int +xchk_bmap_iext_iter( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_filblks_t prev_len; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + /* Advance to the next iextent record and check the mapping. */ + xfs_iext_next(ifp, &info->icur); + if (!xfs_iext_get_extent(ifp, &info->icur, irec)) + return false; + + if (!xchk_bmap_iext_mapping(info, irec)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return false; + } + + /* + * Iterate subsequent iextent records and merge them with the one + * that we just read, if possible. + */ + prev_len = irec->br_blockcount; + while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { + if (!xchk_are_bmaps_contiguous(irec, &got)) + break; + + if (!xchk_bmap_iext_mapping(info, &got)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + got.br_startoff); + return false; + } + + /* + * Notify the user of mergeable records in the data or attr + * forks. CoW forks only exist in memory so we ignore them. + */ + if (info->whichfork != XFS_COW_FORK && + prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK) + xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + + irec->br_blockcount += got.br_blockcount; + prev_len = got.br_blockcount; + xfs_iext_next(ifp, &info->icur); + } + + return true; +} + /* * Scrub an inode fork's block mappings. * @@ -776,10 +901,15 @@ xchk_bmap( if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) goto out; - /* Scrub extent records. 
*/ - info.lastoff = 0; - ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &info.icur, &irec) { + /* + * Scrub extent records. We use a special iterator function here that + * combines adjacent mappings if they are logically and physically + * contiguous. For large allocations that require multiple bmbt + * records, this reduces the number of cross-referencing calls, which + * reduces runtime. Cross referencing with the rmap is simpler because + * the rmap must match the combined mapping exactly. + */ + while (xchk_bmap_iext_iter(&info, &irec)) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; @@ -794,12 +924,14 @@ xchk_bmap( xchk_bmap_iextent_delalloc(ip, &info, &irec); else xchk_bmap_iextent(ip, &info, &irec); - info.lastoff = irec.br_startoff + irec.br_blockcount; + memcpy(&info.prev_rec, &irec, sizeof(struct xfs_bmbt_irec)); } - error = xchk_bmap_check_rmaps(sc, whichfork); - if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) - goto out; + if (xchk_bmap_want_check_rmaps(&info)) { + error = xchk_bmap_check_rmaps(sc, whichfork); + if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) + goto out; + } out: return error; } diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 0fd36d5b4646..1935b9ce1885 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -36,6 +36,7 @@ __xchk_btree_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; @@ -118,6 +119,16 @@ xchk_btree_xref_set_corrupt( __return_address); } +void +xchk_btree_set_preen( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN, + __return_address); +} + /* * Make sure this record is in order and doesn't stray outside of the parent * keys. @@ -140,29 +151,30 @@ xchk_btree_rec( trace_xchk_btree_rec(bs->sc, cur, 0); - /* If this isn't the first record, are they in order? */ - if (cur->bc_levels[0].ptr > 1 && + /* Are all records across all record blocks in order? */ + if (bs->lastrec_valid && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) xchk_btree_set_corrupt(bs->sc, cur, 0); memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len); + bs->lastrec_valid = true; if (cur->bc_nlevels == 1) return; - /* Is this at least as large as the parent low key? */ + /* Is low_key(rec) at least as large as the parent low key? */ cur->bc_ops->init_key_from_rec(&key, rec); keyblock = xfs_btree_get_block(cur, 1, &bp); keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0) + if (xfs_btree_keycmp_lt(cur, &key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, 1); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) return; - /* Is this no larger than the parent high key? */ + /* Is high_key(rec) no larger than the parent high key? 
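+	 * Overlapping btrees (e.g. the rmap btree) store both a low key
+	 * and a high key in each parent block so that parents span the
+	 * full keyspace of their children, which is why both bounds get
+	 * checked here.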
*/ cur->bc_ops->init_high_key_from_rec(&hkey, rec); keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0) + if (xfs_btree_keycmp_lt(cur, keyp, &hkey)) xchk_btree_set_corrupt(bs->sc, cur, 1); } @@ -187,29 +199,30 @@ xchk_btree_key( trace_xchk_btree_key(bs->sc, cur, level); - /* If this isn't the first key, are they in order? */ - if (cur->bc_levels[level].ptr > 1 && - !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key)) + /* Are all low keys across all node blocks in order? */ + if (bs->lastkey[level - 1].valid && + !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key)) xchk_btree_set_corrupt(bs->sc, cur, level); - memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len); + memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len); + bs->lastkey[level - 1].valid = true; if (level + 1 >= cur->bc_nlevels) return; - /* Is this at least as large as the parent low key? */ + /* Is this block's low key at least as large as the parent low key? */ keyblock = xfs_btree_get_block(cur, level + 1, &bp); keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0) + if (xfs_btree_keycmp_lt(cur, key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, level); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) return; - /* Is this no larger than the parent high key? */ + /* Is this block's high key no larger than the parent high key? */ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block); keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0) + if (xfs_btree_keycmp_lt(cur, keyp, key)) xchk_btree_set_corrupt(bs->sc, cur, level); } @@ -389,7 +402,7 @@ xchk_btree_check_block_owner( if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) bs->cur = NULL; - xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo); + xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) bs->cur = NULL; @@ -519,6 +532,48 @@ xchk_btree_check_minrecs( } /* + * If this btree block has a parent, make sure that the parent's keys capture + * the keyspace contained in this block. + */ +STATIC void +xchk_btree_block_check_keys( + struct xchk_btree *bs, + int level, + struct xfs_btree_block *block) +{ + union xfs_btree_key block_key; + union xfs_btree_key *block_high_key; + union xfs_btree_key *parent_low_key, *parent_high_key; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *parent_block; + struct xfs_buf *bp; + + if (level == cur->bc_nlevels - 1) + return; + + xfs_btree_get_keys(cur, block, &block_key); + + /* Make sure the low key of this block matches the parent. */ + parent_block = xfs_btree_get_block(cur, level + 1, &bp); + parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, + parent_block); + if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return; + } + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Make sure the high key of this block matches the parent. */ + parent_high_key = xfs_btree_high_key_addr(cur, + cur->bc_levels[level + 1].ptr, parent_block); + block_high_key = xfs_btree_high_key_from_key(cur, &block_key); + if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); +} + +/* * Grab and scrub a btree block given a btree pointer. 
Returns block * and buffer pointers (if applicable) if they're ok to use. */ @@ -569,7 +624,12 @@ xchk_btree_get_block( * Check the block's siblings; this function absorbs error codes * for us. */ - return xchk_btree_block_check_siblings(bs, *pblock); + error = xchk_btree_block_check_siblings(bs, *pblock); + if (error) + return error; + + xchk_btree_block_check_keys(bs, level, *pblock); + return 0; } /* @@ -601,7 +661,7 @@ xchk_btree_block_keys( parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); - if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0) + if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) xchk_btree_set_corrupt(bs->sc, cur, 1); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) @@ -612,7 +672,7 @@ xchk_btree_block_keys( high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); - if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0) + if (xfs_btree_keycmp_ne(cur, high_bk, high_pk)) xchk_btree_set_corrupt(bs->sc, cur, 1); } diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index da61a53a0b61..9d7b9ee8bef4 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_BTREE_H__ #define __XFS_SCRUB_BTREE_H__ @@ -19,6 +19,8 @@ bool xchk_btree_xref_process_error(struct xfs_scrub *sc, /* Check for btree corruption. */ void xchk_btree_set_corrupt(struct xfs_scrub *sc, struct xfs_btree_cur *cur, int level); +void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level); /* Check for btree xref discrepancies. */ void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc, @@ -29,6 +31,11 @@ typedef int (*xchk_btree_rec_fn)( struct xchk_btree *bs, const union xfs_btree_rec *rec); +struct xchk_btree_key { + union xfs_btree_key key; + bool valid; +}; + struct xchk_btree { /* caller-provided scrub state */ struct xfs_scrub *sc; @@ -38,11 +45,12 @@ struct xchk_btree { void *private; /* internal scrub state */ + bool lastrec_valid; union xfs_btree_rec lastrec; struct list_head to_check; /* this element must come last! */ - union xfs_btree_key lastkey[]; + struct xchk_btree_key lastkey[]; }; /* diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 848a8e32e56f..9aa79665c608 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -75,6 +75,7 @@ __xchk_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry( sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), @@ -130,6 +131,7 @@ __xchk_fblock_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; @@ -396,26 +398,19 @@ want_ag_read_header_failure( } /* - * Grab the perag structure and all the headers for an AG. 
+ * Grab the AG header buffers for the attached perag structure. * * The headers should be released by xchk_ag_free, but as a fail safe we attach * all the buffers we grab to the scrub transaction so they'll all be freed - * when we cancel it. Returns ENOENT if we can't grab the perag structure. + * when we cancel it. */ -int -xchk_ag_read_headers( +static inline int +xchk_perag_read_headers( struct xfs_scrub *sc, - xfs_agnumber_t agno, struct xchk_ag *sa) { - struct xfs_mount *mp = sc->mp; int error; - ASSERT(!sa->pag); - sa->pag = xfs_perag_get(mp, agno); - if (!sa->pag) - return -ENOENT; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -427,6 +422,104 @@ xchk_ag_read_headers( return 0; } +/* + * Grab the AG headers for the attached perag structure and wait for pending + * intents to drain. + */ +static int +xchk_perag_drain_and_lock( + struct xfs_scrub *sc) +{ + struct xchk_ag *sa = &sc->sa; + int error = 0; + + ASSERT(sa->pag != NULL); + ASSERT(sa->agi_bp == NULL); + ASSERT(sa->agf_bp == NULL); + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + error = xchk_perag_read_headers(sc, sa); + if (error) + return error; + + /* + * If we've grabbed an inode for scrubbing then we assume that + * holding its ILOCK will suffice to coordinate with any intent + * chains involving this inode. + */ + if (sc->ip) + return 0; + + /* + * Decide if this AG is quiet enough for all metadata to be + * consistent with each other. XFS allows the AG header buffer + * locks to cycle across transaction rolls while processing + * chains of deferred ops, which means that there could be + * other threads in the middle of processing a chain of + * deferred ops. For regular operations we are careful about + * ordering operations to prevent collisions between threads + * (which is why we don't need a per-AG lock), but scrub and + * repair have to serialize against chained operations. + * + * We just locked all the AG headers buffers; now take a look + * to see if there are any intents in progress. If there are, + * drop the AG headers and wait for the intents to drain. + * Since we hold all the AG header locks for the duration of + * the scrub, this is the only time we have to sample the + * intents counter; any threads increasing it after this point + * can't possibly be in the middle of a chain of AG metadata + * updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. + */ + if (!xfs_perag_intent_busy(sa->pag)) + return 0; + + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_perag_intent_drain(sa->pag); + if (error == -ERESTARTSYS) + error = -EINTR; + } while (!error); + + return error; +} + +/* + * Grab the per-AG structure, grab all AG header buffers, and wait until there + * aren't any pending intents. Returns -ENOENT if we can't grab the perag + * structure. + */ +int +xchk_ag_read_headers( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(!sa->pag); + sa->pag = xfs_perag_get(mp, agno); + if (!sa->pag) + return -ENOENT; + + return xchk_perag_drain_and_lock(sc); +} + /* Release all the AG btree cursors. 
*/ void xchk_ag_btcur_free( @@ -550,6 +643,14 @@ xchk_ag_init( /* Per-scrubber setup functions */ +void +xchk_trans_cancel( + struct xfs_scrub *sc) +{ + xfs_trans_cancel(sc->tp); + sc->tp = NULL; +} + /* * Grab an empty transaction so that we can re-grab locked buffers if * one of our btrees turns out to be cyclic. @@ -625,80 +726,273 @@ xchk_checkpoint_log( return 0; } +/* Verify that an inode is allocated ondisk, then return its cached inode. */ +int +xchk_iget( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_inode **ipp) +{ + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); +} + /* - * Given an inode and the scrub control structure, grab either the - * inode referenced in the control structure or the inode passed in. - * The inode is not locked. + * Try to grab an inode in a manner that avoids races with physical inode + * allocation. If we can't, return the locked AGI buffer so that the caller + * can single-step the loading process to see where things went wrong. + * Callers must have a valid scrub transaction. + * + * If the iget succeeds, return 0, a NULL AGI, and the inode. + * + * If the iget fails, return the error, the locked AGI, and a NULL inode. This + * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are + * no longer allocated; or any other corruption or runtime error. + * + * If the AGI read fails, return the error, a NULL AGI, and NULL inode. + * + * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode. */ int -xchk_get_inode( +xchk_iget_agi( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_buf **agi_bpp, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_perag *pag; + int error; + + ASSERT(sc->tp != NULL); + +again: + *agi_bpp = NULL; + *ipp = NULL; + error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Attach the AGI buffer to the scrub transaction to avoid deadlocks + * in the iget cache miss path. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + xfs_perag_put(pag); + if (error) + return error; + + error = xfs_iget(mp, tp, inum, + XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + if (error == -EAGAIN) { + /* + * The inode may be in core but temporarily unavailable and may + * require the AGI buffer before it can be returned. Drop the + * AGI buffer and retry the lookup. + * + * Incore lookup will fail with EAGAIN on a cache hit if the + * inode is queued to the inactivation list. The inactivation + * worker may remove the inode from the unlinked list and hence + * needs the AGI. + * + * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN + * to allow inodegc to make progress and move the inode to + * IRECLAIMABLE state where xfs_iget will be able to return it + * again if it can lock the inode. + */ + xfs_trans_brelse(tp, *agi_bpp); + delay(1); + goto again; + } + if (error) + return error; + + /* We got the inode, so we can release the AGI. */ + ASSERT(*ipp != NULL); + xfs_trans_brelse(tp, *agi_bpp); + *agi_bpp = NULL; + return 0; +} + +/* Install an inode that we opened by handle for scrubbing. 
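+ *
+ * If the inode's generation number no longer matches the one in the
+ * scrub request, the handle is stale (the inode was probably freed and
+ * reallocated since the handle was constructed), so drop the inode and
+ * return -ENOENT.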
*/ +int +xchk_install_handle_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { + xchk_irele(sc, ip); + return -ENOENT; + } + + sc->ip = ip; + return 0; +} + +/* + * In preparation to scrub metadata structures that hang off of an inode, + * grab either the inode referenced in the scrub control structure or the + * inode passed in. If the inumber does not reference an allocated inode + * record, the function returns ENOENT to end the scrub early. The inode + * is not locked. + */ +int +xchk_iget_for_scrubbing( struct xfs_scrub *sc) { struct xfs_imap imap; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag; + struct xfs_buf *agi_bp; struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); struct xfs_inode *ip = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); int error; + ASSERT(sc->tp == NULL); + /* We want to scan the inode we already had opened. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { sc->ip = ip_in; return 0; } - /* Look up the inode, see if the generation number matches. */ + /* Reject internal metadata files and obviously bad inode numbers. */ if (xfs_internal_inum(mp, sc->sm->sm_ino)) return -ENOENT; - error = xfs_iget(mp, NULL, sc->sm->sm_ino, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); - switch (error) { - case -ENOENT: - /* Inode doesn't exist, just bail out. */ + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_inode(sc, ip); + if (error == -ENOENT) return error; - case 0: - /* Got an inode, continue. */ - break; - case -EINVAL: + if (error != -EINVAL) + goto out_error; + + /* + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is a + * record, and it says this inode is free. + * + * We want to look up this inode in the inobt to distinguish two + * scenarios: (1) the inobt says the inode is free, in which case + * there's nothing to do; and (2) the inobt says the inode is + * allocated, but loading it failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity + * in this AG. Retry the iget in case someone allocated a new inode + * after the first iget failed. + */ + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the inode, so install it. */ + xchk_trans_cancel(sc); + return xchk_install_handle_inode(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; + } + + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt thinks this the inode neither can exist inside the + * filesystem nor is allocated, return ENOENT to signal that the check + * can be skipped. + * + * If the lookup returns corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters any other error, exit to userspace. 
+ * + * If the lookup succeeds, something else must be very wrong in the fs + * such that setting up the incore inode failed in some strange way. + * Treat those as corruptions. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; + if (!error) + error = -EFSCORRUPTED; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; +} + +/* Release an inode, possibly dropping it in the process. */ +void +xchk_irele( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (current->journal_info != NULL) { + ASSERT(current->journal_info == sc->tp); + /* - * -EINVAL with IGET_UNTRUSTED could mean one of several - * things: userspace gave us an inode number that doesn't - * correspond to fs space, or doesn't have an inobt entry; - * or it could simply mean that the inode buffer failed the - * read verifiers. + * If we are in a transaction, we /cannot/ drop the inode + * ourselves, because the VFS will trigger writeback, which + * can require a transaction. Clear DONTCACHE to force the + * inode to the LRU, where someone else can take care of + * dropping it. * - * Try just the inode mapping lookup -- if it succeeds, then - * the inode buffer verifier failed and something needs fixing. - * Otherwise, we really couldn't find it so tell userspace - * that it no longer exists. + * Note that when we grabbed our reference to the inode, it + * could have had an active ref and DONTCACHE set if a sysadmin + * is trying to coerce a change in file access mode. icache + * hits do not clear DONTCACHE, so we must do it here. */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); - if (pag) { - error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); - xfs_perag_put(pag); - if (error) - return -ENOENT; - } - error = -EFSCORRUPTED; - fallthrough; - default: - trace_xchk_op_error(sc, - XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), - XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), - error, __return_address); - return error; - } - if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { - xfs_irele(ip); - return -ENOENT; + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); + } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { + /* + * If this is the last reference to the inode and the caller + * permits it, set DONTCACHE to avoid thrashing. + */ + d_mark_dontcache(VFS_I(ip)); } - sc->ip = ip; - return 0; + xfs_irele(ip); } -/* Set us up to scrub a file's contents. */ +/* + * Set us up to scrub metadata mapped by a file's fork. Callers must not use + * this to operate on user-accessible regular file data because the MMAPLOCK is + * not taken. + */ int xchk_setup_inode_contents( struct xfs_scrub *sc, @@ -706,13 +1000,14 @@ xchk_setup_inode_contents( { int error; - error = xchk_get_inode(sc); + error = xchk_iget_for_scrubbing(sc); if (error) return error; - /* Got the inode, lock it and we're ready to go. */ - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + /* Lock the inode so the VFS cannot touch this file. 
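+	 * Only the IOLOCK is taken; per the comment above, callers of this
+	 * function scrub fork metadata rather than user-accessible file
+	 * data, so the MMAPLOCK can be left alone.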
*/ + sc->ilock_flags = XFS_IOLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); + error = xchk_trans_alloc(sc, resblks); if (error) goto out; @@ -869,28 +1164,6 @@ xchk_metadata_inode_forks( return 0; } -/* - * Try to lock an inode in violation of the usual locking order rules. For - * example, trying to get the IOLOCK while in transaction context, or just - * plain breaking AG-order or inode-order inode locking rules. Either way, - * the only way to avoid an ABBA deadlock is to use trylock and back off if - * we can't. - */ -int -xchk_ilock_inverted( - struct xfs_inode *ip, - uint lock_mode) -{ - int i; - - for (i = 0; i < 20; i++) { - if (xfs_ilock_nowait(ip, lock_mode)) - return 0; - delay(1); - } - return -EDEADLOCK; -} - /* Pause background reaping of resources. */ void xchk_stop_reaping( @@ -916,3 +1189,25 @@ xchk_start_reaping( } sc->flags &= ~XCHK_REAPING_DISABLED; } + +/* + * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub + * operation. Callers must not hold any locks that intersect with the CPU + * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs + * to change kernel code. + */ +void +xchk_fsgates_enable( + struct xfs_scrub *sc, + unsigned int scrub_fsgates) +{ + ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL)); + ASSERT(!(sc->flags & scrub_fsgates)); + + trace_xchk_fsgates_enable(sc, scrub_fsgates); + + if (scrub_fsgates & XCHK_FSGATES_DRAIN) + xfs_drain_wait_enable(); + + sc->flags |= scrub_fsgates; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index b73648d81d23..18b5f2b62f13 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ @@ -32,6 +32,8 @@ xchk_should_terminate( } int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +void xchk_trans_cancel(struct xfs_scrub *sc); + bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, @@ -72,6 +74,7 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); /* Setup functions */ +int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); int xchk_setup_ag_allocbt(struct xfs_scrub *sc); int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); @@ -132,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); -int xchk_get_inode(struct xfs_scrub *sc); +int xchk_iget_for_scrubbing(struct xfs_scrub *sc); int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); +int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, + struct xfs_buf **agi_bpp, struct xfs_inode **ipp); +void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); +int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + /* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. @@ -147,8 +156,21 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) } int xchk_metadata_inode_forks(struct xfs_scrub *sc); -int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); void xchk_stop_reaping(struct xfs_scrub *sc); void xchk_start_reaping(struct xfs_scrub *sc); +/* + * Setting up a hook to wait for intents to drain is costly -- we have to take + * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it + * up, and again to tear it down. These costs add up quickly, so we only want + * to enable the drain waiter if the drain actually detected a conflict with + * running intent chains. + */ +static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) +{ + return sc->flags & XCHK_NEED_DRAIN; +} + +void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index d17cee177085..82b150d3b8b7 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -39,6 +39,7 @@ xchk_da_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index 1f3515c6d5a8..4f8c2138a1ec 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_DABTREE_H__ #define __XFS_SCRUB_DABTREE_H__ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index d1b0f23c2c59..0b491784b759 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -18,6 +18,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" +#include "scrub/readdir.h" /* Set us up to scrub directories. */ int @@ -31,168 +32,114 @@ xchk_setup_directory( /* Scrub a directory entry. */ -struct xchk_dir_ctx { - /* VFS fill-directory iterator */ - struct dir_context dir_iter; - - struct xfs_scrub *sc; -}; - -/* Check that an inode's mode matches a given DT_ type. */ -STATIC int +/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */ +STATIC void xchk_dir_check_ftype( - struct xchk_dir_ctx *sdc, + struct xfs_scrub *sc, xfs_fileoff_t offset, - xfs_ino_t inum, - int dtype) + struct xfs_inode *ip, + int ftype) { - struct xfs_mount *mp = sdc->sc->mp; - struct xfs_inode *ip; - int ino_dtype; - int error = 0; + struct xfs_mount *mp = sc->mp; if (!xfs_has_ftype(mp)) { - if (dtype != DT_UNKNOWN && dtype != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - goto out; - } - - /* - * Grab the inode pointed to by the dirent. We release the - * inode before we cancel the scrub transaction. Since we're - * don't know a priori that releasing the inode won't trigger - * eofblocks cleanup (which allocates what would be a nested - * transaction), we can't use DONTCACHE here because DONTCACHE - * inodes can trigger immediate inactive cleanup of the inode. - * - * If _iget returns -EINVAL or -ENOENT then the child inode number is - * garbage and the directory is corrupt. If the _iget returns - * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a - * cross referencing error. Any other error is an operational error. - */ - error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip); - if (error == -EINVAL || error == -ENOENT) { - error = -EFSCORRUPTED; - xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, 0, &error); - goto out; + if (ftype != XFS_DIR3_FT_UNKNOWN && ftype != XFS_DIR3_FT_DIR) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return; } - if (!xchk_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset, - &error)) - goto out; - /* Convert mode to the DT_* values that dir_emit uses. */ - ino_dtype = xfs_dir3_get_dtype(mp, - xfs_mode_to_ftype(VFS_I(ip)->i_mode)); - if (ino_dtype != dtype) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - xfs_irele(ip); -out: - return error; + if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* * Scrub a single directory entry. * - * We use the VFS directory iterator (i.e. readdir) to call this - * function for every directory entry in a directory. Once we're here, - * we check the inode number to make sure it's sane, then we check that - * we can look up this filename. Finally, we check the ftype. + * Check the inode number to make sure it's sane, then we check that we can + * look up this filename. 
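[Editor's sketch] The ftype check above boils down to comparing the dirent's on-disk type byte against a type derived from the child inode's i_mode. A self-contained rendering of that derivation; the FT_* constants below are illustrative copies of the on-disk XFS_DIR3_FT_* values (which follow the ext2 convention), so treat the exact numbers as assumptions.

#include <stdio.h>
#include <sys/stat.h>

/* Illustrative copies of the on-disk XFS_DIR3_FT_* values. */
enum {
	FT_UNKNOWN = 0, FT_REG_FILE, FT_DIR, FT_CHRDEV,
	FT_BLKDEV, FT_FIFO, FT_SOCK, FT_SYMLINK,
};

/* Userspace analogue of xfs_mode_to_ftype(). */
static int mode_to_ftype(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:	return FT_REG_FILE;
	case S_IFDIR:	return FT_DIR;
	case S_IFCHR:	return FT_CHRDEV;
	case S_IFBLK:	return FT_BLKDEV;
	case S_IFIFO:	return FT_FIFO;
	case S_IFSOCK:	return FT_SOCK;
	case S_IFLNK:	return FT_SYMLINK;
	default:	return FT_UNKNOWN;
	}
}

int main(void)
{
	/* A directory inode whose dirent claims FT_REG_FILE is corrupt. */
	printf("mismatch=%d\n",
	       mode_to_ftype(S_IFDIR | 0755) != FT_REG_FILE);
	return 0;
}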
Finally, we check the ftype. */ -STATIC bool +STATIC int xchk_dir_actor( - struct dir_context *dir_iter, - const char *name, - int namelen, - loff_t pos, - u64 ino, - unsigned type) + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) { - struct xfs_mount *mp; + struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip; - struct xchk_dir_ctx *sdc; - struct xfs_name xname; xfs_ino_t lookup_ino; xfs_dablk_t offset; - bool checked_ftype = false; int error = 0; - sdc = container_of(dir_iter, struct xchk_dir_ctx, dir_iter); - ip = sdc->sc->ip; - mp = ip->i_mount; offset = xfs_dir2_db_to_da(mp->m_dir_geo, - xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); + xfs_dir2_dataptr_to_db(mp->m_dir_geo, dapos)); - if (xchk_should_terminate(sdc->sc, &error)) - return !error; + if (xchk_should_terminate(sc, &error)) + return error; /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } /* Does this name make sense? */ - if (!xfs_dir2_namecheck(name, namelen)) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + if (!xfs_dir2_namecheck(name->name, name->len)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } - if (!strncmp(".", name, namelen)) { + if (!strncmp(".", name->name, name->len)) { /* If this is "." then check that the inum matches the dir. */ - if (xfs_has_ftype(mp) && type != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - checked_ftype = true; - if (ino != ip->i_ino) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - } else if (!strncmp("..", name, namelen)) { + if (ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else if (!strncmp("..", name->name, name->len)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. */ - if (xfs_has_ftype(mp) && type != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - checked_ftype = true; - if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); + if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* Verify that we can look up this name by hash. */ - xname.name = name; - xname.len = namelen; - xname.type = XFS_DIR3_FT_UNKNOWN; - - error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + error = xchk_dir_lookup(sc, dp, name, &lookup_ino); /* ENOENT means the hash lookup failed and the dir is corrupt */ if (error == -ENOENT) error = -EFSCORRUPTED; - if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, - &error)) + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) goto out; if (lookup_ino != ino) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } - /* Verify the file type. This function absorbs error codes. */ - if (!checked_ftype) { - error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type); - if (error) - goto out; - } -out: /* - * A negative error code returned here is supposed to cause the - * dir_emit caller (xfs_readdir) to abort the directory iteration - * and return zero to xchk_directory. + * Grab the inode pointed to by the dirent. 
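[Editor's sketch] Condensed, the special-entry rules above are two invariants: '.' must reference the directory that contains it, and '..' has a fixed expected value only in the root directory, where it must point back at the root. A tiny model of those checks, with hypothetical helper names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* '.' must reference the directory itself. */
static bool dot_ok(uint64_t dir_ino, uint64_t ent_ino)
{
	return ent_ino == dir_ino;
}

/* '..' is only pinned down in the root dir; elsewhere the parent
 * scrubber validates it. */
static bool dotdot_ok(uint64_t dir_ino, uint64_t root_ino, uint64_t ent_ino)
{
	if (dir_ino != root_ino)
		return true;
	return ent_ino == dir_ino;
}

int main(void)
{
	printf("dot=%d dotdot=%d\n",
	       dot_ok(128, 128), dotdot_ok(128, 128, 129));
	return 0;
}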
We release the inode + * before we cancel the scrub transaction. + * + * If _iget returns -EINVAL or -ENOENT then the child inode number is + * garbage and the directory is corrupt. If the _iget returns + * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a + * cross referencing error. Any other error is an operational error. */ - if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return false; - return !error; + error = xchk_iget(sc, ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + error = -EFSCORRUPTED; + xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); + goto out; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, offset, &error)) + goto out; + + xchk_dir_check_ftype(sc, offset, ip, name->type); + xchk_irele(sc, ip); +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return error; } /* Scrub a directory btree record. */ @@ -201,6 +148,7 @@ xchk_dir_rec( struct xchk_da_btree *ds, int level) { + struct xfs_name dname = { }; struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; struct xfs_mount *mp = ds->state->mp; struct xfs_inode *dp = ds->dargs.dp; @@ -297,7 +245,11 @@ xchk_dir_rec( xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); goto out_relse; } - calc_hash = xfs_da_hashname(dent->name, dent->namelen); + + /* Does the directory hash match? */ + dname.name = dent->name; + dname.len = dent->namelen; + calc_hash = xfs_dir2_hashname(mp, &dname); if (calc_hash != hash) xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); @@ -803,14 +755,7 @@ int xchk_directory( struct xfs_scrub *sc) { - struct xchk_dir_ctx sdc = { - .dir_iter.actor = xchk_dir_actor, - .dir_iter.pos = 0, - .sc = sc, - }; - size_t bufsize; - loff_t oldpos; - int error = 0; + int error; if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; @@ -818,7 +763,7 @@ xchk_directory( /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); - goto out; + return 0; } /* Check directory tree structure */ @@ -827,7 +772,7 @@ xchk_directory( return error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; + return 0; /* Check the freespace. */ error = xchk_directory_blocks(sc); @@ -835,44 +780,11 @@ xchk_directory( return error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - - /* - * Check that every dirent we see can also be looked up by hash. - * Userspace usually asks for a 32k buffer, so we will too. - */ - bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - sc->ip->i_disk_size); - - /* - * Look up every name in this directory by hash. - * - * Use the xfs_readdir function to call xchk_dir_actor on - * every directory entry in this directory. In _actor, we check - * the name, inode number, and ftype (if applicable) of the - * entry. xfs_readdir uses the VFS filldir functions to provide - * iteration context. - * - * The VFS grabs a read or write lock via i_rwsem before it reads - * or writes to a directory. If we've gotten this far we've - * already obtained IOLOCK_EXCL, which (since 4.10) is the same as - * getting a write lock on i_rwsem. Therefore, it is safe for us - * to drop the ILOCK here in order to reuse the _readdir and - * _dir_lookup routines, which do their own ILOCK locking. 
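[Editor's sketch] The hash comparison above is the load-bearing check: every dirent must be reachable through its name hash. For reference, here is a userspace rendering of the rolling hash that xfs_da_hashname() implements (the xfs_dir2_hashname() wrapper additionally lowercases the bytes first on ascii-ci filesystems); consider it a sketch to be compared against libxfs rather than authoritative.

#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Four name bytes per round, each shifted by a multiple of 7 bits. */
static uint32_t da_hashname(const uint8_t *name, int namelen)
{
	uint32_t hash;

	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
		       (name[3] << 0) ^ rol32(hash, 7 * 4);

	/* Fold in the 0-3 trailing bytes. */
	switch (namelen) {
	case 3:
		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
		       rol32(hash, 7 * 3);
	case 2:
		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
	case 1:
		return (name[0] << 0) ^ rol32(hash, 7 * 1);
	default:
		return hash;
	}
}

int main(void)
{
	printf("hash = 0x%x\n",
	       (unsigned int)da_hashname((const uint8_t *)"lost+found", 10));
	return 0;
}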
- */ - oldpos = 0; - sc->ilock_flags &= ~XFS_ILOCK_EXCL; - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); - while (true) { - error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, - &error)) - goto out; - if (oldpos == sdc.dir_iter.pos) - break; - oldpos = sdc.dir_iter.pos; - } + return 0; -out: + /* Look up every name in this directory by hash. */ + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); + if (error == -ECANCELED) + error = 0; return error; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index f0c7f41897b9..faa315be7978 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -130,6 +130,13 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; + /* + * If the AGF doesn't track btreeblks, we have to lock the AGF to count + * btree block usage by walking the actual btrees. + */ + if (!xfs_has_lazysbcount(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index aa65ec88a0c0..d2b2a1cb6533 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -1,12 +1,14 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index d0b938d3d028..66a273f8585b 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_HEALTH_H__ #define __XFS_SCRUB_HEALTH_H__ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index e312be7cd375..575f22a02ebe 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -32,6 +32,8 @@ int xchk_setup_ag_iallocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER); } @@ -49,83 +51,237 @@ struct xchk_iallocbt { }; /* - * If we're checking the finobt, cross-reference with the inobt. 
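[Editor's sketch] The rewritten xchk_directory() leans on a convention used throughout the new walker code: the dirent callback returns -ECANCELED to end the iteration early once corruption has been recorded, and the caller folds that code back to zero because an early stop is not a failure. A generic model of the pattern:

#include <errno.h>
#include <stdio.h>

typedef int (*visit_fn)(int item, void *priv);

/* Walk items until the callback asks to stop or reports an error. */
static int walk(const int *items, int n, visit_fn fn, void *priv)
{
	for (int i = 0; i < n; i++) {
		int error = fn(items[i], priv);
		if (error)
			return error;
	}
	return 0;
}

static int flag_negatives(int item, void *priv)
{
	int *found = priv;

	if (item < 0) {
		(*found)++;
		return -ECANCELED;	/* corruption noted; stop early */
	}
	return 0;
}

int main(void)
{
	int items[] = { 3, 1, -7, 4 }, found = 0;
	int error = walk(items, 4, flag_negatives, &found);

	if (error == -ECANCELED)	/* early stop is not a failure */
		error = 0;
	printf("found=%d error=%d\n", found, error);
	return 0;
}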
- * Otherwise we're checking the inobt; if there is an finobt, make sure - * we have a record or not depending on freecount. + * Does the finobt have a record for this inode with the same hole/free state? + * This is a bit complicated because of the following: + * + * - The finobt need not have a record if all inodes in the inobt record are + * allocated. + * - The finobt need not have a record if all inodes in the inobt record are + * free. + * - The finobt need not have a record if the inobt record says this is a hole. + * This likely doesn't happen in practice. */ -static inline void -xchk_iallocbt_chunk_xref_other( +STATIC int +xchk_inobt_xref_finobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + bool free, + bool hole) +{ + struct xfs_inobt_rec_incore frec; + struct xfs_btree_cur *cur = sc->sa.fino_cur; + bool ffree, fhole; + unsigned int frec_idx, fhole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &frec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (frec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's a finobt record; free and hole status must match. */ + frec_idx = agino - frec.ir_startino; + ffree = frec.ir_free & (1ULL << frec_idx); + fhole_idx = frec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec.ir_holemask & (1U << fhole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* inobt record is fully allocated */ + if (irec->ir_free == 0) + return 0; + + /* inobt record is totally unallocated */ + if (irec->ir_free == XFS_INOBT_ALL_FREE) + return 0; + + /* inobt record says this is a hole */ + if (hole) + return 0; + + /* finobt doesn't care about allocated inodes */ + if (!free) + return 0; + + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; +} + +/* + * Make sure that each inode of this part of an inobt record has the same + * sparse and free status as the finobt. + */ +STATIC void +xchk_inobt_chunk_xref_finobt( struct xfs_scrub *sc, struct xfs_inobt_rec_incore *irec, - xfs_agino_t agino) + xfs_agino_t agino, + unsigned int nr_inodes) { - struct xfs_btree_cur **pcur; - bool has_irec; + xfs_agino_t i; + unsigned int rec_idx; int error; - if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) - pcur = &sc->sa.ino_cur; - else - pcur = &sc->sa.fino_cur; - if (!(*pcur)) - return; - error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec); - if (!xchk_should_check_xref(sc, &error, pcur)) + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT); + + if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm)) return; - if (((irec->ir_freecount > 0 && !has_irec) || - (irec->ir_freecount == 0 && has_irec))) - xchk_btree_xref_set_corrupt(sc, *pcur, 0); + + for (i = agino, rec_idx = agino - irec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool free, hole; + unsigned int hole_idx; + + free = irec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec->ir_holemask & (1U << hole_idx); + + error = xchk_inobt_xref_finobt(sc, irec, i, free, hole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + } +} + +/* + * Does the inobt have a record for this inode with the same hole/free state? 
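[Editor's sketch] Both directions of this cross-reference decode per-inode state from a record the same way: bit i of ir_free marks inode i of the 64-inode chunk as free, and bit i/4 of ir_holemask marks its 4-inode group as a sparse hole. A standalone decoder using that geometry:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK        64
#define INODES_PER_HOLEMASK_BIT 4

/* Decode one inode's free/hole state from an inobt-style record. */
static void decode_irec(uint64_t ir_free, uint16_t ir_holemask,
			unsigned int rec_idx, bool *is_free, bool *is_hole)
{
	*is_free = (ir_free >> rec_idx) & 1;
	*is_hole = (ir_holemask >> (rec_idx / INODES_PER_HOLEMASK_BIT)) & 1;
}

int main(void)
{
	bool f, h;

	/* Inodes 0-3 are a sparse hole; inode 5 is free. */
	decode_irec(1ULL << 5, 0x0001, 5, &f, &h);
	printf("idx 5: free=%d hole=%d\n", f, h);
	decode_irec(1ULL << 5, 0x0001, 2, &f, &h);
	printf("idx 2: free=%d hole=%d\n", f, h);
	return 0;
}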
+ * The inobt must always have a record if there's a finobt record. + */ +STATIC int +xchk_finobt_xref_inobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *frec, + xfs_agino_t agino, + bool ffree, + bool fhole) +{ + struct xfs_inobt_rec_incore irec; + struct xfs_btree_cur *cur = sc->sa.ino_cur; + bool free, hole; + unsigned int rec_idx, hole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &irec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (irec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's an inobt record; free and hole status must match. */ + rec_idx = agino - irec.ir_startino; + free = irec.ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec.ir_holemask & (1U << hole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* finobt should never have a record for which the inobt does not */ + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; } -/* Cross-reference with the other btrees. */ +/* + * Make sure that each inode of this part of an finobt record has the same + * sparse and free status as the inobt. + */ STATIC void -xchk_iallocbt_chunk_xref( +xchk_finobt_chunk_xref_inobt( struct xfs_scrub *sc, - struct xfs_inobt_rec_incore *irec, + struct xfs_inobt_rec_incore *frec, xfs_agino_t agino, - xfs_agblock_t agbno, - xfs_extlen_t len) + unsigned int nr_inodes) { - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + xfs_agino_t i; + unsigned int rec_idx; + int error; + + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT); + + if (!sc->sa.ino_cur || xchk_skip_xref(sc->sm)) return; - xchk_xref_is_used_space(sc, agbno, len); - xchk_iallocbt_chunk_xref_other(sc, irec, agino); - xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); - xchk_xref_is_not_shared(sc, agbno, len); + for (i = agino, rec_idx = agino - frec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool ffree, fhole; + unsigned int hole_idx; + + ffree = frec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec->ir_holemask & (1U << hole_idx); + + error = xchk_finobt_xref_inobt(sc, frec, i, ffree, fhole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + } } -/* Is this chunk worth checking? */ +/* Is this chunk worth checking and cross-referencing? 
*/ STATIC bool xchk_iallocbt_chunk( struct xchk_btree *bs, struct xfs_inobt_rec_incore *irec, xfs_agino_t agino, - xfs_extlen_t len) + unsigned int nr_inodes) { + struct xfs_scrub *sc = bs->sc; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_extlen_t len; - bno = XFS_AGINO_TO_AGBNO(mp, agino); + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + len = XFS_B_TO_FSB(mp, nr_inodes * mp->m_sb.sb_inodesize); - if (!xfs_verify_agbext(pag, bno, len)) + if (!xfs_verify_agbext(pag, agbno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; + xchk_xref_is_used_space(sc, agbno, len); + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_inobt_chunk_xref_finobt(sc, irec, agino, nr_inodes); + else + xchk_finobt_chunk_xref_inobt(sc, irec, agino, nr_inodes); + xchk_xref_is_only_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); return true; } -/* Count the number of free inodes. */ -static unsigned int -xchk_iallocbt_freecount( - xfs_inofree_t freemask) -{ - BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64)); - return hweight64(freemask); -} - /* * Check that an inode's allocation status matches ir_free in the inobt * record. First we try querying the in-core inode state, and if the inode @@ -272,7 +428,7 @@ xchk_iallocbt_check_cluster( return 0; } - xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, + xchk_xref_is_only_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ @@ -420,36 +576,22 @@ xchk_iallocbt_rec( const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_perag *pag = bs->cur->bc_ag.pag; struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; xfs_agino_t agino; - xfs_extlen_t len; int holecount; int i; int error = 0; - unsigned int real_freecount; uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - - if (irec.ir_count > XFS_INODES_PER_CHUNK || - irec.ir_freecount > XFS_INODES_PER_CHUNK) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - - real_freecount = irec.ir_freecount + - (XFS_INODES_PER_CHUNK - irec.ir_count); - if (real_freecount != xchk_iallocbt_freecount(irec.ir_free)) + if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } agino = irec.ir_startino; - /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(pag, agino) || - !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) { - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - goto out; - } xchk_iallocbt_rec_alignment(bs, &irec); if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -459,12 +601,11 @@ xchk_iallocbt_rec( /* Handle non-sparse inodes */ if (!xfs_inobt_issparse(irec.ir_holemask)) { - len = XFS_B_TO_FSB(mp, - XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); if (irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) + if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_CHUNK)) goto out; goto check_clusters; } @@ -472,8 +613,6 @@ xchk_iallocbt_rec( /* Check each chunk of a sparse inode cluster. 
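[Editor's sketch] Note how the new chunk helper derives the extent length from the inode count instead of hardcoding it: nr_inodes times the inode size, rounded up to whole filesystem blocks, which is what XFS_B_TO_FSB does. The arithmetic, with an assumed geometry:

#include <stdint.h>
#include <stdio.h>

/* Round a byte count up to whole filesystem blocks. */
static uint64_t b_to_fsb(uint64_t bytes, uint32_t blocksize)
{
	return (bytes + blocksize - 1) / blocksize;
}

int main(void)
{
	uint32_t inodesize = 512, blocksize = 4096;

	/* A full 64-inode chunk covers 8 blocks at this geometry... */
	printf("full chunk:  %llu blocks\n",
	       (unsigned long long)b_to_fsb(64 * inodesize, blocksize));
	/* ...and one 4-inode holemask unit covers a single block. */
	printf("sparse unit: %llu blocks\n",
	       (unsigned long long)b_to_fsb(4 * inodesize, blocksize));
	return 0;
}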
*/ holemask = irec.ir_holemask; holecount = 0; - len = XFS_B_TO_FSB(mp, - XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); holes = ~xfs_inobt_irec_to_allocmask(&irec); if ((holes & irec.ir_free) != holes || irec.ir_freecount > irec.ir_count) @@ -482,8 +621,9 @@ xchk_iallocbt_rec( for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { if (holemask & 1) holecount += XFS_INODES_PER_HOLEMASK_BIT; - else if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) - break; + else if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_HOLEMASK_BIT)) + goto out; holemask >>= 1; agino += XFS_INODES_PER_HOLEMASK_BIT; } @@ -493,6 +633,9 @@ xchk_iallocbt_rec( xchk_btree_set_corrupt(bs->sc, bs->cur, 0); check_clusters: + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -622,18 +765,18 @@ xchk_xref_inode_check( xfs_agblock_t agbno, xfs_extlen_t len, struct xfs_btree_cur **icur, - bool should_have_inodes) + enum xbtree_recpacking expected) { - bool has_inodes; + enum xbtree_recpacking outcome; int error; if (!(*icur) || xchk_skip_xref(sc->sm)) return; - error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); + error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, icur)) return; - if (has_inodes != should_have_inodes) + if (outcome != expected) xchk_btree_xref_set_corrupt(sc, *icur, 0); } @@ -644,8 +787,10 @@ xchk_xref_is_not_inode_chunk( xfs_agblock_t agbno, xfs_extlen_t len) { - xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false); - xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_EMPTY); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, + XBTREE_RECPACKING_EMPTY); } /* xref check that the extent is covered by inodes */ @@ -655,5 +800,6 @@ xchk_xref_is_inode_chunk( xfs_agblock_t agbno, xfs_extlen_t len) { - xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_FULL); } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 7a2f38e5202c..3e1e02e340a6 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -11,8 +11,11 @@ #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_ialloc.h" +#include "xfs_icache.h" #include "xfs_da_format.h" #include "xfs_reflink.h" #include "xfs_rmap.h" @@ -20,45 +23,176 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/trace.h" + +/* Prepare the attached inode for scrubbing. */ +static inline int +xchk_prepare_iscrub( + struct xfs_scrub *sc) +{ + int error; + + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + return 0; +} + +/* Install this scrub-by-handle inode and prepare it for scrubbing. 
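[Editor's sketch] The sparse-cluster loop above reduces to a single accounting rule: every set holemask bit removes four inodes from the chunk, and whatever remains must equal ir_count. A compact model of that consistency check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK        64
#define INODES_PER_HOLEMASK_BIT 4
#define HOLEMASK_BITS           16

/* Count the sparse inodes and sanity-check them against ir_count. */
static bool holemask_consistent(uint16_t holemask, unsigned int ir_count)
{
	unsigned int holecount = 0;

	for (int i = 0; i < HOLEMASK_BITS; i++, holemask >>= 1)
		if (holemask & 1)
			holecount += INODES_PER_HOLEMASK_BIT;

	return holecount <= INODES_PER_CHUNK &&
	       holecount + ir_count == INODES_PER_CHUNK;
}

int main(void)
{
	/* 4 holemask bits set -> 16 sparse inodes -> 48 real inodes. */
	printf("%d\n", holemask_consistent(0x000F, 48));	/* ok */
	printf("%d\n", holemask_consistent(0x000F, 64));	/* corrupt */
	return 0;
}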
*/ +static inline int +xchk_install_handle_iscrub( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + int error; + + error = xchk_install_handle_inode(sc, ip); + if (error) + return error; + + return xchk_prepare_iscrub(sc); +} /* - * Grab total control of the inode metadata. It doesn't matter here if - * the file data is still changing; exclusive access to the metadata is - * the goal. + * Grab total control of the inode metadata. In the best case, we grab the + * incore inode and take all locks on it. If the incore inode cannot be + * constructed due to corruption problems, lock the AGI so that we can single + * step the loading process to fix everything that can go wrong. */ int xchk_setup_inode( struct xfs_scrub *sc) { + struct xfs_imap imap; + struct xfs_inode *ip; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + /* We want to scan the opened inode, so lock it and exit. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { + sc->ip = ip_in; + return xchk_prepare_iscrub(sc); + } + + /* Reject internal metadata files and obviously bad inode numbers. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_iscrub(sc, ip); + if (error == -ENOENT) + return error; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_error; + /* - * Try to get the inode. If the verifiers fail, we try again - * in raw mode. + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is + * a record, and it says this inode is free. + * + * EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but + * some other metadata corruption (e.g. inode forks) prevented + * instantiation of the incore inode. Or it could mean the inobt is + * corrupt. + * + * We want to look up this inode in the inobt directly to distinguish + * three different scenarios: (1) the inobt says the inode is free, + * in which case there's nothing to do; (2) the inobt is corrupt so we + * should flag the corruption and exit to userspace to let it fix the + * inobt; and (3) the inobt says the inode is allocated, but loading it + * failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity in + * this AG. Retry the iget in case someone allocated a new inode after + * the first iget failed. */ - error = xchk_get_inode(sc); - switch (error) { - case 0: - break; - case -EFSCORRUPTED: - case -EFSBADCRC: - return xchk_trans_alloc(sc, 0); - default: - return error; + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the incore inode, so install it and proceed. */ + xchk_trans_cancel(sc); + return xchk_install_handle_iscrub(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. 
*/ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; } - /* Got the inode, lock it and we're ready to go. */ - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); - error = xchk_trans_alloc(sc, 0); + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt doesn't think this is an allocated inode then we'll + * return ENOENT to signal that the check can be skipped. + * + * If the lookup signals corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters a runtime error, exit to userspace. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; if (error) - goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + goto out_cancel; -out: - /* scrub teardown will unlock and release the inode for us */ + /* + * The lookup succeeded. Chances are the ondisk inode is corrupt and + * preventing iget from reading it. Retain the scrub transaction and + * the AGI buffer to prevent anyone from allocating or freeing inodes. + * This ensures that we preserve the inconsistency between the inobt + * saying the inode is allocated and the icache being unable to load + * the inode until we can flag the corruption in xchk_inode. The + * scrub function has to note the corruption, since we're not really + * supposed to do that from the setup function. + */ + return 0; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; } /* Inode core */ @@ -553,8 +687,9 @@ xchk_inode_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip); out_free: diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index d8dff3fd8053..58d5dfb7ea21 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -16,6 +16,7 @@ #include "xfs_dir2_priv.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/readdir.h" /* Set us up to scrub parents. */ int @@ -30,122 +31,93 @@ xchk_setup_parent( /* Look for an entry in a parent pointing to this inode. */ struct xchk_parent_ctx { - struct dir_context dc; struct xfs_scrub *sc; - xfs_ino_t ino; xfs_nlink_t nlink; - bool cancelled; }; /* Look for a single entry in a directory pointing to an inode. 
*/ -STATIC bool +STATIC int xchk_parent_actor( - struct dir_context *dc, - const char *name, - int namelen, - loff_t pos, - u64 ino, - unsigned type) + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) { - struct xchk_parent_ctx *spc; + struct xchk_parent_ctx *spc = priv; int error = 0; - spc = container_of(dc, struct xchk_parent_ctx, dc); - if (spc->ino == ino) + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name->name, name->len)) + error = -EFSCORRUPTED; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + if (sc->ip->i_ino == ino) spc->nlink++; - /* - * If we're facing a fatal signal, bail out. Store the cancellation - * status separately because the VFS readdir code squashes error codes - * into short directory reads. - */ if (xchk_should_terminate(spc->sc, &error)) - spc->cancelled = true; + return error; - return !error; + return 0; } -/* Count the number of dentries in the parent dir that point to this inode. */ -STATIC int -xchk_parent_count_parent_dentries( - struct xfs_scrub *sc, - struct xfs_inode *parent, - xfs_nlink_t *nlink) +/* + * Try to lock a parent directory for checking dirents. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_ilock_dir( + struct xfs_inode *dp) { - struct xchk_parent_ctx spc = { - .dc.actor = xchk_parent_actor, - .ino = sc->ip->i_ino, - .sc = sc, - }; - size_t bufsize; - loff_t oldpos; - uint lock_mode; - int error = 0; + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) + return 0; - /* - * If there are any blocks, read-ahead block 0 as we're almost - * certain to have the next operation be a read there. This is - * how we guarantee that the parent's extent map has been loaded, - * if there is one. - */ - lock_mode = xfs_ilock_data_map_shared(parent); - if (parent->i_df.if_nextents > 0) - error = xfs_dir3_data_readahead(parent, 0, 0); - xfs_iunlock(parent, lock_mode); - if (error) - return error; + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_ILOCK_SHARED; - /* - * Iterate the parent dir to confirm that there is - * exactly one entry pointing back to the inode being - * scanned. - */ - bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - parent->i_disk_size); - oldpos = 0; - while (true) { - error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); - if (error) - goto out; - if (spc.cancelled) { - error = -EAGAIN; - goto out; - } - if (oldpos == spc.dc.pos) - break; - oldpos = spc.dc.pos; - } - *nlink = spc.nlink; -out: - return error; + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) + return 0; + + return XFS_ILOCK_EXCL; } /* - * Given the inode number of the alleged parent of the inode being - * scrubbed, try to validate that the parent has exactly one directory - * entry pointing back to the inode being scrubbed. + * Given the inode number of the alleged parent of the inode being scrubbed, + * try to validate that the parent has exactly one directory entry pointing + * back to the inode being scrubbed. Returns -EAGAIN if we need to revalidate + * the dotdot entry. 
*/ STATIC int xchk_parent_validate( struct xfs_scrub *sc, - xfs_ino_t dnum, - bool *try_again) + xfs_ino_t parent_ino) { + struct xchk_parent_ctx spc = { + .sc = sc, + .nlink = 0, + }; struct xfs_mount *mp = sc->mp; struct xfs_inode *dp = NULL; xfs_nlink_t expected_nlink; - xfs_nlink_t nlink; + unsigned int lock_mode; int error = 0; - *try_again = false; - - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_rootip) { + if (sc->ip->i_ino != mp->m_sb.sb_rootino || + sc->ip->i_ino != parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } /* '..' must not point to ourselves. */ - if (sc->ip->i_ino == dnum) { + if (sc->ip->i_ino == parent_ino) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* @@ -155,106 +127,51 @@ xchk_parent_validate( expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; /* - * Grab this parent inode. We release the inode before we - * cancel the scrub transaction. Since we're don't know a - * priori that releasing the inode won't trigger eofblocks - * cleanup (which allocates what would be a nested transaction) - * if the parent pointer erroneously points to a file, we - * can't use DONTCACHE here because DONTCACHE inodes can trigger - * immediate inactive cleanup of the inode. + * Grab the parent directory inode. This must be released before we + * cancel the scrub transaction. * * If _iget returns -EINVAL or -ENOENT then the parent inode number is * garbage and the directory is corrupt. If the _iget returns * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a * cross referencing error. Any other error is an operational error. */ - error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp); + error = xchk_iget(sc, parent_ino, &dp); if (error == -EINVAL || error == -ENOENT) { error = -EFSCORRUPTED; xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); - goto out; + return error; } if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } - /* - * We prefer to keep the inode locked while we lock and search - * its alleged parent for a forward reference. If we can grab - * the iolock, validate the pointers and we're done. We must - * use nowait here to avoid an ABBA deadlock on the parent and - * the child inodes. - */ - if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) { - error = xchk_parent_count_parent_dentries(sc, dp, &nlink); - if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, - &error)) - goto out_unlock; - if (nlink != expected_nlink) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out_unlock; - } - - /* - * The game changes if we get here. We failed to lock the parent, - * so we're going to try to verify both pointers while only holding - * one lock so as to avoid deadlocking with something that's actually - * trying to traverse down the directory tree. - */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = 0; - error = xchk_ilock_inverted(dp, XFS_IOLOCK_SHARED); - if (error) + lock_mode = xchk_parent_ilock_dir(dp); + if (!lock_mode) { + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + error = -EAGAIN; goto out_rele; + } - /* Go looking for our dentry. */ - error = xchk_parent_count_parent_dentries(sc, dp, &nlink); + /* Look for a directory entry in the parent pointing to the child. 
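[Editor's sketch] xchk_parent_ilock_dir() is a deadlock-avoidance idiom: only ever trylock the parent, prefer the shared mode, and report zero on failure so the caller can drop its own locks and retry from the top (the -EAGAIN path above). A single-threaded pthread rendering of the cascade, with need_upgrade standing in for xfs_need_iread_extents(); link with -pthread:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum { LOCKED_NONE = 0, LOCKED_SHARED, LOCKED_EXCL };

/* Try shared first; upgrade to exclusive only if more work is needed. */
static int trylock_dir(pthread_rwlock_t *lock, bool need_upgrade)
{
	if (pthread_rwlock_tryrdlock(lock) != 0)
		return LOCKED_NONE;
	if (!need_upgrade)
		return LOCKED_SHARED;
	pthread_rwlock_unlock(lock);
	if (pthread_rwlock_trywrlock(lock) != 0)
		return LOCKED_NONE;	/* caller backs off and retries */
	return LOCKED_EXCL;
}

int main(void)
{
	pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
	int mode = trylock_dir(&lock, true);

	printf("mode=%d\n", mode);
	if (mode != LOCKED_NONE)
		pthread_rwlock_unlock(&lock);
	return 0;
}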
*/ + error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out_unlock; - /* Drop the parent lock, relock this inode. */ - xfs_iunlock(dp, XFS_IOLOCK_SHARED); - error = xchk_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL); - if (error) - goto out_rele; - sc->ilock_flags = XFS_IOLOCK_EXCL; - - /* - * If we're an unlinked directory, the parent /won't/ have a link - * to us. Otherwise, it should have one link. We have to re-set - * it here because we dropped the lock on sc->ip. - */ - expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; - - /* Look up '..' to see if the inode changed. */ - error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out_rele; - - /* Drat, parent changed. Try again! */ - if (dnum != dp->i_ino) { - xfs_irele(dp); - *try_again = true; - return 0; - } - xfs_irele(dp); - /* - * '..' didn't change, so check that there was only one entry - * for us in the parent. + * Ensure that the parent has as many links to the child as the child + * thinks it has to the parent. */ - if (nlink != expected_nlink) + if (spc.nlink != expected_nlink) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - return error; out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); + xfs_iunlock(dp, lock_mode); out_rele: - xfs_irele(dp); -out: + xchk_irele(sc, dp); return error; } @@ -264,9 +181,7 @@ xchk_parent( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - xfs_ino_t dnum; - bool try_again; - int tries = 0; + xfs_ino_t parent_ino; int error = 0; /* @@ -279,56 +194,29 @@ xchk_parent( /* We're not a special inode, are we? */ if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } - /* - * The VFS grabs a read or write lock via i_rwsem before it reads - * or writes to a directory. If we've gotten this far we've - * already obtained IOLOCK_EXCL, which (since 4.10) is the same as - * getting a write lock on i_rwsem. Therefore, it is safe for us - * to drop the ILOCK here in order to do directory lookups. - */ - sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); - - /* Look up '..' */ - error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; - if (!xfs_verify_dir_ino(mp, dnum)) { - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; - } + do { + if (xchk_should_terminate(sc, &error)) + break; - /* Is this the root dir? Then '..' must point to itself. */ - if (sc->ip == mp->m_rootip) { - if (sc->ip->i_ino != mp->m_sb.sb_rootino || - sc->ip->i_ino != dnum) + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(mp, parent_ino)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; - } + return 0; + } - do { - error = xchk_parent_validate(sc, dnum, &try_again); - if (error) - goto out; - } while (try_again && ++tries < 20); + /* + * Check that the dotdot entry points to a parent directory + * containing a dirent pointing to this subdirectory. + */ + error = xchk_parent_validate(sc, parent_ino); + } while (error == -EAGAIN); - /* - * We gave it our best shot but failed, so mark this scrub - * incomplete. Userspace can decide if it wants to try again. 
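[Editor's sketch] Stripped of the locking, the validation is a counting argument: the number of dirents in the alleged parent that reference the child must equal the expectation, which is zero for an unlinked child and exactly one otherwise. In miniature:

#include <stdint.h>
#include <stdio.h>

struct dirent_rec { const char *name; uint64_t ino; };

/* Count parent dirents pointing at the child, as the actor above does. */
static unsigned int count_links(const struct dirent_rec *ents, int n,
				uint64_t child_ino)
{
	unsigned int nlink = 0;

	for (int i = 0; i < n; i++)
		if (ents[i].ino == child_ino)
			nlink++;
	return nlink;
}

int main(void)
{
	struct dirent_rec ents[] = {
		{ ".", 128 }, { "..", 96 }, { "a.txt", 131 }, { "b.txt", 131 },
	};
	unsigned int vfs_nlink = 1;	/* child is not unlinked */
	unsigned int expected = vfs_nlink == 0 ? 0 : 1;

	/* Two entries reference inode 131: the directory is corrupt. */
	printf("corrupt=%d\n", count_links(ents, 4, 131) != expected);
	return 0;
}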
- */ - if (try_again && tries == 20) - xchk_set_incomplete(sc); -out: - /* - * If we failed to lock the parent inode even after a retry, just mark - * this scrub incomplete and return. - */ - if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { - error = 0; - xchk_set_incomplete(sc); - } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 9eeac8565394..e6caa358cbda 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -53,6 +53,9 @@ xchk_setup_quota( if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + error = xchk_setup_fs(sc); if (error) return error; diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c new file mode 100644 index 000000000000..e51c1544be63 --- /dev/null +++ b/fs/xfs/scrub/readdir.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "scrub/scrub.h" +#include "scrub/readdir.h" + +/* Call a function for every entry in a shortform directory. */ +STATIC int +xchk_dir_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_name name = { + .name = ".", + .len = 1, + .type = XFS_DIR3_FT_DIR, + }; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_hdr *sfp; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + unsigned int i; + int error; + + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + + /* dot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset); + + error = dirent_fn(sc, dp, dapos, &name, dp->i_ino, priv); + if (error) + return error; + + /* dotdot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset + + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); + ino = xfs_dir2_sf_get_parent_ino(sfp); + name.name = ".."; + name.len = 2; + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + /* iterate everything else */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); + name.name = sfep->name; + name.len = sfep->namelen; + name.type = xfs_dir2_sf_get_ftype(mp, sfep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); + } + + return 0; +} + +/* Call a function for every entry in a block directory. 
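[Editor's sketch] Shortform directories store neither '.' nor '..' as literal entries, which is why the walker above synthesizes both (the parent inumber lives in the shortform header) before iterating the packed entries. A simplified model using ordinary structs in place of the variable-length on-disk format:

#include <stdint.h>
#include <stdio.h>

struct sf_entry { const char *name; uint64_t ino; };
struct sf_dir {
	uint64_t ino, parent_ino;
	int count;
	struct sf_entry *ents;
};

typedef void (*dirent_fn)(const char *name, uint64_t ino);

/* Emit '.', '..', then every stored entry, as xchk_dir_walk_sf does. */
static void walk_sf(const struct sf_dir *dp, dirent_fn fn)
{
	fn(".", dp->ino);		/* synthesized dot entry */
	fn("..", dp->parent_ino);	/* parent comes from the sf header */
	for (int i = 0; i < dp->count; i++)
		fn(dp->ents[i].name, dp->ents[i].ino);
}

static void print_ent(const char *name, uint64_t ino)
{
	printf("%8llu %s\n", (unsigned long long)ino, name);
}

int main(void)
{
	struct sf_entry ents[] = { { "notes", 131 }, { "todo", 132 } };
	struct sf_dir dir = { .ino = 128, .parent_ino = 96,
			      .count = 2, .ents = ents };

	walk_sf(&dir, print_ent);
	return 0;
}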
*/ +STATIC int +xchk_dir_walk_block( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp; + unsigned int off, next_off, end; + int error; + + error = xfs_dir3_block_read(sc->tp, dp, &bp); + if (error) + return error; + + /* Walk each directory entry. */ + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + for (off = geo->data_entry_offset; off < end; off = next_off) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup = bp->b_addr + off; + struct xfs_dir2_data_entry *dep = bp->b_addr + off; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + + /* Skip an empty entry. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + next_off = off + be16_to_cpu(dup->length); + continue; + } + + /* Otherwise, find the next entry and report it. */ + next_off = off + xfs_dir2_data_entsize(mp, dep->namelen); + if (next_off > end) + break; + + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, off); + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + } + + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Read a leaf-format directory buffer. */ +STATIC int +xchk_read_leaf_dir_buf( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_da_geometry *geo, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec map; + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); + xfs_dablk_t last_da; + xfs_dablk_t map_off; + xfs_dir2_off_t new_off; + + *bpp = NULL; + + /* + * Look for mapped directory blocks at or above the current offset. + * Truncate down to the nearest directory block to start the scanning + * operation. + */ + last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); + map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *curoff)); + + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) + return 0; + if (map.br_startoff >= last_da) + return 0; + xfs_trim_extent(&map, map_off, last_da - map_off); + + /* Read the directory block of that first mapping. */ + new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); + if (new_off > *curoff) + *curoff = new_off; + + return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); +} + +/* Call a function for every entry in a leaf directory. */ +STATIC int +xchk_dir_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp = NULL; + xfs_dir2_off_t curoff = 0; + unsigned int offset = 0; + int error; + + /* Iterate every directory offset in this directory. */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_entry *dep; + xfs_ino_t ino; + unsigned int length; + xfs_dir2_dataptr_t dapos; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || offset >= geo->blksize) { + if (bp) { + xfs_trans_brelse(sc->tp, bp); + bp = NULL; + } + + error = xchk_read_leaf_dir_buf(sc->tp, dp, geo, &curoff, + &bp); + if (error || !bp) + break; + + /* + * Find our position in the block. 
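[Editor's sketch] Walking a block-format directory hinges on one disambiguation: at each offset the bytes are either an unused region, tagged 0xffff and followed by a 16-bit length to skip, or a live entry. A byte-level model of that skip; the fixed entry size is a simplification for the demo, not the on-disk layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FREE_TAG 0xffff

static uint16_t get_be16(const uint8_t *p)
{
	return (uint16_t)(p[0] << 8 | p[1]);
}

/* Report live records, hopping over freetag'd regions by their length. */
static void walk_block(const uint8_t *buf, unsigned int end)
{
	unsigned int off = 0;

	while (off + 4 <= end) {
		if (get_be16(buf + off) == FREE_TAG) {
			off += get_be16(buf + off + 2);	/* unused region */
			continue;
		}
		printf("entry at %u\n", off);
		off += 16;	/* fixed record size, demo only */
	}
}

int main(void)
{
	uint8_t buf[32];

	memset(buf, 0, sizeof(buf));
	buf[0] = 0xff; buf[1] = 0xff;	/* free region... */
	buf[2] = 0x00; buf[3] = 0x10;	/* ...16 bytes long */
	walk_block(buf, sizeof(buf));	/* reports one entry at offset 16 */
	return 0;
}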
+ */ + offset = geo->data_entry_offset; + curoff += geo->data_entry_offset; + } + + /* Skip an empty entry. */ + dup = bp->b_addr + offset; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + offset += length; + curoff += length; + continue; + } + + /* Otherwise, find the next entry and report it. */ + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, dep->namelen); + + dapos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + + /* Advance to the next entry. */ + offset += length; + curoff += length; + } + + if (bp) + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* + * Call a function for every entry in a directory. + * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. + */ +int +xchk_dir_walk( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + }; + bool isblock; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) + return xchk_dir_walk_block(sc, dp, dirent_fn, priv); + + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); +} + +/* + * Look up the inode number for an exact name in a directory. + * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. Names are not + * checked for correctness. 
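[Editor's sketch] The dispatcher closing this walker picks a strategy purely from the data fork format: inline (shortform) directories are decoded in place, a single-block directory gets the block walker, and everything else falls to the leaf walker. Node-format directories need no fourth case because their dirent data blocks also live below XFS_DIR2_LEAF_OFFSET, which is the bound the leaf walker scans. Schematically:

#include <stdio.h>

enum dir_fmt { FMT_LOCAL, FMT_BLOCK, FMT_LEAF_OR_NODE };

/* Mirror of the xchk_dir_walk() dispatch, with the formats made explicit. */
static const char *pick_walker(enum dir_fmt fmt)
{
	switch (fmt) {
	case FMT_LOCAL:	return "walk_sf";	/* inline in the inode */
	case FMT_BLOCK:	return "walk_block";	/* single dir block */
	default:	return "walk_leaf";	/* leaf and node dirs */
	}
}

int main(void)
{
	printf("%s\n", pick_walker(FMT_LOCAL));
	printf("%s\n", pick_walker(FMT_LEAF_OR_NODE));
	return 0;
}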
+ */ +int +xchk_dir_lookup( + struct xfs_scrub *sc, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *ino) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + .name = name->name, + .namelen = name->len, + .filetype = name->type, + .hashval = xfs_dir2_hashname(dp->i_mount, name), + .whichfork = XFS_DATA_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + bool isblock, isleaf; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + error = xfs_dir2_sf_lookup(&args); + goto out_check_rval; + } + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) { + error = xfs_dir2_block_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_isleaf(&args, &isleaf); + if (error) + return error; + + if (isleaf) { + error = xfs_dir2_leaf_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_node_lookup(&args); + +out_check_rval: + if (error == -EEXIST) + error = 0; + if (!error) + *ino = args.inumber; + return error; +} diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h new file mode 100644 index 000000000000..55787f4df123 --- /dev/null +++ b/fs/xfs/scrub/readdir.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_READDIR_H__ +#define __XFS_SCRUB_READDIR_H__ + +typedef int (*xchk_dirent_fn)(struct xfs_scrub *sc, struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, const struct xfs_name *name, + xfs_ino_t ino, void *priv); + +int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, void *priv); + +int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, + const struct xfs_name *name, xfs_ino_t *ino); + +#endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index d9c1b3cea4a5..304ea1e1bfb0 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -1,21 +1,22 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_ag.h" +#include "scrub/trace.h" /* * Set us up to scrub reference count btrees. 
@@ -24,6 +25,8 @@ int xchk_setup_ag_refcountbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, false); } @@ -300,8 +303,10 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (irec->rc_refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) { + trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen); xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + } out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { @@ -325,6 +330,107 @@ xchk_refcountbt_xref( xchk_refcountbt_xref_rmap(sc, irec); } +struct xchk_refcbt_records { + /* Previous refcount record. */ + struct xfs_refcount_irec prev_rec; + + /* The next AG block where we aren't expecting shared extents. */ + xfs_agblock_t next_unshared_agbno; + + /* Number of CoW blocks we expect. */ + xfs_agblock_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; +}; + +STATIC int +xchk_refcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_agblock_t *next_bno = priv; + + if (*next_bno != NULLAGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings. + */ +static inline void +xchk_refcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_refcbt_records *rrc, + xfs_agblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_agblock_t next_bno = NULLAGBLOCK; + int error; + + if (bno <= rrc->next_unshared_agbno || !sc->sa.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_agbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, + xchk_refcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur); +} + +static inline bool +xchk_refcount_mergeable( + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (r1->rc_blockcount == 0) + return false; + + if (r1->rc_domain != r2->rc_domain) + return false; + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + MAXREFCEXTLEN) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_refcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_refcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + /* Scrub a refcountbt record. 
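 * Each record is vetted several ways below: xfs_refcount_check_irec * validates the irec itself; CoW staging blocks are tallied; shared records * must sort before CoW records (the CoW domain is the high bit of the ondisk * startblock, so the domains cannot interleave); records that could have * been merged with their predecessor are flagged; and the gap back to the * previous shared record is cross-checked against the rmapbt.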
*/ STATIC int xchk_refcountbt_rec( @@ -332,27 +438,37 @@ xchk_refcountbt_rec( const union xfs_btree_rec *rec) { struct xfs_refcount_irec irec; - xfs_agblock_t *cow_blocks = bs->private; - struct xfs_perag *pag = bs->cur->bc_ag.pag; + struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - - /* Check the domain and refcount are not incompatible. */ - if (!xfs_refcount_check_domain(&irec)) + if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } if (irec.rc_domain == XFS_REFC_DOMAIN_COW) - (*cow_blocks) += irec.rc_blockcount; - - /* Check the extent. */ - if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->cow_blocks += irec.rc_blockcount; - if (irec.rc_refcount == 0) + /* Shared records always come before CoW records. */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + xchk_refcountbt_check_mergeable(bs, rrc, &irec); xchk_refcountbt_xref(bs->sc, &irec); + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. + */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_refcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_agbno = irec.rc_startblock + + irec.rc_blockcount; + } + return 0; } @@ -394,15 +510,25 @@ int xchk_refcountbt( struct xfs_scrub *sc) { - xfs_agblock_t cow_blocks = 0; + struct xchk_refcbt_records rrc = { + .cow_blocks = 0, + .next_unshared_agbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, + }; int error; error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec, - &XFS_RMAP_OINFO_REFC, &cow_blocks); + &XFS_RMAP_OINFO_REFC, &rrc); if (error) return error; - xchk_refcount_xref_rmap(sc, cow_blocks); + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the AG have at most one reverse mapping. + */ + xchk_refcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_agblocks); + + xchk_refcount_xref_rmap(sc, rrc.cow_blocks); return 0; } @@ -458,16 +584,37 @@ xchk_xref_is_not_shared( xfs_agblock_t agbno, xfs_extlen_t len) { - bool shared; + enum xbtree_recpacking outcome; + int error; + + if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, agbno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. 
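+ * Like xchk_xref_is_not_shared above, this is built on + * xfs_refcount_has_records, which classifies the keyspace + * [agbno, agbno + len) as empty, sparsely mapped, or fully mapped + * (enum xbtree_recpacking); any answer other than empty in the CoW domain + * is flagged as a cross-referencing corruption.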
*/ +void +xchk_xref_is_not_cow_staging( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; int error; if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED, - agbno, len, &shared); + error = xfs_refcount_has_records(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; - if (shared) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b71174ec0d6..ac6d8803e660 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -60,6 +60,9 @@ xrep_attempt( sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; + case -ECHRNG: + sc->flags |= XCHK_NEED_DRAIN; + return -EAGAIN; case -EDEADLOCK: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { @@ -442,6 +445,30 @@ xrep_init_btblock( * buffers associated with @bitmap. */ +static int +xrep_invalidate_block( + uint64_t fsbno, + void *priv) +{ + struct xfs_scrub *sc = priv; + struct xfs_buf *bp; + int error; + + /* Skip AG headers and post-EOFS blocks */ + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return 0; + + error = xfs_buf_incore(sc->mp->m_ddev_targp, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); + if (error) + return 0; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + return 0; +} + /* * Invalidate buffers for per-AG btree blocks we're dumping. This function * is not intended for use with file data repairs; we have bunmapi for that. @@ -451,11 +478,6 @@ xrep_invalidate_blocks( struct xfs_scrub *sc, struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - /* * For each block in each extent, see if there's an incore buffer for * exactly that block; if so, invalidate it. The buffer cache only @@ -464,23 +486,7 @@ xrep_invalidate_blocks( * because we never own those; and if we can't TRYLOCK the buffer we * assume it's owned by someone else. */ - for_each_xbitmap_block(fsbno, bmr, n, bitmap) { - int error; - - /* Skip AG headers and post-EOFS blocks */ - if (!xfs_verify_fsbno(sc->mp, fsbno)) - continue; - error = xfs_buf_incore(sc->mp->m_ddev_targp, - XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); - if (error) - continue; - - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - } - - return 0; + return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc); } /* Ensure the freelist is the correct size. */ @@ -501,6 +507,15 @@ xrep_fix_freelist( can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); } +/* Information about reaping extents after a repair. */ +struct xrep_reap_state { + struct xfs_scrub *sc; + + /* Reverse mapping owner and metadata reservation type. */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; +}; + /* * Put a block back on the AGFL. */ @@ -545,17 +560,23 @@ xrep_put_freelist( /* Dispose of a single block. 
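 * Disposal is a three-way decision, driven by the rmapbt query below: if * other owners still have mappings for the block, only this owner's rmap is * removed; if the space was an AGFL reservation, the block goes back on the * freelist; otherwise the extent is freed outright.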
*/ STATIC int xrep_reap_block( - struct xfs_scrub *sc, - xfs_fsblock_t fsbno, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type resv) + uint64_t fsbno, + void *priv) { + struct xrep_reap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; struct xfs_buf *agf_bp = NULL; xfs_agblock_t agbno; bool has_other_rmap; int error; + ASSERT(sc->ip != NULL || + XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); + trace_xrep_dispose_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, fsbno), + XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); @@ -574,7 +595,8 @@ xrep_reap_block( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); /* Can we find any other rmappings? */ - error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); + error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, + &has_other_rmap); xfs_btree_del_cursor(cur, error); if (error) goto out_free; @@ -594,11 +616,12 @@ xrep_reap_block( */ if (has_other_rmap) error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, - 1, oinfo); - else if (resv == XFS_AG_RESV_AGFL) + 1, rs->oinfo); + else if (rs->resv == XFS_AG_RESV_AGFL) error = xrep_put_freelist(sc, agbno); else - error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv); + error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo, + rs->resv); if (agf_bp != sc->sa.agf_bp) xfs_trans_brelse(sc->tp, agf_bp); if (error) @@ -622,26 +645,15 @@ xrep_reap_extents( const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; - xfs_fsblock_t fsbno; - int error = 0; + struct xrep_reap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = type, + }; ASSERT(xfs_has_rmapbt(sc->mp)); - for_each_xbitmap_block(fsbno, bmr, n, bitmap) { - ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - trace_xrep_dispose_btree_extent(sc->mp, - XFS_FSB_TO_AGNO(sc->mp, fsbno), - XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); - - error = xrep_reap_block(sc, fsbno, oinfo, type); - if (error) - break; - } - - return error; + return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs); } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 840f74ec431c..dce791c679ee 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_REPAIR_H__ #define __XFS_SCRUB_REPAIR_H__ @@ -31,6 +31,7 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, const struct xfs_buf_ops *ops); struct xbitmap; +struct xagb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 229826b2e1c0..d29a26ecddd6 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -1,21 +1,29 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" +#include "xfs_ag.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/bitmap.h" /* * Set us up to scrub reverse mapping btrees. @@ -24,11 +32,39 @@ int xchk_setup_ag_rmapbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } /* Reverse-mapping scrubber. */ +struct xchk_rmap { + /* + * The furthest-reaching of the rmapbt records that we've already + * processed. This enables us to detect overlapping records for space + * allocations that cannot be shared. + */ + struct xfs_rmap_irec overlap_rec; + + /* + * The previous rmapbt record, so that we can check for two records + * that could be one. + */ + struct xfs_rmap_irec prev_rec; + + /* Bitmaps containing all blocks for each type of AG metadata. */ + struct xagb_bitmap fs_owned; + struct xagb_bitmap log_owned; + struct xagb_bitmap ag_owned; + struct xagb_bitmap inobt_owned; + struct xagb_bitmap refcbt_owned; + + /* Did we complete the AG space metadata bitmaps? */ + bool bitmaps_complete; +}; + /* Cross-reference a rmap against the refcount btree. */ STATIC void xchk_rmapbt_xref_refc( @@ -84,80 +120,415 @@ xchk_rmapbt_xref( xchk_rmapbt_xref_refc(sc, irec); } -/* Scrub an rmapbt record. */ -STATIC int -xchk_rmapbt_rec( - struct xchk_btree *bs, - const union xfs_btree_rec *rec) +/* + * Check for bogus UNWRITTEN flags in the rmapbt node block keys. + * + * In reverse mapping records, the file mapping extent state + * (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field. It is not + * involved in lookups in any way. In older kernels, the functions that + * convert rmapbt records to keys forgot to filter out the extent state bit, + * even though the key comparison functions have filtered the flag correctly. + * If we spot an rmap key with the unwritten bit set in rm_offset, we should + * mark the btree as needing optimization to rebuild the btree without those + * flags. + */ +STATIC void +xchk_rmapbt_check_unwritten_in_keyflags( + struct xchk_btree *bs) { - struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_rmap_irec irec; - struct xfs_perag *pag = bs->cur->bc_ag.pag; - bool non_inode; - bool is_unwritten; - bool is_bmbt; - bool is_attr; - int error; + struct xfs_scrub *sc = bs->sc; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *keyblock; + union xfs_btree_key *lkey, *hkey; + __be64 badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); + unsigned int level; - error = xfs_rmap_btrec_to_irec(rec, &irec); - if (!xchk_btree_process_error(bs->sc, bs->cur, 0, &error)) - goto out; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + return; + + for (level = 1; level < cur->bc_nlevels; level++) { + struct xfs_buf *bp; + unsigned int ptr; + + /* Only check the first time we've seen this node block. 
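+ * (bc_levels[level].ptr is the cursor's 1-based position within the block + * at that level, so a value greater than 1 means this block has already + * been visited, and keychecked, on an earlier iteration.)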
*/ + if (cur->bc_levels[level].ptr > 1) + continue; + + keyblock = xfs_btree_get_block(cur, level, &bp); + for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) { + lkey = xfs_btree_key_addr(cur, ptr, keyblock); + + if (lkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + + hkey = xfs_btree_high_key_addr(cur, ptr, keyblock); + if (hkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + } + } +} + +static inline bool +xchk_rmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_reflink(sc->mp)) + return false; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return false; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK | + XFS_RMAP_UNWRITTEN)) + return false; + return true; +} + +/* Flag failures for records that overlap but cannot. */ +STATIC void +xchk_rmapbt_check_overlapping( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + xfs_agblock_t pnext, inext; + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + /* No previous record? */ + if (cr->overlap_rec.rm_blockcount == 0) + goto set_prev; + + /* Do overlap_rec and irec overlap? */ + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; + if (pnext <= irec->rm_startblock) + goto set_prev; - /* Check extent. */ - if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock) + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rmapbt_is_shareable(bs->sc, irec)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (irec.rm_owner == XFS_RMAP_OWN_FS) { + /* Save whichever rmap record extends furthest. */ + inext = irec->rm_startblock + irec->rm_blockcount; + if (pnext > inext) + return; + +set_prev: + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Decide if two reverse-mapping records can be merged. */ +static inline bool +xchk_rmap_mergeable( + struct xchk_rmap *cr, + const struct xfs_rmap_irec *r2) +{ + const struct xfs_rmap_irec *r1 = &cr->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (cr->prev_rec.rm_blockcount == 0) + return false; + + if (r1->rm_owner != r2->rm_owner) + return false; + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) + return false; + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > + XFS_RMAP_LEN_MAX) + return false; + if (XFS_RMAP_NON_INODE_OWNER(r2->rm_owner)) + return true; + /* must be an inode owner below here */ + if (r1->rm_flags != r2->rm_flags) + return false; + if (r1->rm_flags & XFS_RMAP_BMBT_BLOCK) + return true; + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rmapbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rmap_mergeable(cr, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Compare an rmap for AG metadata against the metadata walk. */ +STATIC int +xchk_rmapbt_mark_bitmap( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + struct xfs_scrub *sc = bs->sc; + struct xagb_bitmap *bmp = NULL; + xfs_extlen_t fsbcount = irec->rm_blockcount; + + /* + * Skip corrupt records. 
It is essential that we detect records in the + * btree that cannot overlap but do, flag those as CORRUPT, and skip + * the bitmap comparison to avoid generating false XCORRUPT reports. + */ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * If the AG metadata walk didn't complete, there's no point in + * comparing against partial results. + */ + if (!cr->bitmaps_complete) + return 0; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + bmp = &cr->fs_owned; + break; + case XFS_RMAP_OWN_LOG: + bmp = &cr->log_owned; + break; + case XFS_RMAP_OWN_AG: + bmp = &cr->ag_owned; + break; + case XFS_RMAP_OWN_INOBT: + bmp = &cr->inobt_owned; + break; + case XFS_RMAP_OWN_REFC: + bmp = &cr->refcbt_owned; + break; + } + + if (!bmp) + return 0; + + if (xagb_bitmap_test(bmp, irec->rm_startblock, &fsbcount)) { /* - * xfs_verify_agbno returns false for static fs metadata. - * Since that only exists at the start of the AG, validate - * that by hand. + * The start of this reverse mapping corresponds to a set + * region in the bitmap. If the mapping covers more area than + * the set region, then it covers space that wasn't found by + * the AG metadata walk. */ - if (irec.rm_startblock != 0 || - irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + if (fsbcount < irec->rm_blockcount) + xchk_btree_xref_set_corrupt(bs->sc, + bs->sc->sa.rmap_cur, 0); } else { /* - * Otherwise we must point somewhere past the static metadata - * but before the end of the FS. Run the regular check. + * The start of this reverse mapping does not correspond to a + * completely set region in the bitmap. The region wasn't + * fully set by walking the AG metadata, so this is a + * cross-referencing corruption. */ - if (!xfs_verify_agbno(pag, irec.rm_startblock) || - !xfs_verify_agbno(pag, irec.rm_startblock + - irec.rm_blockcount - 1)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_btree_xref_set_corrupt(bs->sc, bs->sc->sa.rmap_cur, 0); } - /* Check flags. */ - non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner); - is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK; - is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK; - is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN; + /* Unset the region so that we can detect missing rmap records. */ + return xagb_bitmap_clear(bmp, irec->rm_startblock, irec->rm_blockcount); +} - if (is_bmbt && irec.rm_offset != 0) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); +/* Scrub an rmapbt record. */ +STATIC int +xchk_rmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xchk_rmap *cr = bs->private; + struct xfs_rmap_irec irec; - if (non_inode && irec.rm_offset != 0) + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || + xfs_rmap_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } - if (is_unwritten && (is_bmbt || non_inode || is_attr)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_rmapbt_check_unwritten_in_keyflags(bs); + xchk_rmapbt_check_mergeable(bs, cr, &irec); + xchk_rmapbt_check_overlapping(bs, cr, &irec); + xchk_rmapbt_xref(bs->sc, &irec); - if (non_inode && (is_bmbt || is_unwritten || is_attr)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return xchk_rmapbt_mark_bitmap(bs, cr, &irec); +} - if (!non_inode) { - if (!xfs_verify_ino(mp, irec.rm_owner)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - } else { - /* Non-inode owner within the magic values? 
*/ - if (irec.rm_owner <= XFS_RMAP_OWN_MIN || - irec.rm_owner > XFS_RMAP_OWN_FS) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); +/* Add an AGFL block to the rmap list. */ +STATIC int +xchk_rmapbt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* + * Set up bitmaps mapping all the AG metadata to compare with the rmapbt + * records. + * + * Grab our own btree cursors here if the scrub setup function didn't give us a + * btree cursor due to reports of poor health. We need to find out if the + * rmapbt disagrees with primary metadata btrees to tag the rmapbt as being + * XCORRUPT. + */ +STATIC int +xchk_rmapbt_walk_ag_metadata( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_btree_cur *cur; + int error; + + /* OWN_FS: AG headers */ + error = xagb_bitmap_set(&cr->fs_owned, XFS_SB_BLOCK(mp), + XFS_AGFL_BLOCK(mp) - XFS_SB_BLOCK(mp) + 1); + if (error) + goto out; + + /* OWN_LOG: Internal log */ + if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) { + error = xagb_bitmap_set(&cr->log_owned, + XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), + mp->m_sb.sb_logblocks); + if (error) + goto out; + } + + /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ + cur = sc->sa.bno_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.bno_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + cur = sc->sa.cnt_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.cnt_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + error = xagb_bitmap_set_btblocks(&cr->ag_owned, sc->sa.rmap_cur); + if (error) + goto out; + + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto out; + + error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xchk_rmapbt_walk_agfl, + &cr->ag_owned); + xfs_trans_brelse(sc->tp, agfl_bp); + if (error) + goto out; + + /* OWN_INOBT: inobt, finobt */ + cur = sc->sa.ino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, + XFS_BTNUM_INO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.ino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + cur = sc->sa.fino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp, XFS_BTNUM_FINO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.fino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + } + + /* OWN_REFC: refcountbt */ + if (xfs_has_reflink(sc->mp)) { + cur = sc->sa.refc_cur; + if (!cur) + cur = xfs_refcountbt_init_cursor(sc->mp, sc->tp, + sc->sa.agf_bp, sc->sa.pag); + error = xagb_bitmap_set_btblocks(&cr->refcbt_owned, cur); + if (cur != sc->sa.refc_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; } - xchk_rmapbt_xref(bs->sc, &irec); out: - return error; + /* + * If there's an error, set XFAIL and disable the bitmap + * cross-referencing checks, but proceed with the scrub anyway. 
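+ * XFAIL here means only that the cross-reference could not be carried out, + * not that the rmapbt itself is bad. When the walk does complete, the + * comparison is a test-and-clear sweep: each rmapbt record with an AG + * metadata owner must find its bits set (else it maps space that no + * metadata walk produced) and then clears them, so any bit still set at + * the end is metadata with no reverse mapping.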
+ */ + if (error) + xchk_btree_xref_process_error(sc, sc->sa.rmap_cur, + sc->sa.rmap_cur->bc_nlevels - 1, &error); + else + cr->bitmaps_complete = true; + return 0; +} + +/* + * Check for set regions in the bitmaps; if there are any, the rmap records do + * not describe all the AG metadata. + */ +STATIC void +xchk_rmapbt_check_bitmaps( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_btree_cur *cur = sc->sa.rmap_cur; + unsigned int level; + + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XFAIL)) + return; + if (!cur) + return; + level = cur->bc_nlevels - 1; + + /* + * Any bitmap with bits still set indicates that the reverse mapping + * doesn't cover the entire primary structure. + */ + if (xagb_bitmap_hweight(&cr->fs_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->log_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->ag_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->inobt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->refcbt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); } /* Scrub the rmap btree for some AG. */ @@ -165,42 +536,63 @@ int xchk_rmapbt( struct xfs_scrub *sc) { - return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, - &XFS_RMAP_OINFO_AG, NULL); + struct xchk_rmap *cr; + int error; + + cr = kzalloc(sizeof(struct xchk_rmap), XCHK_GFP_FLAGS); + if (!cr) + return -ENOMEM; + + xagb_bitmap_init(&cr->fs_owned); + xagb_bitmap_init(&cr->log_owned); + xagb_bitmap_init(&cr->ag_owned); + xagb_bitmap_init(&cr->inobt_owned); + xagb_bitmap_init(&cr->refcbt_owned); + + error = xchk_rmapbt_walk_ag_metadata(sc, cr); + if (error) + goto out; + + error = xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, + &XFS_RMAP_OINFO_AG, cr); + if (error) + goto out; + + xchk_rmapbt_check_bitmaps(sc, cr); + +out: + xagb_bitmap_destroy(&cr->refcbt_owned); + xagb_bitmap_destroy(&cr->inobt_owned); + xagb_bitmap_destroy(&cr->ag_owned); + xagb_bitmap_destroy(&cr->log_owned); + xagb_bitmap_destroy(&cr->fs_owned); + kfree(cr); + return error; } -/* xref check that the extent is owned by a given owner */ -static inline void -xchk_xref_check_owner( +/* xref check that the extent is owned only by a given owner */ +void +xchk_xref_is_only_owned_by( struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len, - const struct xfs_owner_info *oinfo, - bool should_have_rmap) + const struct xfs_owner_info *oinfo) { - bool has_rmap; + struct xfs_rmap_matches res; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, - &has_rmap); + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; - if (has_rmap != should_have_rmap) + if (res.matches != 1) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.non_owner_matches) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); -} - -/* xref check that the extent is owned by a given owner */ -void -xchk_xref_is_owned_by( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - const struct xfs_owner_info *oinfo) -{ - xchk_xref_check_owner(sc, bno, len, oinfo, true); } /* xref check that the extent is not owned by a given owner */ @@ -211,7 +603,19 @@ xchk_xref_is_not_owned_by( xfs_extlen_t len, 
const struct xfs_owner_info *oinfo) { - xchk_xref_check_owner(sc, bno, len, oinfo, false); + struct xfs_rmap_matches res; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (res.matches != 0) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } /* xref check that the extent has no reverse mapping at all */ @@ -221,15 +625,15 @@ xchk_xref_has_no_owner( xfs_agblock_t bno, xfs_extlen_t len) { - bool has_rmap; + enum xbtree_recpacking outcome; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); + error = xfs_rmap_has_records(sc->sa.rmap_cur, bno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; - if (has_rmap) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 0a3bde64c675..e7dace7b4be8 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 07a7a75f987f..02819bedc5b1 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -145,6 +145,21 @@ xchk_probe( /* Scrub setup and teardown */ +static inline void +xchk_fsgates_disable( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XCHK_FSGATES_ALL)) + return; + + trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); + + if (sc->flags & XCHK_FSGATES_DRAIN) + xfs_drain_wait_disable(); + + sc->flags &= ~XCHK_FSGATES_ALL; +} + /* Free all the resources and finish the transactions. 
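 * Note that teardown drops the fsgates last, after the buffer and * transaction are gone, since disabling a static key patches kernel text * and may sleep.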
*/ STATIC int xchk_teardown( @@ -166,7 +181,7 @@ xchk_teardown( xfs_iunlock(sc->ip, sc->ilock_flags); if (sc->ip != ip_in && !xfs_internal_inum(sc->mp, sc->ip->i_ino)) - xfs_irele(sc->ip); + xchk_irele(sc, sc->ip); sc->ip = NULL; } if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) @@ -174,9 +189,14 @@ xchk_teardown( if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->buf) { + if (sc->buf_cleanup) + sc->buf_cleanup(sc->buf); kvfree(sc->buf); + sc->buf_cleanup = NULL; sc->buf = NULL; } + + xchk_fsgates_disable(sc); return error; } @@ -191,25 +211,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_SB] = { /* superblock */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_superblock, .repair = xrep_superblock, }, [XFS_SCRUB_TYPE_AGF] = { /* agf */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agf, .repair = xrep_agf, }, [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agfl, .repair = xrep_agfl, }, [XFS_SCRUB_TYPE_AGI] = { /* agi */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agi, .repair = xrep_agi, }, @@ -491,23 +511,20 @@ retry_op: /* Set up for the operation. */ error = sc->ops->setup(sc); + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; if (error) goto out_teardown; /* Scrub for errors. */ error = sc->ops->scrub(sc); - if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { - /* - * Scrubbers return -EDEADLOCK to mean 'try harder'. - * Tear down everything we hold, then set up again with - * preparation for worst-case scenarios. - */ - error = xchk_teardown(sc, 0); - if (error) - goto out_sc; - sc->flags |= XCHK_TRY_HARDER; - goto retry_op; - } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; + if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) goto out_teardown; xchk_update_health(sc); @@ -565,4 +582,21 @@ out: error = 0; } return error; +need_drain: + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_NEED_DRAIN; + goto retry_op; +try_harder: + /* + * Scrubbers return -EDEADLOCK to mean 'try harder'. Tear down + * everything we hold, then set up again with preparation for + * worst-case scenarios. + */ + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_TRY_HARDER; + goto retry_op; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index b4d391b4c938..e71903474cd7 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_SCRUB_H__ #define __XFS_SCRUB_SCRUB_H__ @@ -77,7 +77,17 @@ struct xfs_scrub { */ struct xfs_inode *ip; + /* Kernel memory buffer used by scrubbers; freed at teardown. */ void *buf; + + /* + * Clean up resources owned by whatever is in the buffer. 
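(For example, a checker that stashes bitmaps or other allocations in sc->buf can point this hook at a routine that tears them down.)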
Cleanup can + * be deferred with this hook as a means for scrub functions to pass + * data to repair functions. This function must not free the buffer + * itself. + */ + void (*buf_cleanup)(void *buf); + uint ilock_flags; /* See the XCHK/XREP state flags below. */ @@ -96,9 +106,19 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ +#define XCHK_REAPING_DISABLED (1 << 1) /* background block reaping paused */ +#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ +/* + * The XCHK_FSGATES* flags reflect functionality in the main filesystem that + * is only enabled for this particular online fsck. When not in use, the + * features are gated off via dynamic code patching, which is why the state + * must be enabled during scrub setup and can only be torn down afterwards. + */ +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -152,7 +172,7 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); -void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, +void xchk_xref_is_only_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); @@ -162,6 +182,8 @@ void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len); void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len); +void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, + xfs_extlen_t len); #ifdef CONFIG_XFS_RT void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, xfs_extlen_t len); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index c1c99ffe7408..38708fb9a5d7 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index b5f94676c37c..0a975439d2b6 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 93ece6df02e3..68efd6fda61c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. 
All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. See xfs_trace.h for documentation of @@ -30,6 +30,9 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); @@ -93,6 +96,13 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } +#define XFS_SCRUB_STATE_STRINGS \ + { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_REAPING_DISABLED, "reaping_disabled" }, \ + { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ + { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_ALREADY_FIXED, "already_fixed" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -139,6 +149,33 @@ DEFINE_SCRUB_EVENT(xchk_deadlock_retry); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); +DECLARE_EVENT_CLASS(xchk_fsgate_class, + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgate_flags), + TP_ARGS(sc, fsgate_flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, fsgate_flags) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->fsgate_flags = fsgate_flags; + ), + TP_printk("dev %d:%d type %s fsgates '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->fsgate_flags, "|", XFS_SCRUB_STATE_STRINGS)) +) + +#define DEFINE_SCRUB_FSHOOK_EVENT(name) \ +DEFINE_EVENT(xchk_fsgate_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgates_flags), \ + TP_ARGS(sc, fsgates_flags)) + +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -657,6 +694,38 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +TRACE_EVENT(xchk_refcount_incorrect, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + xfs_nlink_t seen), + TP_ARGS(pag, irec, seen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_nlink_t, seen) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = irec->rc_domain; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->seen = seen; + ), + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u seen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->seen) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index 
2ceae614ade8..a39befa743ce 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_H__ #define __XFS_SCRUB_H__ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 6e2f0013380a..7551c3ec4ea5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,6 +24,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_bui_cache; struct kmem_cache *xfs_bud_cache; @@ -363,6 +364,34 @@ xfs_bmap_update_create_done( return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; } +/* Take a passive ref to the AG containing the space we're mapping. */ +void +xfs_bmap_update_get_group( + struct xfs_mount *mp, + struct xfs_bmap_intent *bi) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); + + /* + * Bump the intent count on behalf of the deferred rmap and refcount + * intent items so that we can queue when we finish this bmap work. + * This new intent item will bump the intent count before the bmap + * intent drops the intent count, ensuring that the intent count + * remains nonzero across the transaction roll. + */ + bi->bi_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing mapping work. */ +static inline void +xfs_bmap_update_put_group( + struct xfs_bmap_intent *bi) +{ + xfs_perag_intent_put(bi->bi_pag); +} + /* Process a deferred rmap update. */ STATIC int xfs_bmap_update_finish_item( @@ -381,6 +410,8 @@ xfs_bmap_update_finish_item( ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; } + + xfs_bmap_update_put_group(bi); kmem_cache_free(xfs_bmap_intent_cache, bi); return error; } @@ -393,7 +424,7 @@ xfs_bmap_update_abort_intent( xfs_bui_release(BUI_ITEM(intent)); } -/* Cancel a deferred rmap update. */ +/* Cancel a deferred bmap update. */ STATIC void xfs_bmap_update_cancel_item( struct list_head *item) @@ -401,6 +432,8 @@ xfs_bmap_update_cancel_item( struct xfs_bmap_intent *bi; bi = container_of(item, struct xfs_bmap_intent, bi_list); + + xfs_bmap_update_put_group(bi); kmem_cache_free(xfs_bmap_intent_cache, bi); } @@ -509,10 +542,12 @@ xfs_bui_item_recover( fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_bmap_update_get_group(mp, &fake); error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, sizeof(*map)); + xfs_bmap_update_put_group(&fake); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a09dd2606479..f032d3a4b727 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -314,15 +314,13 @@ xfs_getbmap_report_one( if (isnullstartblock(got->br_startblock) || got->br_startblock == DELAYSTARTBLOCK) { /* - * Delalloc extents that start beyond EOF can occur due to - * speculative EOF allocation when the delalloc extent is larger - * than the largest freespace extent at conversion time. These - * extents cannot be converted by data writeback, so can exist - * here even if we are not supposed to be finding delalloc - * extents. 
+ * Take the flush completion as being a point-in-time snapshot + * where there are no delalloc extents, and if any new ones + * have been created racily, just skip them as being 'after' + * the flush and so don't get reported. */ - if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip))) - ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0); + if (!(bmv->bmv_iflags & BMV_IF_DELALLOC)) + return 0; p->bmv_oflags |= BMV_OF_DELALLOC; p->bmv_block = -2; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index ffa94102094d..43167f543afc 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -943,6 +943,16 @@ xlog_recover_buf_commit_pass2( if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_buf_skip(log, buf_f); xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + + /* + * We're skipping replay of this buffer log item due to the log + * item LSN being behind the ondisk buffer. Verify the buffer + * contents since we aren't going to run the write verifier. + */ + if (bp->b_ops) { + bp->b_ops->verify_read(bp); + error = bp->b_error; + } goto out_release; } diff --git a/fs/xfs/xfs_dahash_test.c b/fs/xfs/xfs_dahash_test.c index 230651ab5ce4..0dab5941e080 100644 --- a/fs/xfs/xfs_dahash_test.c +++ b/fs/xfs/xfs_dahash_test.c @@ -9,6 +9,9 @@ #include "xfs_format.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_dir2_priv.h" #include "xfs_dahash_test.h" /* 4096 random bytes */ @@ -533,108 +536,109 @@ static struct dahash_test { uint16_t start; /* random 12 bit offset in buf */ uint16_t length; /* random 8 bit length of test */ xfs_dahash_t dahash; /* expected dahash result */ + xfs_dahash_t ascii_ci_dahash; /* expected ascii-ci dahash result */ } test[] __initdata = { - {0x0567, 0x0097, 0x96951389}, - {0x0869, 0x0055, 0x6455ab4f}, - {0x0c51, 0x00be, 0x8663afde}, - {0x044a, 0x00fc, 0x98fbe432}, - {0x0f29, 0x0079, 0x42371997}, - {0x08ba, 0x0052, 0x942be4f7}, - {0x01f2, 0x0013, 0x5262687e}, - {0x09e3, 0x00e2, 0x8ffb0908}, - {0x007c, 0x0051, 0xb3158491}, - {0x0854, 0x001f, 0x83bb20d9}, - {0x031b, 0x0008, 0x98970bdf}, - {0x0de7, 0x0027, 0xbfbf6f6c}, - {0x0f76, 0x0005, 0x906a7105}, - {0x092e, 0x00d0, 0x86631850}, - {0x0233, 0x0082, 0xdbdd914e}, - {0x04c9, 0x0075, 0x5a400a9e}, - {0x0b66, 0x0099, 0xae128b45}, - {0x000d, 0x00ed, 0xe61c216a}, - {0x0a31, 0x003d, 0xf69663b9}, - {0x00a3, 0x0052, 0x643c39ae}, - {0x0125, 0x00d5, 0x7c310b0d}, - {0x0105, 0x004a, 0x06a77e74}, - {0x0858, 0x008e, 0x265bc739}, - {0x045e, 0x0095, 0x13d6b192}, - {0x0dab, 0x003c, 0xc4498704}, - {0x00cd, 0x00b5, 0x802a4e2d}, - {0x069b, 0x008c, 0x5df60f71}, - {0x0454, 0x006c, 0x5f03d8bb}, - {0x040e, 0x0032, 0x0ce513b5}, - {0x0874, 0x00e2, 0x6a811fb3}, - {0x0521, 0x00b4, 0x93296833}, - {0x0ddc, 0x00cf, 0xf9305338}, - {0x0a70, 0x0023, 0x239549ea}, - {0x083e, 0x0027, 0x2d88ba97}, - {0x0241, 0x00a7, 0xfe0b32e1}, - {0x0dfc, 0x0096, 0x1a11e815}, - {0x023e, 0x001e, 0xebc9a1f3}, - {0x067e, 0x0066, 0xb1067f81}, - {0x09ea, 0x000e, 0x46fd7247}, - {0x036b, 0x008c, 0x1a39acdf}, - {0x078f, 0x0030, 0x964042ab}, - {0x085c, 0x008f, 0x1829edab}, - {0x02ec, 0x009f, 0x6aefa72d}, - {0x043b, 0x00ce, 0x65642ff5}, - {0x0a32, 0x00b8, 0xbd82759e}, - {0x0d3c, 0x0087, 0xf4d66d54}, - {0x09ec, 0x008a, 0x06bfa1ff}, - {0x0902, 0x0015, 0x755025d2}, - {0x08fe, 0x000e, 0xf690ce2d}, - {0x00fb, 0x00dc, 0xe55f1528}, - {0x0eaa, 0x003a, 0x0fe0a8d7}, - {0x05fb, 0x0006, 0x86281cfb}, - {0x0dd1, 0x00a7, 0x60ab51b4}, - 
{0x0005, 0x001b, 0xf51d969b}, - {0x077c, 0x00dd, 0xc2fed268}, - {0x0575, 0x00f5, 0x432c0b1a}, - {0x05be, 0x0088, 0x78baa04b}, - {0x0c89, 0x0068, 0xeda9e428}, - {0x0f5c, 0x0068, 0xec143c76}, - {0x06a8, 0x0009, 0xd72651ce}, - {0x060f, 0x008e, 0x765426cd}, - {0x07b1, 0x0047, 0x2cfcfa0c}, - {0x04f1, 0x0041, 0x55b172f9}, - {0x0e05, 0x00ac, 0x61efde93}, - {0x0bf7, 0x0097, 0x05b83eee}, - {0x04e9, 0x00f3, 0x9928223a}, - {0x023a, 0x0005, 0xdfada9bc}, - {0x0acb, 0x000e, 0x2217cecd}, - {0x0148, 0x0060, 0xbc3f7405}, - {0x0764, 0x0059, 0xcbc201b1}, - {0x021f, 0x0059, 0x5d6b2256}, - {0x0f1e, 0x006c, 0xdefeeb45}, - {0x071c, 0x00b9, 0xb9b59309}, - {0x0564, 0x0063, 0xae064271}, - {0x0b14, 0x0044, 0xdb867d9b}, - {0x0e5a, 0x0055, 0xff06b685}, - {0x015e, 0x00ba, 0x1115ccbc}, - {0x0379, 0x00e6, 0x5f4e58dd}, - {0x013b, 0x0067, 0x4897427e}, - {0x0e64, 0x0071, 0x7af2b7a4}, - {0x0a11, 0x0050, 0x92105726}, - {0x0109, 0x0055, 0xd0d000f9}, - {0x00aa, 0x0022, 0x815d229d}, - {0x09ac, 0x004f, 0x02f9d985}, - {0x0e1b, 0x00ce, 0x5cf92ab4}, - {0x08af, 0x00d8, 0x17ca72d1}, - {0x0e33, 0x000a, 0xda2dba6b}, - {0x0ee3, 0x006a, 0xb00048e5}, - {0x0648, 0x001a, 0x2364b8cb}, - {0x0315, 0x0085, 0x0596fd0d}, - {0x0fbb, 0x003e, 0x298230ca}, - {0x0422, 0x006a, 0x78ada4ab}, - {0x04ba, 0x0073, 0xced1fbc2}, - {0x007d, 0x0061, 0x4b7ff236}, - {0x070b, 0x00d0, 0x261cf0ae}, - {0x0c1a, 0x0035, 0x8be92ee2}, - {0x0af8, 0x0063, 0x824dcf03}, - {0x08f8, 0x006d, 0xd289710c}, - {0x021b, 0x00ee, 0x6ac1c41d}, - {0x05b5, 0x00da, 0x8e52f0e2}, + {0x0567, 0x0097, 0x96951389, 0xc153aa0d}, + {0x0869, 0x0055, 0x6455ab4f, 0xd07f69bf}, + {0x0c51, 0x00be, 0x8663afde, 0xf9add90c}, + {0x044a, 0x00fc, 0x98fbe432, 0xbf2abb76}, + {0x0f29, 0x0079, 0x42371997, 0x282588b3}, + {0x08ba, 0x0052, 0x942be4f7, 0x2e023547}, + {0x01f2, 0x0013, 0x5262687e, 0x5266287e}, + {0x09e3, 0x00e2, 0x8ffb0908, 0x1da892f3}, + {0x007c, 0x0051, 0xb3158491, 0xb67f9e63}, + {0x0854, 0x001f, 0x83bb20d9, 0x22bb21db}, + {0x031b, 0x0008, 0x98970bdf, 0x9cd70adf}, + {0x0de7, 0x0027, 0xbfbf6f6c, 0xae3f296c}, + {0x0f76, 0x0005, 0x906a7105, 0x906a7105}, + {0x092e, 0x00d0, 0x86631850, 0xa3f6ac04}, + {0x0233, 0x0082, 0xdbdd914e, 0x5d8c7aac}, + {0x04c9, 0x0075, 0x5a400a9e, 0x12f60711}, + {0x0b66, 0x0099, 0xae128b45, 0x7551310d}, + {0x000d, 0x00ed, 0xe61c216a, 0xc22d3c4c}, + {0x0a31, 0x003d, 0xf69663b9, 0x51960bf8}, + {0x00a3, 0x0052, 0x643c39ae, 0xa93c73a8}, + {0x0125, 0x00d5, 0x7c310b0d, 0xf221cbb3}, + {0x0105, 0x004a, 0x06a77e74, 0xa4ef4561}, + {0x0858, 0x008e, 0x265bc739, 0xd6c36d9b}, + {0x045e, 0x0095, 0x13d6b192, 0x5f5c1d62}, + {0x0dab, 0x003c, 0xc4498704, 0x10414654}, + {0x00cd, 0x00b5, 0x802a4e2d, 0xfbd17c9d}, + {0x069b, 0x008c, 0x5df60f71, 0x91ddca5f}, + {0x0454, 0x006c, 0x5f03d8bb, 0x5c59fce0}, + {0x040e, 0x0032, 0x0ce513b5, 0xa8cd99b1}, + {0x0874, 0x00e2, 0x6a811fb3, 0xca028316}, + {0x0521, 0x00b4, 0x93296833, 0x2c4d4880}, + {0x0ddc, 0x00cf, 0xf9305338, 0x2c94210d}, + {0x0a70, 0x0023, 0x239549ea, 0x22b561aa}, + {0x083e, 0x0027, 0x2d88ba97, 0x5cd8bb9d}, + {0x0241, 0x00a7, 0xfe0b32e1, 0x17b506b8}, + {0x0dfc, 0x0096, 0x1a11e815, 0xee4141bd}, + {0x023e, 0x001e, 0xebc9a1f3, 0x5689a1f3}, + {0x067e, 0x0066, 0xb1067f81, 0xd9952571}, + {0x09ea, 0x000e, 0x46fd7247, 0x42b57245}, + {0x036b, 0x008c, 0x1a39acdf, 0x58bf1586}, + {0x078f, 0x0030, 0x964042ab, 0xb04218b9}, + {0x085c, 0x008f, 0x1829edab, 0x9ceca89c}, + {0x02ec, 0x009f, 0x6aefa72d, 0x634cc2a7}, + {0x043b, 0x00ce, 0x65642ff5, 0x6c8a584e}, + {0x0a32, 0x00b8, 0xbd82759e, 0x0f96a34f}, + {0x0d3c, 0x0087, 0xf4d66d54, 0xb71ba5f4}, + {0x09ec, 0x008a, 
0x06bfa1ff, 0x576ca80f}, + {0x0902, 0x0015, 0x755025d2, 0x517225c2}, + {0x08fe, 0x000e, 0xf690ce2d, 0xf690cf3d}, + {0x00fb, 0x00dc, 0xe55f1528, 0x707d7d92}, + {0x0eaa, 0x003a, 0x0fe0a8d7, 0x87638cc5}, + {0x05fb, 0x0006, 0x86281cfb, 0x86281cf9}, + {0x0dd1, 0x00a7, 0x60ab51b4, 0xe28ef00c}, + {0x0005, 0x001b, 0xf51d969b, 0xe71dd6d3}, + {0x077c, 0x00dd, 0xc2fed268, 0xdc30c555}, + {0x0575, 0x00f5, 0x432c0b1a, 0x81dd7d16}, + {0x05be, 0x0088, 0x78baa04b, 0xd69b433e}, + {0x0c89, 0x0068, 0xeda9e428, 0xe9b4fa0a}, + {0x0f5c, 0x0068, 0xec143c76, 0x9947067a}, + {0x06a8, 0x0009, 0xd72651ce, 0xd72651ee}, + {0x060f, 0x008e, 0x765426cd, 0x2099626f}, + {0x07b1, 0x0047, 0x2cfcfa0c, 0x1a4baa07}, + {0x04f1, 0x0041, 0x55b172f9, 0x15331a79}, + {0x0e05, 0x00ac, 0x61efde93, 0x320568cc}, + {0x0bf7, 0x0097, 0x05b83eee, 0xc72fb7a3}, + {0x04e9, 0x00f3, 0x9928223a, 0xe8c77de2}, + {0x023a, 0x0005, 0xdfada9bc, 0xdfadb9be}, + {0x0acb, 0x000e, 0x2217cecd, 0x0017d6cd}, + {0x0148, 0x0060, 0xbc3f7405, 0xf5fd6615}, + {0x0764, 0x0059, 0xcbc201b1, 0xbb089bf4}, + {0x021f, 0x0059, 0x5d6b2256, 0xa16a0a59}, + {0x0f1e, 0x006c, 0xdefeeb45, 0xfc34f9d6}, + {0x071c, 0x00b9, 0xb9b59309, 0xb645eae2}, + {0x0564, 0x0063, 0xae064271, 0x954dc6d1}, + {0x0b14, 0x0044, 0xdb867d9b, 0xdf432309}, + {0x0e5a, 0x0055, 0xff06b685, 0xa65ff257}, + {0x015e, 0x00ba, 0x1115ccbc, 0x11c365f4}, + {0x0379, 0x00e6, 0x5f4e58dd, 0x2d176d31}, + {0x013b, 0x0067, 0x4897427e, 0xc40532fe}, + {0x0e64, 0x0071, 0x7af2b7a4, 0x1fb7bf43}, + {0x0a11, 0x0050, 0x92105726, 0xb1185e51}, + {0x0109, 0x0055, 0xd0d000f9, 0x60a60bfd}, + {0x00aa, 0x0022, 0x815d229d, 0x215d379c}, + {0x09ac, 0x004f, 0x02f9d985, 0x10b90b20}, + {0x0e1b, 0x00ce, 0x5cf92ab4, 0x6a477573}, + {0x08af, 0x00d8, 0x17ca72d1, 0x385af156}, + {0x0e33, 0x000a, 0xda2dba6b, 0xda2dbb69}, + {0x0ee3, 0x006a, 0xb00048e5, 0xa9a2decc}, + {0x0648, 0x001a, 0x2364b8cb, 0x3364b1cb}, + {0x0315, 0x0085, 0x0596fd0d, 0xa651740f}, + {0x0fbb, 0x003e, 0x298230ca, 0x7fc617c7}, + {0x0422, 0x006a, 0x78ada4ab, 0xc576ae2a}, + {0x04ba, 0x0073, 0xced1fbc2, 0xaac8455b}, + {0x007d, 0x0061, 0x4b7ff236, 0x347d5739}, + {0x070b, 0x00d0, 0x261cf0ae, 0xc7fb1c10}, + {0x0c1a, 0x0035, 0x8be92ee2, 0x8be9b4e1}, + {0x0af8, 0x0063, 0x824dcf03, 0x53010388}, + {0x08f8, 0x006d, 0xd289710c, 0x30418edd}, + {0x021b, 0x00ee, 0x6ac1c41d, 0x2557e9a3}, + {0x05b5, 0x00da, 0x8e52f0e2, 0x98531012}, }; int __init @@ -644,12 +648,19 @@ xfs_dahash_test(void) unsigned int errors = 0; for (i = 0; i < ARRAY_SIZE(test); i++) { + struct xfs_name xname = { }; xfs_dahash_t hash; hash = xfs_da_hashname(test_buf + test[i].start, test[i].length); if (hash != test[i].dahash) errors++; + + xname.name = test_buf + test[i].start; + xname.len = test[i].length; + hash = xfs_ascii_ci_hashname(&xname); + if (hash != test[i].ascii_ci_dahash) + errors++; } if (errors) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8fb90da89787..7f071757f278 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -798,7 +798,6 @@ xfs_qm_dqget_cache_insert( error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { /* Duplicate found! Caller must try again. */ - WARN_ON(error != -EEXIST); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); return error; diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c new file mode 100644 index 000000000000..005a66be44a2 --- /dev/null +++ b/fs/xfs/xfs_drain.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_trace.h" + +/* + * Use a static key here to reduce the overhead of xfs_drain_rele. If the + * compiler supports jump labels, the static branch will be replaced by a nop + * sled when there are no xfs_drain_wait callers. Online fsck is currently + * the only caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate); + +void +xfs_drain_wait_disable(void) +{ + static_branch_dec(&xfs_drain_waiter_gate); +} + +void +xfs_drain_wait_enable(void) +{ + static_branch_inc(&xfs_drain_waiter_gate); +} + +void +xfs_defer_drain_init( + struct xfs_defer_drain *dr) +{ + atomic_set(&dr->dr_count, 0); + init_waitqueue_head(&dr->dr_waiters); +} + +void +xfs_defer_drain_free(struct xfs_defer_drain *dr) +{ + ASSERT(atomic_read(&dr->dr_count) == 0); +} + +/* Increase the pending intent count. */ +static inline void xfs_defer_drain_grab(struct xfs_defer_drain *dr) +{ + atomic_inc(&dr->dr_count); +} + +static inline bool has_waiters(struct wait_queue_head *wq_head) +{ + /* + * This memory barrier is paired with the one in set_current_state on + * the waiting side. + */ + smp_mb__after_atomic(); + return waitqueue_active(wq_head); +} + +/* Decrease the pending intent count, and wake any waiters, if appropriate. */ +static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr) +{ + if (atomic_dec_and_test(&dr->dr_count) && + static_branch_unlikely(&xfs_drain_waiter_gate) && + has_waiters(&dr->dr_waiters)) + wake_up(&dr->dr_waiters); +} + +/* Are there intents pending? */ +static inline bool xfs_defer_drain_busy(struct xfs_defer_drain *dr) +{ + return atomic_read(&dr->dr_count) > 0; +} + +/* + * Wait for the pending intent count for a drain to hit zero. + * + * Callers must not hold any locks that would prevent intents from being + * finished. + */ +static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr) +{ + return wait_event_killable(dr->dr_waiters, !xfs_defer_drain_busy(dr)); +} + +/* + * Get a passive reference to an AG and declare an intent to update its + * metadata. + */ +struct xfs_perag * +xfs_perag_intent_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + if (!pag) + return NULL; + + xfs_perag_intent_hold(pag); + return pag; +} + +/* + * Release our intent to update this AG's metadata, and then release our + * passive ref to the AG. + */ +void +xfs_perag_intent_put( + struct xfs_perag *pag) +{ + xfs_perag_intent_rele(pag); + xfs_perag_put(pag); +} + +/* + * Declare an intent to update AG metadata. Other threads that need exclusive + * access can decide to back off if they see declared intentions. + */ +void +xfs_perag_intent_hold( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_hold(pag, __return_address); + xfs_defer_drain_grab(&pag->pag_intents_drain); +} + +/* Release our intent to update this AG's metadata. 
*/ +void +xfs_perag_intent_rele( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_rele(pag, __return_address); + xfs_defer_drain_rele(&pag->pag_intents_drain); +} + +/* + * Wait for the intent update count for this AG to hit zero. + * Callers must not hold any AG header buffers. + */ +int +xfs_perag_intent_drain( + struct xfs_perag *pag) +{ + trace_xfs_perag_wait_intents(pag, __return_address); + return xfs_defer_drain_wait(&pag->pag_intents_drain); +} + +/* Has anyone declared an intent to update this AG? */ +bool +xfs_perag_intent_busy( + struct xfs_perag *pag) +{ + return xfs_defer_drain_busy(&pag->pag_intents_drain); +} diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h new file mode 100644 index 000000000000..50a5772a8296 --- /dev/null +++ b/fs/xfs/xfs_drain.h @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef XFS_DRAIN_H_ +#define XFS_DRAIN_H_ + +struct xfs_perag; + +#ifdef CONFIG_XFS_DRAIN_INTENTS +/* + * Passive drain mechanism. This data structure tracks a count of some items + * and contains a waitqueue for callers who would like to wake up when the + * count hits zero. + */ +struct xfs_defer_drain { + /* Number of items pending in some part of the filesystem. */ + atomic_t dr_count; + + /* Queue to wait for dr_count to go to zero */ + struct wait_queue_head dr_waiters; +}; + +void xfs_defer_drain_init(struct xfs_defer_drain *dr); +void xfs_defer_drain_free(struct xfs_defer_drain *dr); + +void xfs_drain_wait_disable(void); +void xfs_drain_wait_enable(void); + +/* + * Deferred Work Intent Drains + * =========================== + * + * When a writer thread executes a chain of log intent items, the AG header + * buffer locks will cycle during a transaction roll to get from one intent + * item to the next in a chain. Although scrub takes all AG header buffer + * locks, this isn't sufficient to guard against scrub checking an AG while + * that writer thread is in the middle of finishing a chain because there's no + * higher level locking primitive guarding allocation groups. + * + * When there's a collision, cross-referencing between data structures (e.g. + * rmapbt and refcountbt) yields false corruption events; if repair is running, + * this results in incorrect repairs, which is catastrophic. + * + * The solution is to add to the perag structure a count of active intents and + * to make scrub wait until it holds both AG header buffer locks and the intent + * counter has reached zero. It is therefore critical that deferred work + * threads hold the AGI or AGF buffers when decrementing the intent counter. + * + * Given a list of deferred work items, the deferred work manager will complete + * a work item and all the sub-items that the parent item creates before moving + * on to the next work item in the list. This is also true for all levels of + * sub-items. Writer threads are permitted to queue multiple work items + * targeting the same AG, so a deferred work item (such as a BUI) that creates + * sub-items (such as RUIs) must bump the intent counter and maintain it until + * the sub-items can themselves bump the intent counter. + * + * Therefore, the intent count tracks entire lifetimes of deferred work items. + * All functions that create work items must increment the intent counter as + * soon as the item is added to the transaction and cannot drop the counter + * until the item is finished or cancelled.
+ */ +struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp, + xfs_agnumber_t agno); +void xfs_perag_intent_put(struct xfs_perag *pag); + +void xfs_perag_intent_hold(struct xfs_perag *pag); +void xfs_perag_intent_rele(struct xfs_perag *pag); + +int xfs_perag_intent_drain(struct xfs_perag *pag); +bool xfs_perag_intent_busy(struct xfs_perag *pag); +#else +struct xfs_defer_drain { /* empty */ }; + +#define xfs_defer_drain_free(dr) ((void)0) +#define xfs_defer_drain_init(dr) ((void)0) + +#define xfs_perag_intent_get(mp, agno) xfs_perag_get((mp), (agno)) +#define xfs_perag_intent_put(pag) xfs_perag_put(pag) + +static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { } +static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { } + +#endif /* CONFIG_XFS_DRAIN_INTENTS */ + +#endif /* XFS_DRAIN_H_ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 011b50469301..f9e36b810663 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -351,8 +351,6 @@ xfs_trans_free_extent( struct xfs_mount *mp = tp->t_mountp; struct xfs_extent *extp; uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, - xefi->xefi_startblock); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); int error; @@ -363,12 +361,13 @@ xfs_trans_free_extent( if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, - xefi->xefi_blockcount); + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); - error = __xfs_free_extent(tp, xefi->xefi_startblock, + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: @@ -396,14 +395,13 @@ xfs_extent_free_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_extent_free_item *ra; struct xfs_extent_free_item *rb; ra = container_of(a, struct xfs_extent_free_item, xefi_list); rb = container_of(b, struct xfs_extent_free_item, xefi_list); - return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) - - XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); + + return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno; } /* Log a free extent to the intent item. */ @@ -462,6 +460,26 @@ xfs_extent_free_create_done( return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; } +/* Take a passive ref to the AG containing the space we're freeing. */ +void +xfs_extent_free_get_group( + struct xfs_mount *mp, + struct xfs_extent_free_item *xefi) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); + xefi->xefi_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after some freeing work. */ +static inline void +xfs_extent_free_put_group( + struct xfs_extent_free_item *xefi) +{ + xfs_perag_intent_put(xefi->xefi_pag); +} + /* Process a free extent. 
*/ STATIC int xfs_extent_free_finish_item( @@ -476,6 +494,8 @@ xfs_extent_free_finish_item( xefi = container_of(item, struct xfs_extent_free_item, xefi_list); error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -496,6 +516,8 @@ xfs_extent_free_cancel_item( struct xfs_extent_free_item *xefi; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); } @@ -526,24 +548,21 @@ xfs_agfl_free_finish_item( struct xfs_extent *extp; struct xfs_buf *agbp; int error; - xfs_agnumber_t agno; xfs_agblock_t agbno; uint next_extent; - struct xfs_perag *pag; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ASSERT(xefi->xefi_blockcount == 1); - agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); oinfo.oi_owner = xefi->xefi_owner; - trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); + trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno, + xefi->xefi_blockcount); - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); + error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp); if (!error) - error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo); - xfs_perag_put(pag); + error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, + agbno, agbp, &oinfo); /* * Mark the transaction dirty, even on error. This ensures the @@ -562,6 +581,7 @@ xfs_agfl_free_finish_item( extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -632,7 +652,9 @@ xfs_efi_item_recover( fake.xefi_startblock = extp->ext_start; fake.xefi_blockcount = extp->ext_len; + xfs_extent_free_get_group(mp, &fake); error = xfs_trans_free_extent(tp, efdp, &fake); + xfs_extent_free_put_group(&fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c9a7e270a428..351849fc18ff 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -767,7 +767,8 @@ again: return 0; out_error_or_again: - if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { + if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) && + error == -EAGAIN) { delay(1); goto again; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 6cd180721659..87910191a9dd 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -34,10 +34,13 @@ struct xfs_icwalk { /* * Flags for xfs_iget() */ -#define XFS_IGET_CREATE 0x1 -#define XFS_IGET_UNTRUSTED 0x2 -#define XFS_IGET_DONTCACHE 0x4 -#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ +#define XFS_IGET_CREATE (1U << 0) +#define XFS_IGET_UNTRUSTED (1U << 1) +#define XFS_IGET_DONTCACHE (1U << 2) +/* don't read from disk or reinit */ +#define XFS_IGET_INCORE (1U << 3) +/* Return -EAGAIN immediately if the inode is unavailable. 
*/ +#define XFS_IGET_NORETRY (1U << 4) int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c index 43005ce8bd48..2ddccb172fa0 100644 --- a/fs/xfs/xfs_iunlink_item.c +++ b/fs/xfs/xfs_iunlink_item.c @@ -168,9 +168,7 @@ xfs_iunlink_log_inode( iup->ip = ip; iup->next_agino = next_agino; iup->old_agino = ip->i_next_unlinked; - - atomic_inc(&pag->pag_ref); - iup->pag = pag; + iup->pag = xfs_perag_hold(pag); xfs_trans_add_item(tp, &iup->item); tp->t_flags |= XFS_TRANS_DIRTY; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 21be93bf006d..b3275e8d47b6 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -667,11 +667,10 @@ xfs_iwalk_threaded( iwag->mp = mp; /* - * perag is being handed off to async work, so take another + * perag is being handed off to async work, so take a passive * reference for the async work to release. */ - atomic_inc(&pag->pag_ref); - iwag->pag = pag; + iwag->pag = xfs_perag_hold(pag); iwag->iwalk_fn = iwalk_fn; iwag->data = data; iwag->startino = startino; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index e88f18f85e4b..74dcb05069e8 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -80,6 +80,7 @@ typedef __u32 xfs_nlink_t; #include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" +#include "xfs_drain.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 48d771a76add..edd8587658d5 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -279,14 +280,13 @@ xfs_refcount_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_refcount_intent *ra; struct xfs_refcount_intent *rb; ra = container_of(a, struct xfs_refcount_intent, ri_list); rb = container_of(b, struct xfs_refcount_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_startblock); + + return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } /* Set the phys extent flags for this reverse mapping. */ @@ -365,6 +365,26 @@ xfs_refcount_update_create_done( return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; } +/* Take a passive ref to the AG containing the space we're refcounting. */ +void +xfs_refcount_update_get_group( + struct xfs_mount *mp, + struct xfs_refcount_intent *ri) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock); + ri->ri_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing refcounting work. */ +static inline void +xfs_refcount_update_put_group( + struct xfs_refcount_intent *ri) +{ + xfs_perag_intent_put(ri->ri_pag); +} + /* Process a deferred refcount update. 
*/ STATIC int xfs_refcount_update_finish_item( @@ -386,6 +406,8 @@ xfs_refcount_update_finish_item( ri->ri_type == XFS_REFCOUNT_DECREASE); return -EAGAIN; } + + xfs_refcount_update_put_group(ri); kmem_cache_free(xfs_refcount_intent_cache, ri); return error; } @@ -406,6 +428,8 @@ xfs_refcount_update_cancel_item( struct xfs_refcount_intent *ri; ri = container_of(item, struct xfs_refcount_intent, ri_list); + + xfs_refcount_update_put_group(ri); kmem_cache_free(xfs_refcount_intent_cache, ri); } @@ -520,9 +544,13 @@ xfs_cui_item_recover( fake.ri_startblock = pmap->pe_startblock; fake.ri_blockcount = pmap->pe_len; - if (!requeue_only) + + if (!requeue_only) { + xfs_refcount_update_get_group(mp, &fake); error = xfs_trans_log_finish_refcount_update(tp, cudp, &fake, &rcur); + xfs_refcount_update_put_group(&fake); + } if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a1619d67015f..520c7ebdfed8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; @@ -320,14 +321,13 @@ xfs_rmap_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_rmap_intent *ra; struct xfs_rmap_intent *rb; ra = container_of(a, struct xfs_rmap_intent, ri_list); rb = container_of(b, struct xfs_rmap_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); + + return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } /* Log rmap updates in the intent item. */ @@ -390,6 +390,26 @@ xfs_rmap_update_create_done( return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; } +/* Take a passive ref to the AG containing the space we're rmapping. */ +void +xfs_rmap_update_get_group( + struct xfs_mount *mp, + struct xfs_rmap_intent *ri) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); + ri->ri_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing rmapping work. */ +static inline void +xfs_rmap_update_put_group( + struct xfs_rmap_intent *ri) +{ + xfs_perag_intent_put(ri->ri_pag); +} + /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( @@ -405,6 +425,8 @@ xfs_rmap_update_finish_item( error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, state); + + xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); return error; } @@ -425,6 +447,8 @@ xfs_rmap_update_cancel_item( struct xfs_rmap_intent *ri; ri = container_of(item, struct xfs_rmap_intent, ri_list); + + xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); } @@ -559,11 +583,13 @@ xfs_rui_item_recover( fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, &fake); error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, sizeof(*map)); + xfs_rmap_update_put_group(&fake); if (error) goto abort_error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4f814f9e12ab..4d2e87462ac4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1548,6 +1548,19 @@ xfs_fs_fill_super( #endif } + /* ASCII case insensitivity is undergoing deprecation. 
*/ + if (xfs_has_asciici(mp)) { +#ifdef CONFIG_XFS_SUPPORT_ASCII_CI + xfs_warn_once(mp, + "Deprecated ASCII case-insensitivity feature (ascii-ci=1) will not be supported after September 2030."); +#else + xfs_warn(mp, + "Deprecated ASCII case-insensitivity feature (ascii-ci=1) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; +#endif + } + /* Filesystem claims it needs repair, so refuse the mount. */ if (xfs_has_needsrepair(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9c0006c55fec..cd4ca5b1fcb0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -190,6 +190,7 @@ DEFINE_EVENT(xfs_perag_class, name, \ TP_ARGS(pag, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_hold); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_grab); DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag); @@ -2686,6 +2687,44 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); +DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, + void *item), + TP_ARGS(mp, dfp, item), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(void *, intent) + __field(void *, item) + __field(char, committed) + __field(int, nr) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->type = dfp->dfp_type; + __entry->intent = dfp->dfp_intent; + __entry->item = item; + __entry->committed = dfp->dfp_done != NULL; + __entry->nr = dfp->dfp_count; + ), + TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->intent, + __entry->item, + __entry->committed, + __entry->nr) +) +#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_defer_pending_item_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \ + void *item), \ + TP_ARGS(mp, dfp, item)) + +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); + /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -4325,6 +4364,39 @@ TRACE_EVENT(xfs_force_shutdown, __entry->line_num) ); +#ifdef CONFIG_XFS_DRAIN_INTENTS +DECLARE_EVENT_CLASS(xfs_perag_intents_class, + TP_PROTO(struct xfs_perag *pag, void *caller_ip), + TP_ARGS(pag, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(long, nr_intents) + __field(void *, caller_ip) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->nr_intents, + __entry->caller_ip) +); + +#define DEFINE_PERAG_INTENTS_EVENT(name) \ +DEFINE_EVENT(xfs_perag_intents_class, name, \ + TP_PROTO(struct xfs_perag *pag, void *caller_ip), \ + TP_ARGS(pag, caller_ip)) +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); + +#endif /* 
CONFIG_XFS_DRAIN_INTENTS */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH
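
Editor's note: the extra column in the xfs_dahash_test vectors above exists because the ascii-ci hash folds uppercase ASCII to lowercase before mixing each byte into the rolling dirent hash. The standalone sketch below is patterned on xfs_da_hashname() and the now-stabilized xfs_ascii_ci_hashname(), reconstructed here for illustration rather than copied from the tree, and shows why names that differ only in ASCII case collide under the ci variant.

/*
 * Userspace model of the two dirent hashes exercised by the test table:
 * the standard rolling hash and the ascii-ci variant, which transforms
 * each byte with an A-Z -> a-z fold before mixing it in.
 */
#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t word, unsigned int shift)
{
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
}

/* Rolling hash, four bytes per round where possible. */
static uint32_t da_hashname(const uint8_t *name, int namelen)
{
	uint32_t hash;

	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
		       (name[3] << 0) ^ rol32(hash, 7 * 4);

	switch (namelen) {
	case 3:
		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
		       rol32(hash, 7 * 3);
	case 2:
		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
	case 1:
		return (name[0] << 0) ^ rol32(hash, 7 * 1);
	default:
		return hash;
	}
}

/* Fold ASCII uppercase to lowercase, as the ascii-ci feature does. */
static uint8_t ascii_ci_xfrm(uint8_t c)
{
	return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
}

/* One byte per round so the transform applies to every character. */
static uint32_t ascii_ci_hashname(const uint8_t *name, int namelen)
{
	uint32_t hash = 0;
	int i;

	for (i = 0; i < namelen; i++)
		hash = ascii_ci_xfrm(name[i]) ^ rol32(hash, 7);

	return hash;
}

int main(void)
{
	const uint8_t a[] = "MixedCase", b[] = "mixedcase";
	int len = sizeof(a) - 1;

	/* The ci hashes match; the plain hashes generally do not. */
	printf("dahash:    %08x vs %08x\n",
	       (unsigned)da_hashname(a, len), (unsigned)da_hashname(b, len));
	printf("ci dahash: %08x vs %08x\n",
	       (unsigned)ascii_ci_hashname(a, len),
	       (unsigned)ascii_ci_hashname(b, len));
	return 0;
}

Because the byte-at-a-time loop is algebraically equal to the four-bytes-at-a-time loop for byte-sized inputs, rows whose sampled bytes happen to contain no ASCII uppercase characters carry the same value in both columns of the table, e.g. {0x0f76, 0x0005, 0x906a7105, 0x906a7105}.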
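
Editor's note: the defer drain added in xfs_drain.c reduces to a small pattern: an atomic count of in-flight intents, a waitqueue, and a waiter that sleeps until the count hits zero. Below is a minimal standalone C model of that pattern, with a pthread condition variable standing in for the kernel waitqueue; the names (defer_drain, drain_grab, drain_rele, drain_wait) are illustrative only, not kernel API.

/*
 * Userspace model of the xfs_defer_drain pattern: writers bump the
 * count when they queue an intent and drop it when the intent is
 * finished; a scrub-like waiter sleeps until the count reaches zero.
 * Build with: cc -pthread drain_model.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct defer_drain {
	atomic_int	dr_count;	/* pending intents */
	pthread_mutex_t	dr_lock;	/* protects the condvar */
	pthread_cond_t	dr_waiters;	/* signalled when count hits zero */
};

static void drain_init(struct defer_drain *dr)
{
	atomic_init(&dr->dr_count, 0);
	pthread_mutex_init(&dr->dr_lock, NULL);
	pthread_cond_init(&dr->dr_waiters, NULL);
}

/* Declare an intent; models xfs_defer_drain_grab(). */
static void drain_grab(struct defer_drain *dr)
{
	atomic_fetch_add(&dr->dr_count, 1);
}

/* Finish an intent and wake waiters; models xfs_defer_drain_rele(). */
static void drain_rele(struct defer_drain *dr)
{
	if (atomic_fetch_sub(&dr->dr_count, 1) == 1) {
		pthread_mutex_lock(&dr->dr_lock);
		pthread_cond_broadcast(&dr->dr_waiters);
		pthread_mutex_unlock(&dr->dr_lock);
	}
}

/* Wait for the count to hit zero; models xfs_defer_drain_wait(). */
static void drain_wait(struct defer_drain *dr)
{
	pthread_mutex_lock(&dr->dr_lock);
	while (atomic_load(&dr->dr_count) > 0)
		pthread_cond_wait(&dr->dr_waiters, &dr->dr_lock);
	pthread_mutex_unlock(&dr->dr_lock);
}

static void *writer_thread(void *arg)
{
	struct defer_drain *dr = arg;

	sleep(1);		/* simulate finishing an intent chain */
	drain_rele(dr);
	return NULL;
}

int main(void)
{
	struct defer_drain dr;
	pthread_t tid;

	drain_init(&dr);
	drain_grab(&dr);	/* intent added to a transaction */
	pthread_create(&tid, NULL, writer_thread, &dr);

	drain_wait(&dr);	/* "scrub" waits for the AG to go idle */
	printf("drained; safe to cross-reference\n");
	pthread_join(tid, NULL);
	return 0;
}

The kernel version additionally hides the waitqueue check in xfs_defer_drain_rele() behind the xfs_drain_waiter_gate static key, so that while online fsck is idle the release path costs only a patched-out branch; the mutex taken around the broadcast above plays the same lost-wakeup-prevention role as the memory-barrier pairing documented in has_waiters().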