From c8c568259772751a14e969b7230990508de73d9d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Mar 2022 11:54:18 -0700 Subject: xfs: don't include bnobt blocks when reserving free block pool xfs_reserve_blocks controls the size of the user-visible free space reserve pool. Given the difference between the current and requested pool sizes, it will try to reserve free space from fdblocks. However, the amount requested from fdblocks is also constrained by the amount of space that we think xfs_mod_fdblocks will give us. If we forget to subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will will return ENOSPC and we'll hang the kernel at mount due to the infinite loop. In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand out the "free space" used by the free space btrees, because some portion of the free space btrees hold in reserve space for future btree expansion. Unfortunately, xfs_reserve_blocks' estimation of the number of blocks that it could request from xfs_mod_fdblocks was not updated to include m_allocbt_blks, so if space is extremely low, the caller hangs. Fix this by creating a function to estimate the number of blocks that can be reserved from fdblocks, which needs to exclude the set-aside and m_allocbt_blks. Found by running xfs/306 (which formats a single-AG 20MB filesystem) with an fstests configuration that specifies a 1k blocksize and a specially crafted log size that will consume 7/8 of the space (17920 blocks, specifically) in that AG. Cc: Brian Foster Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/xfs_fsops.c') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 33e26690a8c4..710e857bb825 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -434,7 +434,7 @@ xfs_reserve_blocks( error = -ENOSPC; do { free = percpu_counter_sum(&mp->m_fdblocks) - - mp->m_alloc_set_aside; + xfs_fdblocks_unavailable(mp); if (free <= 0) break; -- cgit v1.2.3-58-ga151 From 15f04fdc75aaaa1cccb0b8b3af1be290e118a7bc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 11 Mar 2022 10:56:01 -0800 Subject: xfs: remove infinite loop when reserving free block pool Infinite loops in kernel code are scary. Calls to xfs_reserve_blocks should be rare (people should just use the defaults!) so we really don't need to try so hard. Simplify the logic here by removing the infinite loop. Cc: Brian Foster Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) (limited to 'fs/xfs/xfs_fsops.c') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 710e857bb825..3c6d9d6836ef 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -430,46 +430,36 @@ xfs_reserve_blocks( * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. + * + * The code below estimates how many blocks it can request from + * fdblocks to stash in the reserve pool. This is a classic TOCTOU + * race since fdblocks updates are not always coordinated via + * m_sb_lock. */ - error = -ENOSPC; - do { - free = percpu_counter_sum(&mp->m_fdblocks) - + free = percpu_counter_sum(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); - if (free <= 0) - break; - - delta = request - mp->m_resblks; - lcounter = free - delta; - if (lcounter < 0) - /* We can't satisfy the request, just get what we can */ - fdblks_delta = free; - else - fdblks_delta = delta; - + delta = request - mp->m_resblks; + if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block - * count or we'll get an ENOSPC. If we get a ENOSPC, it means - * things changed while we were calculating fdblks_delta and so - * we should try again to see if there is anything left to - * reserve. - * - * Don't set the reserved flag here - we don't want to reserve - * the extra reserve blocks from the reserve..... + * count or we'll get an ENOSPC. Don't set the reserved flag + * here - we don't want to reserve the extra reserve blocks + * from the reserve. */ + fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); spin_lock(&mp->m_sb_lock); - } while (error == -ENOSPC); - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ - if (!error && fdblks_delta) { - mp->m_resblks += fdblks_delta; - mp->m_resblks_avail += fdblks_delta; + /* + * Update the reserve counters if blocks have been successfully + * allocated. + */ + if (!error) { + mp->m_resblks += fdblks_delta; + mp->m_resblks_avail += fdblks_delta; + } } - out: if (outval) { outval->resblks = mp->m_resblks; -- cgit v1.2.3-58-ga151 From 0baa2657dc4d79202148be79a3dc36c35f425060 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 24 Mar 2022 12:43:32 -0700 Subject: xfs: always succeed at setting the reserve pool size Nowadays, xfs_mod_fdblocks will always choose to fill the reserve pool with freed blocks before adding to fdblocks. Therefore, we can change the behavior of xfs_reserve_blocks slightly -- setting the target size of the pool should always succeed, since a deficiency will eventually be made up as blocks get freed. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_fsops.c') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3c6d9d6836ef..5c2bea1e12a8 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -434,11 +434,14 @@ xfs_reserve_blocks( * The code below estimates how many blocks it can request from * fdblocks to stash in the reserve pool. This is a classic TOCTOU * race since fdblocks updates are not always coordinated via - * m_sb_lock. + * m_sb_lock. Set the reserve size even if there's not enough free + * space to fill it because mod_fdblocks will refill an undersized + * reserve when it can. */ free = percpu_counter_sum(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); delta = request - mp->m_resblks; + mp->m_resblks = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -455,10 +458,8 @@ xfs_reserve_blocks( * Update the reserve counters if blocks have been successfully * allocated. */ - if (!error) { - mp->m_resblks += fdblks_delta; + if (!error) mp->m_resblks_avail += fdblks_delta; - } } out: if (outval) { -- cgit v1.2.3-58-ga151 From 82be38bcf8a2e056b4c99ce79a3827fa743df6ec Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 24 Mar 2022 10:57:07 -0700 Subject: xfs: fix overfilling of reserve pool Due to cycling of m_sb_lock, it's possible for multiple callers of xfs_reserve_blocks to race at changing the pool size, subtracting blocks from fdblocks, and actually putting it in the pool. The result of all this is that we can overfill the reserve pool to hilarious levels. xfs_mod_fdblocks, when called with a positive value, already knows how to take freed blocks and either fill the reserve until it's full, or put them in fdblocks. Use that instead of setting m_resblks_avail directly. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/xfs/xfs_fsops.c') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5c2bea1e12a8..5b5b68affe66 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -448,18 +448,17 @@ xfs_reserve_blocks( * count or we'll get an ENOSPC. Don't set the reserved flag * here - we don't want to reserve the extra reserve blocks * from the reserve. + * + * The desired reserve size can change after we drop the lock. + * Use mod_fdblocks to put the space into the reserve or into + * fdblocks as appropriate. */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); - spin_lock(&mp->m_sb_lock); - - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ if (!error) - mp->m_resblks_avail += fdblks_delta; + xfs_mod_fdblocks(mp, fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); } out: if (outval) { -- cgit v1.2.3-58-ga151 From 85bcfa26f9a3782be37d4feafd49668b98b8bdbe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Mar 2022 13:38:43 -0700 Subject: xfs: don't report reserved bnobt space as available On a modern filesystem, we don't allow userspace to allocate blocks for data storage from the per-AG space reservations, the user-controlled reservation pool that prevents ENOSPC in the middle of internal operations, or the internal per-AG set-aside that prevents unwanted filesystem shutdowns due to ENOSPC during a bmap btree split. Since we now consider freespace btree blocks as unavailable for allocation for data storage, we shouldn't report those blocks via statfs either. This makes the numbers that we return via the statfs f_bavail and f_bfree fields a more conservative estimate of actual free space. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 2 +- fs/xfs/xfs_super.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_fsops.c') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5b5b68affe66..196e2c51309c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -347,7 +347,7 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - mp->m_alloc_set_aside; + xfs_fdblocks_unavailable(mp); spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d84714e4e46a..54be9d64093e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -815,7 +815,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); + statp->f_bfree = max_t(int64_t, 0, + fdblocks - xfs_fdblocks_unavailable(mp)); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); -- cgit v1.2.3-58-ga151 From 41e6362183589afd2cd51d653e277d256daab11f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:01 -0700 Subject: xfs: xfs_do_force_shutdown needs to block racing shutdowns When we call xfs_forced_shutdown(), the caller often expects the filesystem to be completely shut down when it returns. However, if we have racing xfs_forced_shutdown() calls, the first caller sets the mount shutdown flag then goes to shutdown the log. The second caller sees the mount shutdown flag and returns immediately - it does not wait for the log to be shut down. Unfortunately, xfs_forced_shutdown() is used in some places that expect it to completely shut down the filesystem before it returns (e.g. xfs_trans_log_inode()). As such, returning before the log has been shut down leaves us in a place where the transaction failed to complete correctly but we still call xfs_trans_commit(). This situation arises because xfs_trans_log_inode() does not return an error and instead calls xfs_force_shutdown() to ensure that the transaction being committed is aborted. Unfortunately, we have a race condition where xfs_trans_commit() needs to check xlog_is_shutdown() because it can't abort log items before the log is shut down, but it needs to use xfs_is_shutdown() because xfs_forced_shutdown() does not block waiting for the log to shut down. To fix this conundrum, first we make all calls to xfs_forced_shutdown() block until the log is also shut down. This means we can then safely use xfs_forced_shutdown() as a mechanism that ensures the currently running transaction will be aborted by xfs_trans_commit() regardless of the shutdown check it uses. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_fsops.c | 6 +++++- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_log_priv.h | 11 +++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs/xfs/xfs_fsops.c') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 196e2c51309c..68f74549fa22 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,6 +17,7 @@ #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" @@ -518,8 +519,11 @@ xfs_do_force_shutdown( int tag; const char *why; - if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) + + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + xlog_shutdown_wait(mp->m_log); return; + } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4188ed752169..678ca01047e1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3923,6 +3923,7 @@ xlog_force_shutdown( xlog_state_shutdown_callbacks(log); spin_unlock(&log->l_icloglock); + wake_up_var(&log->l_opstate); return log_error; } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 23103d68423c..cd0508e26fec 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -484,6 +484,17 @@ xlog_is_shutdown(struct xlog *log) return test_bit(XLOG_IO_ERROR, &log->l_opstate); } +/* + * Wait until the xlog_force_shutdown() has marked the log as shut down + * so xlog_is_shutdown() will always return true. + */ +static inline void +xlog_shutdown_wait( + struct xlog *log) +{ + wait_var_event(&log->l_opstate, xlog_is_shutdown(log)); +} + /* common routines */ extern int xlog_recover( -- cgit v1.2.3-58-ga151