diff options
-rw-r--r-- | fs/xfs/libxfs/xfs_alloc.c | 28 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_alloc.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_bio_io.c | 33 | ||||
-rw-r--r-- | fs/xfs/xfs_fsops.c | 60 | ||||
-rw-r--r-- | fs/xfs/xfs_icache.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_inode_item.c | 162 | ||||
-rw-r--r-- | fs/xfs/xfs_inode_item.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_linux.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_log.c | 109 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 46 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 14 | ||||
-rw-r--r-- | fs/xfs/xfs_log_recover.c | 56 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 15 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.c | 48 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_ail.c | 8 |
18 files changed, 347 insertions, 246 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 353e53b892e6..b52ed339727f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -82,6 +82,24 @@ xfs_prealloc_blocks( } /* + * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * guarantee that we can refill the AGFL prior to allocating space in a nearly + * full AG. Although the the space described by the free space btrees, the + * blocks used by the freesp btrees themselves, and the blocks owned by the + * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk + * free space in the AG drop so low that the free space btrees cannot refill an + * empty AGFL up to the minimum level. Rather than grind through empty AGs + * until the fs goes down, we subtract this many AG blocks from the incore + * fdblocks to ensure user allocation does not overcommit the space the + * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to + * withhold space from xfs_mod_fdblocks, so we do not account for that here. + */ +#define XFS_ALLOCBT_AGFL_RESERVE 4 + +/* + * Compute the number of blocks that we set aside to guarantee the ability to + * refill the AGFL and handle a full bmap btree split. + * * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of * AGF buffer (PV 947395), we place constraints on the relationship among * actual allocations for data blocks, freelist blocks, and potential file data @@ -93,14 +111,14 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a - * potential split of the file's bmap btree. + * For each AG, we need to reserve enough blocks to replenish a totally empty + * AGFL and 4 more to handle a potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); + return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); } /* @@ -124,12 +142,12 @@ xfs_alloc_ag_max_usable( unsigned int blocks; blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ - blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += XFS_ALLOCBT_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ if (xfs_has_rmapbt(mp)) - blocks++; /* rmap root block */ + blocks++; /* rmap root block */ if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 1c14a0b1abea..d4c057b764f9 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -88,7 +88,6 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ -#define XFS_ALLOC_AGFL_RESERVE 4 unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 32fa02945f73..ae4345b37621 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,39 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count) return bio_max_segs(howmany(count, PAGE_SIZE)); } -static void -xfs_flush_bdev_async_endio( - struct bio *bio) -{ - complete(bio->bi_private); -} - -/* - * Submit a request for an async cache flush to run. If the request queue does - * not require flush operations, just skip it altogether. If the caller needs - * to wait for the flush completion at a later point in time, they must supply a - * valid completion. This will be signalled when the flush completes. The - * caller never sees the bio that is issued here. - */ -void -xfs_flush_bdev_async( - struct bio *bio, - struct block_device *bdev, - struct completion *done) -{ - struct request_queue *q = bdev->bd_disk->queue; - - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - complete(done); - return; - } - - bio_init(bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); - bio->bi_private = done; - bio->bi_end_io = xfs_flush_bdev_async_endio; - - submit_bio(bio); -} int xfs_rw_bdev( struct block_device *bdev, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 33e26690a8c4..68f74549fa22 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,6 +17,7 @@ #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" @@ -347,7 +348,7 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - mp->m_alloc_set_aside; + xfs_fdblocks_unavailable(mp); spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; @@ -430,46 +431,36 @@ xfs_reserve_blocks( * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. + * + * The code below estimates how many blocks it can request from + * fdblocks to stash in the reserve pool. This is a classic TOCTOU + * race since fdblocks updates are not always coordinated via + * m_sb_lock. Set the reserve size even if there's not enough free + * space to fill it because mod_fdblocks will refill an undersized + * reserve when it can. */ - error = -ENOSPC; - do { - free = percpu_counter_sum(&mp->m_fdblocks) - - mp->m_alloc_set_aside; - if (free <= 0) - break; - - delta = request - mp->m_resblks; - lcounter = free - delta; - if (lcounter < 0) - /* We can't satisfy the request, just get what we can */ - fdblks_delta = free; - else - fdblks_delta = delta; - + free = percpu_counter_sum(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp); + delta = request - mp->m_resblks; + mp->m_resblks = request; + if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block - * count or we'll get an ENOSPC. If we get a ENOSPC, it means - * things changed while we were calculating fdblks_delta and so - * we should try again to see if there is anything left to - * reserve. + * count or we'll get an ENOSPC. Don't set the reserved flag + * here - we don't want to reserve the extra reserve blocks + * from the reserve. * - * Don't set the reserved flag here - we don't want to reserve - * the extra reserve blocks from the reserve..... + * The desired reserve size can change after we drop the lock. + * Use mod_fdblocks to put the space into the reserve or into + * fdblocks as appropriate. */ + fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + if (!error) + xfs_mod_fdblocks(mp, fdblks_delta, 0); spin_lock(&mp->m_sb_lock); - } while (error == -ENOSPC); - - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ - if (!error && fdblks_delta) { - mp->m_resblks += fdblks_delta; - mp->m_resblks_avail += fdblks_delta; } - out: if (outval) { outval->resblks = mp->m_resblks; @@ -528,8 +519,11 @@ xfs_do_force_shutdown( int tag; const char *why; - if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) + + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + xlog_shutdown_wait(mp->m_log); return; + } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 20186c584c7d..bffd6eb0b298 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -883,7 +883,7 @@ xfs_reclaim_inode( */ if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); - xfs_iflush_abort(ip); + xfs_iflush_shutdown_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 26227d26f274..9de6205fe134 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3631,7 +3631,7 @@ xfs_iflush_cluster( /* * We must use the safe variant here as on shutdown xfs_iflush_abort() - * can remove itself from the list. + * will remove itself from the list. */ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { iip = (struct xfs_inode_log_item *)lip; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 11158fa81a09..9e6ef55cf29e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -544,10 +544,17 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - ASSERT(iip->ili_item.li_buf); + if (!bp || (ip->i_flags & XFS_ISTALE)) { + /* + * Inode item/buffer is being being aborted due to cluster + * buffer deletion. Trigger a log force to have that operation + * completed and items removed from the AIL before the next push + * attempt. + */ + return XFS_ITEM_PINNED; + } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || - (ip->i_flags & XFS_ISTALE)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (xfs_iflags_test(ip, XFS_IFLUSHING)) @@ -834,46 +841,143 @@ xfs_buf_inode_io_fail( } /* - * This is the inode flushing abort routine. It is called when - * the filesystem is shutting down to clean up the inode state. It is - * responsible for removing the inode item from the AIL if it has not been - * re-logged and clearing the inode's flush state. + * Clear the inode logging fields so no more flushes are attempted. If we are + * on a buffer list, it is now safe to remove it because the buffer is + * guaranteed to be locked. The caller will drop the reference to the buffer + * the log item held. + */ +static void +xfs_iflush_abort_clean( + struct xfs_inode_log_item *iip) +{ + iip->ili_last_fields = 0; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); +} + +/* + * Abort flushing the inode from a context holding the cluster buffer locked. + * + * This is the normal runtime method of aborting writeback of an inode that is + * attached to a cluster buffer. It occurs when the inode and the backing + * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster + * flushing or buffer IO completion encounters a log shutdown situation. + * + * If we need to abort inode writeback and we don't already hold the buffer + * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be + * necessary in a shutdown situation. */ void xfs_iflush_abort( struct xfs_inode *ip) { struct xfs_inode_log_item *iip = ip->i_itemp; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp; - if (iip) { - /* - * Clear the failed bit before removing the item from the AIL so - * xfs_trans_ail_delete() doesn't try to clear and release the - * buffer attached to the log item before we are done with it. - */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); - xfs_trans_ail_delete(&iip->ili_item, 0); + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + /* + * Remove the inode item from the AIL before we clear its internal + * state. Whilst the inode is in the AIL, it should have a valid buffer + * pointer for push operations to access - it is only safe to remove the + * inode from the buffer once it has been removed from the AIL. + * + * We also clear the failed bit before removing the item from the AIL + * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer + * references the inode item owns and needs to hold until we've fully + * aborted the inode log item and detached it from the buffer. + */ + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + + /* + * Grab the inode buffer so can we release the reference the inode log + * item holds on it. + */ + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + xfs_iflush_abort_clean(iip); + spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + if (bp) + xfs_buf_rele(bp); +} + +/* + * Abort an inode flush in the case of a shutdown filesystem. This can be called + * from anywhere with just an inode reference and does not require holding the + * inode cluster buffer locked. If the inode is attached to a cluster buffer, + * it will grab and lock it safely, then abort the inode flush. + */ +void +xfs_iflush_shutdown_abort( + struct xfs_inode *ip) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp; + + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + if (!bp) { + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + return; + } + + /* + * We have to take a reference to the buffer so that it doesn't get + * freed when we drop the ili_lock and then wait to lock the buffer. + * We'll clean up the extra reference after we pick up the ili_lock + * again. + */ + xfs_buf_hold(bp); + spin_unlock(&iip->ili_lock); + xfs_buf_lock(bp); + + spin_lock(&iip->ili_lock); + if (!iip->ili_item.li_buf) { /* - * Clear the inode logging fields so no more flushes are - * attempted. + * Raced with another removal, hold the only reference + * to bp now. Inode should not be in the AIL now, so just clean + * up and return; */ - spin_lock(&iip->ili_lock); - iip->ili_last_fields = 0; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_flush_lsn = 0; - bp = iip->ili_item.li_buf; - iip->ili_item.li_buf = NULL; - list_del_init(&iip->ili_item.li_bio_list); + ASSERT(list_empty(&iip->ili_item.li_bio_list)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)); + xfs_iflush_abort_clean(iip); spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + xfs_buf_relse(bp); + return; } - xfs_iflags_clear(ip, XFS_IFLUSHING); - if (bp) - xfs_buf_rele(bp); + + /* + * Got two references to bp. The first will get dropped by + * xfs_iflush_abort() when the item is removed from the buffer list, but + * we can't drop our reference until _abort() returns because we have to + * unlock the buffer as well. Hence we abort and then unlock and release + * our reference to the buffer. + */ + ASSERT(iip->ili_item.li_buf == bp); + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + xfs_buf_relse(bp); } + /* * convert an xfs_inode_log_format struct from the old 32 bit version * (which can have different field alignments) to the native 64 bit version diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 1a302000d604..bbd836a44ff0 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -44,6 +44,7 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_shutdown_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 09a8fba84ff9..cb9105d667db 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); -void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, - struct completion *done); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a8034c0afdf2..499e15b24215 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -487,7 +487,10 @@ out_error: * Run all the pending iclog callbacks and wake log force waiters and iclog * space waiters so they can process the newly set shutdown state. We really * don't care what order we process callbacks here because the log is shut down - * and so state cannot change on disk anymore. + * and so state cannot change on disk anymore. However, we cannot wake waiters + * until the callbacks have been processed because we may be in unmount and + * we must ensure that all AIL operations the callbacks perform have completed + * before we tear down the AIL. * * We avoid processing actively referenced iclogs so that we don't run callbacks * while the iclog owner might still be preparing the iclog for IO submssion. @@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks( struct xlog_in_core *iclog; LIST_HEAD(cb_list); - spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { if (atomic_read(&iclog->ic_refcnt)) { @@ -509,26 +511,22 @@ xlog_state_shutdown_callbacks( continue; } list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); + + spin_lock(&log->l_icloglock); wake_up_all(&iclog->ic_write_wait); wake_up_all(&iclog->ic_force_wait); } while ((iclog = iclog->ic_next) != log->l_iclog); wake_up_all(&log->l_flush_wait); - spin_unlock(&log->l_icloglock); - - xlog_cil_process_committed(&cb_list); } /* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. * - * If the caller passes in a non-zero @old_tail_lsn and the current log tail - * does not match, there may be metadata on disk that must be persisted before - * this iclog is written. To satisfy that requirement, set the - * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new - * log tail value. - * * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the * log tail is updated correctly. NEED_FUA indicates that the iclog will be * written to stable storage, and implies that a commit record is contained @@ -545,12 +543,10 @@ xlog_state_shutdown_callbacks( * always capture the tail lsn on the iclog on the first NEED_FUA release * regardless of the number of active reference counts on this iclog. */ - int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t old_tail_lsn) + struct xlog_in_core *iclog) { xfs_lsn_t tail_lsn; bool last_ref; @@ -561,18 +557,14 @@ xlog_state_release_iclog( /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does - * not move between deciding if a cache flush is required and writing - * the LSN into the iclog below. + * not move between the first time we know that the iclog needs to be + * made stable and when we eventually submit it. */ - if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || + (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && + !iclog->ic_header.h_tail_lsn) { tail_lsn = xlog_assign_tail_lsn(log->l_mp); - - if (old_tail_lsn && tail_lsn != old_tail_lsn) - iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; - - if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) && - !iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } last_ref = atomic_dec_and_test(&iclog->ic_refcnt); @@ -583,11 +575,8 @@ xlog_state_release_iclog( * pending iclog callbacks that were waiting on the release of * this iclog. */ - if (last_ref) { - spin_unlock(&log->l_icloglock); + if (last_ref) xlog_state_shutdown_callbacks(log); - spin_lock(&log->l_icloglock); - } return -EIO; } @@ -600,8 +589,6 @@ xlog_state_release_iclog( } iclog->ic_state = XLOG_STATE_SYNCING; - if (!iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog); trace_xlog_iclog_syncing(iclog, _RET_IP_); @@ -873,7 +860,7 @@ xlog_force_iclog( iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); - return xlog_state_release_iclog(iclog->ic_log, iclog, 0); + return xlog_state_release_iclog(iclog->ic_log, iclog); } /* @@ -1373,7 +1360,7 @@ xlog_ioend_work( */ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } xlog_state_done_syncing(iclog); @@ -1912,7 +1899,7 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return; } if (is_vmalloc_addr(iclog->ic_data)) @@ -2411,7 +2398,7 @@ xlog_write_copy_finish( ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || xlog_is_shutdown(log)); release_iclog: - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; } @@ -2487,7 +2474,7 @@ xlog_write( xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } len = xlog_write_calc_vec_length(ticket, log_vector, optype); @@ -2628,7 +2615,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; @@ -3052,7 +3039,7 @@ restart: * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3821,9 +3808,10 @@ xlog_verify_iclog( #endif /* - * Perform a forced shutdown on the log. This should be called once and once - * only by the high level filesystem shutdown code to shut the log subsystem - * down cleanly. + * Perform a forced shutdown on the log. + * + * This can be called from low level log code to trigger a shutdown, or from the + * high level mount shutdown code when the mount shuts down. * * Our main objectives here are to make sure that: * a. if the shutdown was not due to a log IO error, flush the logs to @@ -3832,6 +3820,8 @@ xlog_verify_iclog( * parties to find out. Nothing new gets queued after this is done. * c. Tasks sleeping on log reservations, pinned objects and * other resources get woken up. + * d. The mount is also marked as shut down so that log triggered shutdowns + * still behave the same as if they called xfs_forced_shutdown(). * * Return true if the shutdown cause was a log IO error and we actually shut the * log down. @@ -3843,25 +3833,25 @@ xlog_force_shutdown( { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); - /* - * If this happens during log recovery then we aren't using the runtime - * log mechanisms yet so there's nothing to shut down. - */ - if (!log || xlog_in_recovery(log)) + if (!log) return false; - ASSERT(!xlog_is_shutdown(log)); - /* * Flush all the completed transactions to disk before marking the log * being shut down. We need to do this first as shutting down the log * before the force will prevent the log force from flushing the iclogs * to disk. * - * Re-entry due to a log IO error shutdown during the log force is - * prevented by the atomicity of higher level shutdown code. + * When we are in recovery, there are no transactions to flush, and + * we don't want to touch the log because we don't want to perturb the + * current head/tail for future recovery attempts. Hence we need to + * avoid a log force in this case. + * + * If we are shutting down due to a log IO error, then we must avoid + * trying to write the log as that may just result in more IO errors and + * an endless shutdown/force loop. */ - if (!log_error) + if (!log_error && !xlog_in_recovery(log)) xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* @@ -3878,12 +3868,25 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); - ASSERT(0); return false; } spin_unlock(&log->l_icloglock); /* + * If this log shutdown also sets the mount shutdown state, issue a + * shutdown warning message. + */ + if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR, +"Filesystem has been shut down due to log error (0x%x).", + shutdown_flags); + xfs_alert(log->l_mp, +"Please unmount the filesystem and rectify the problem(s)."); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); + } + + /* * We don't want anybody waiting for log reservations after this. That * means we have to wake up everybody queued up on reserveq as well as * writeq. In addition, we make sure in xlog_{re}grant_log_space that @@ -3903,8 +3906,12 @@ xlog_force_shutdown( wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); + + spin_lock(&log->l_icloglock); xlog_state_shutdown_callbacks(log); + spin_unlock(&log->l_icloglock); + wake_up_var(&log->l_opstate); return log_error; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 796e4464f809..ba57323bfdce 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -540,7 +540,7 @@ xlog_cil_insert_items( spin_unlock(&cil->xc_cil_lock); if (tp->t_ticket->t_curr_res < 0) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } static void @@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state( * The LSN we need to pass to the log items on transaction * commit is the LSN reported by the first log vector write, not * the commit lsn. If we use the commit record lsn then we can - * move the tail beyond the grant write head. + * move the grant write head beyond the tail LSN and overwrite + * it. */ ctx->start_lsn = lsn; wake_up_all(&cil->xc_start_wait); spin_unlock(&cil->xc_push_lock); + + /* + * Make sure the metadata we are about to overwrite in the log + * has been flushed to stable storage before this iclog is + * issued. + */ + spin_lock(&cil->xc_log->l_icloglock); + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + spin_unlock(&cil->xc_log->l_icloglock); return; } @@ -854,7 +864,7 @@ xlog_cil_write_commit_record( error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -888,10 +898,7 @@ xlog_cil_push_work( struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; - xfs_lsn_t preflush_tail_lsn; xfs_csn_t push_seq; - struct bio bio; - DECLARE_COMPLETION_ONSTACK(bdev_flush); bool push_commit_stable; new_ctx = xlog_cil_ctx_alloc(); @@ -962,23 +969,6 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock); /* - * The CIL is stable at this point - nothing new will be added to it - * because we hold the flush lock exclusively. Hence we can now issue - * a cache flush to ensure all the completed metadata in the journal we - * are about to overwrite is on stable storage. - * - * Because we are issuing this cache flush before we've written the - * tail lsn to the iclog, we can have metadata IO completions move the - * tail forwards between the completion of this flush and the iclog - * being written. In this case, we need to re-issue the cache flush - * before the iclog write. To detect whether the log tail moves, sample - * the tail LSN *before* we issue the flush. - */ - preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); - xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, - &bdev_flush); - - /* * Pull all the log vectors off the items in the CIL, and remove the * items from the CIL. We don't need the CIL lock here because it's only * needed on the transaction commit side which is currently locked out @@ -1054,12 +1044,6 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - /* - * Before we format and submit the first iclog, we have to ensure that - * the metadata writeback ordering cache flush is complete. - */ - wait_for_completion(&bdev_flush); - error = xlog_cil_write_chain(ctx, &lvhdr); if (error) goto out_abort_free_ticket; @@ -1118,7 +1102,7 @@ xlog_cil_push_work( if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); - xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ @@ -1139,7 +1123,7 @@ out_abort_free_ticket: return; } spin_lock(&log->l_icloglock); - xlog_state_release_iclog(log, ctx->commit_iclog, 0); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 23103d68423c..401cdc400980 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -484,6 +484,17 @@ xlog_is_shutdown(struct xlog *log) return test_bit(XLOG_IO_ERROR, &log->l_opstate); } +/* + * Wait until the xlog_force_shutdown() has marked the log as shut down + * so xlog_is_shutdown() will always return true. + */ +static inline void +xlog_shutdown_wait( + struct xlog *log) +{ + wait_var_event(&log->l_opstate, xlog_is_shutdown(log)); +} + /* common routines */ extern int xlog_recover( @@ -524,8 +535,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); -int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, - xfs_lsn_t log_tail_lsn); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); /* * When we crack an atomic LSN, we sample it first so that the value will not diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96c997ed2ec8..c4ad4296c540 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2485,7 +2485,7 @@ xlog_finish_defer_ops( error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); if (error) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -2519,21 +2519,22 @@ xlog_abort_defer_ops( xfs_defer_ops_capture_free(mp, dfc); } } + /* * When this is called, all of the log intent items which did not have - * corresponding log done items should be in the AIL. What we do now - * is update the data structures associated with each one. + * corresponding log done items should be in the AIL. What we do now is update + * the data structures associated with each one. * - * Since we process the log intent items in normal transactions, they - * will be removed at some point after the commit. This prevents us - * from just walking down the list processing each one. We'll use a - * flag in the intent item to skip those that we've already processed - * and use the AIL iteration mechanism's generation count to try to - * speed this up at least a bit. + * Since we process the log intent items in normal transactions, they will be + * removed at some point after the commit. This prevents us from just walking + * down the list processing each one. We'll use a flag in the intent item to + * skip those that we've already processed and use the AIL iteration mechanism's + * generation count to try to speed this up at least a bit. * - * When we start, we know that the intents are the only things in the - * AIL. As we process them, however, other items are added to the - * AIL. + * When we start, we know that the intents are the only things in the AIL. As we + * process them, however, other items are added to the AIL. Hence we know we + * have started recovery on all the pending intents when we find an non-intent + * item in the AIL. */ STATIC int xlog_recover_process_intents( @@ -2556,17 +2557,8 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } /* * We should never see a redo item with a LSN higher than @@ -2607,8 +2599,9 @@ err: } /* - * A cancel occurs when the mount has failed and we're bailing out. - * Release all pending log intent items so they don't pin the AIL. + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending log intent items that we haven't started recovery on so they don't + * pin the AIL. */ STATIC void xlog_recover_cancel_intents( @@ -2622,17 +2615,8 @@ xlog_recover_cancel_intents( spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } spin_unlock(&ailp->ail_lock); lip->li_ops->iop_release(lip); @@ -3470,7 +3454,7 @@ xlog_recover_finish( */ xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -3517,7 +3501,7 @@ xlog_recover_finish( * end of intents processing can be pushed through the CIL * and AIL. */ - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bed73e8002a5..c5f153c3693f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -21,6 +21,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" @@ -1146,7 +1147,7 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + set_aside = xfs_fdblocks_unavailable(mp); percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 00720a02e761..f6dc19de8322 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -479,6 +479,21 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +static inline uint64_t +xfs_fdblocks_unavailable( + struct xfs_mount *mp) +{ + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d84714e4e46a..54be9d64093e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -815,7 +815,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); + statp->f_bfree = max_t(int64_t, 0, + fdblocks - xfs_fdblocks_unavailable(mp)); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 917a69f0a6ff..0ac717aad380 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -836,6 +836,7 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; @@ -864,7 +865,13 @@ __xfs_trans_commit( if (!(tp->t_flags & XFS_TRANS_DIRTY)) goto out_unreserve; - if (xfs_is_shutdown(mp)) { + /* + * We must check against log shutdown here because we cannot abort log + * items and leave them dirty, inconsistent and unpinned in memory while + * the log is active. This leaves them open to being written back to + * disk, and that will lead to on-disk corruption. + */ + if (xlog_is_shutdown(log)) { error = -EIO; goto out_unreserve; } @@ -878,7 +885,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); + xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -905,10 +912,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - if (regrant && !xlog_is_shutdown(mp->m_log)) - xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + if (regrant && !xlog_is_shutdown(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); else - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } xfs_trans_free_items(tp, !!error); @@ -926,18 +933,27 @@ xfs_trans_commit( } /* - * Unlock all of the transaction's items and free the transaction. - * The transaction must not have modified any of its items, because - * there is no way to restore them to their previous state. + * Unlock all of the transaction's items and free the transaction. If the + * transaction is dirty, we must shut down the filesystem because there is no + * way to restore them to their previous state. * - * If the transaction has made a log reservation, make sure to release - * it as well. + * If the transaction has made a log reservation, make sure to release it as + * well. + * + * This is a high level function (equivalent to xfs_trans_commit()) and so can + * be called after the transaction has effectively been aborted due to the mount + * being shut down. However, if the mount has not been shut down and the + * transaction is dirty we will shut the mount down and, in doing so, that + * guarantees that the log is shut down, too. Hence we don't need to be as + * careful with shutdown state and dirty items here as we need to be in + * xfs_trans_commit(). */ void xfs_trans_cancel( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); trace_xfs_trans_cancel(tp, _RET_IP_); @@ -955,16 +971,18 @@ xfs_trans_cancel( } /* - * See if the caller is relying on us to shut down the - * filesystem. This happens in paths where we detect - * corruption and decide to give up. + * See if the caller is relying on us to shut down the filesystem. We + * only want an error report if there isn't already a shutdown in + * progress, so we only need to check against the mount shutdown state + * here. */ if (dirty && !xfs_is_shutdown(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!dirty && !xfs_is_shutdown(mp)) { + /* Log items need to be consistent until the log is shut down. */ + if (!dirty && !xlog_is_shutdown(log)) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) @@ -975,7 +993,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index c2ccb98c7bcd..d3a97a028560 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -873,17 +873,17 @@ xfs_trans_ail_delete( int shutdown_type) { struct xfs_ail *ailp = lip->li_ailp; - struct xfs_mount *mp = ailp->ail_log->l_mp; + struct xlog *log = ailp->ail_log; xfs_lsn_t tail_lsn; spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !xlog_is_shutdown(ailp->ail_log)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + if (shutdown_type && !xlog_is_shutdown(log)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, shutdown_type); + xlog_force_shutdown(log, shutdown_type); } return; } |