81 files changed, 1315 insertions, 1089 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 16bb9a328678..da031b93e182 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -3,10 +3,10 @@ * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ -#include <linux/sched/mm.h> +#include "xfs.h" #include <linux/backing-dev.h> -#include "kmem.h" #include "xfs_message.h" +#include "xfs_trace.h" void * kmem_alloc(size_t size, xfs_km_flags_t flags) @@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_alloc(size, flags, _RET_IP_); + do { ptr = kmalloc(size, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, @@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } while (1); } -void * -kmem_alloc_large(size_t size, xfs_km_flags_t flags) + +/* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence + * we need to tell memory reclaim that we are in such a context via + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here + * and potentially deadlocking. + */ +static void * +__kmem_vmalloc(size_t size, xfs_km_flags_t flags) { unsigned nofs_flag = 0; void *ptr; - gfp_t lflags; - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) - return ptr; + gfp_t lflags = kmem_flags_convert(flags); - /* - * __vmalloc() will allocate data pages and auxillary structures (e.g. - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context - * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering - * the filesystem here and potentially deadlocking. - */ if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags, PAGE_KERNEL); if (flags & KM_NOFS) @@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; } +/* + * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned + * to the @align_mask. We only guarantee alignment up to page size, we'll clamp + * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE + * aligned region. 
+ */ +void * +kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_io(size, flags, _RET_IP_); + + if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) + align_mask = PAGE_SIZE - 1; + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) { + if (!((uintptr_t)ptr & align_mask)) + return ptr; + kfree(ptr); + } + return __kmem_vmalloc(size, flags); +} + +void * +kmem_alloc_large(size_t size, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_large(size, flags, _RET_IP_); + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + return __kmem_vmalloc(size, flags); +} + void * kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { @@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_realloc(newsize, flags, _RET_IP_); + do { ptr = krealloc(old, newsize, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, @@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_); do { ptr = kmem_cache_alloc(zone, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + if (ptr || (flags & KM_MAYFAIL)) return ptr; if (!(++retries % 100)) xfs_err(NULL, diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 267655acd426..8170d95cf930 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -16,8 +16,6 @@ */ typedef unsigned __bitwise xfs_km_flags_t; -#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) -#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) #define KM_ZERO ((__force xfs_km_flags_t)0x0010u) @@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); + BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO)); - if (flags & KM_NOSLEEP) { - lflags = GFP_ATOMIC | __GFP_NOWARN; - } else { - lflags = GFP_KERNEL | __GFP_NOWARN; - if (flags & KM_NOFS) - lflags &= ~__GFP_FS; - } + lflags = GFP_KERNEL | __GFP_NOWARN; + if (flags & KM_NOFS) + lflags &= ~__GFP_FS; /* * Default page/slab allocator behavior is to retry for ever @@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 372ad55631fc..533b04aaf6f6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2205,7 +2205,7 @@ xfs_defer_agfl_block( ASSERT(xfs_bmap_free_item_zone != NULL); ASSERT(oinfo != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; new->xefi_oinfo = *oinfo; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d6ed5d2c07c2..58fa85cec325 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -81,10 +81,9 @@ typedef struct xfs_alloc_arg { /* * Defines for datatype */ -#define XFS_ALLOC_USERDATA (1 << 
0)/* allocation is for user data*/ -#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ static inline bool xfs_alloc_is_userdata(int datatype) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index d48fcf11cc35..510ca6974604 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -97,7 +97,10 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -/* Retrieve an extended attribute and its value. Must have ilock. */ +/* + * Retrieve an extended attribute and its value. Must have ilock. + * Returns 0 on successful retrieval, otherwise an error. + */ int xfs_attr_get_ilocked( struct xfs_inode *ip, @@ -115,12 +118,28 @@ xfs_attr_get_ilocked( return xfs_attr_node_get(args); } -/* Retrieve an extended attribute by name, and its value. */ +/* + * Retrieve an extended attribute by name, and its value if requested. + * + * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, + * just an indication whether the attribute exists and the size of the value if + * it exists. The size is returned in @valuelenp, + * + * If the attribute is found, but exceeds the size limit set by the caller in + * @valuelenp, return -ERANGE with the size of the attribute that was found in + * @valuelenp. + * + * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after + * existence of the attribute has been determined. On success, return that + * buffer to the caller and leave them to free it. On failure, free any + * allocated buffer and ensure the buffer pointer returned to the caller is + * null. + */ int xfs_attr_get( struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, + unsigned char **value, int *valuelenp, int flags) { @@ -128,6 +147,8 @@ xfs_attr_get( uint lock_mode; int error; + ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); + XFS_STATS_INC(ip->i_mount, xs_attr_get); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -137,17 +158,29 @@ xfs_attr_get( if (error) return error; - args.value = value; - args.valuelen = *valuelenp; /* Entirely possible to look up a name which doesn't exist */ args.op_flags = XFS_DA_OP_OKNOENT; + if (flags & ATTR_ALLOC) + args.op_flags |= XFS_DA_OP_ALLOCVAL; + else + args.value = *value; + args.valuelen = *valuelenp; lock_mode = xfs_ilock_attr_map_shared(ip); error = xfs_attr_get_ilocked(ip, &args); xfs_iunlock(ip, lock_mode); - *valuelenp = args.valuelen; - return error == -EEXIST ? 0 : error; + + /* on error, we have to clean up allocated value buffers */ + if (error) { + if (flags & ATTR_ALLOC) { + kmem_free(args.value); + *value = NULL; + } + return error; + } + *value = args.value; + return 0; } /* @@ -768,6 +801,8 @@ xfs_attr_leaf_removename( * * This leaf block cannot have a "remote" value, we only call this routine * if bmap_one_block() says there is only one block (ie: no remote blks). + * + * Returns 0 on successful retrieval, otherwise an error. 
*/ STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) @@ -789,9 +824,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args) } error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); - if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { - error = xfs_attr_rmtval_get(args); - } return error; } @@ -1268,11 +1300,13 @@ xfs_attr_refillstate(xfs_da_state_t *state) } /* - * Look up a filename in a node attribute list. + * Retrieve the attribute data from a node attribute list. * * This routine gets called for any attribute fork that has more than one * block, ie: both true Btree attr lists and for single-leaf-blocks with * "remote" values taking up more blocks. + * + * Returns 0 on successful retrieval, otherwise an error. */ STATIC int xfs_attr_node_get(xfs_da_args_t *args) @@ -1294,24 +1328,21 @@ xfs_attr_node_get(xfs_da_args_t *args) error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; - } else if (retval == -EEXIST) { - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->bp != NULL); - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - - /* - * Get the value, local or "remote" - */ - retval = xfs_attr3_leaf_getvalue(blk->bp, args); - if (!retval && (args->rmtblkno > 0) - && !(args->flags & ATTR_KERNOVAL)) { - retval = xfs_attr_rmtval_get(args); - } + goto out_release; } + if (retval != -EEXIST) + goto out_release; + + /* + * Get the value, local or "remote" + */ + blk = &state->path.blk[state->path.active - 1]; + retval = xfs_attr3_leaf_getvalue(blk->bp, args); /* * If not in a transaction, we have to release all the buffers. */ +out_release: for (i = 0; i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index ff28ebf3b635..94badfa1743e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -37,6 +37,7 @@ struct xfs_attr_list_context; #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ +#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ @@ -47,7 +48,8 @@ struct xfs_attr_list_context; { ATTR_REPLACE, "REPLACE" }, \ { ATTR_KERNOTIME, "KERNOTIME" }, \ { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" } + { ATTR_INCOMPLETE, "INCOMPLETE" }, \ + { ATTR_ALLOC, "ALLOC" } /* * The maximum size (into the kernel or returned from the kernel) of an @@ -143,7 +145,7 @@ int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, int *valuelenp, int flags); + unsigned char **value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, unsigned char *value, int valuelen, int flags); int xfs_attr_set_args(struct xfs_da_args *args); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 70eb941d02e4..b9f019603d0b 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -393,6 +393,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); } +static int +xfs_attr_copy_value( + struct xfs_da_args *args, + unsigned char *value, + int valuelen) +{ + /* + * 
No copy if all we have to do is get the length + */ + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return 0; + } + + /* + * No copy if the length of the existing buffer is too small + */ + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return -ERANGE; + } + + if (args->op_flags & XFS_DA_OP_ALLOCVAL) { + args->value = kmem_alloc_large(valuelen, 0); + if (!args->value) + return -ENOMEM; + } + args->valuelen = valuelen; + + /* remote block xattr requires IO for copy-in */ + if (args->rmtblkno) + return xfs_attr_rmtval_get(args); + + /* + * This is to prevent a GCC warning because the remote xattr case + * doesn't have a value to pass in. In that case, we never reach here, + * but GCC can't work that out and so throws a "passing NULL to + * memcpy" warning. + */ + if (!value) + return -EINVAL; + memcpy(args->value, value, valuelen); + return 0; +} /*======================================================================== * External routines when attribute fork size < XFS_LITINO(mp). @@ -720,15 +764,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) } /* - * Look up a name in a shortform attribute list structure. + * Retreive the attribute value and length. + * + * If ATTR_KERNOVAL is specified, only the length needs to be returned. + * Unlike a lookup, we only return an error if the attribute does not + * exist or we can't retrieve the value. */ -/*ARGSUSED*/ int -xfs_attr_shortform_getvalue(xfs_da_args_t *args) +xfs_attr_shortform_getvalue( + struct xfs_da_args *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int i; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int i; ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; @@ -741,18 +789,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) continue; if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = sfe->valuelen; - return -EEXIST; - } - if (args->valuelen < sfe->valuelen) { - args->valuelen = sfe->valuelen; - return -ERANGE; - } - args->valuelen = sfe->valuelen; - memcpy(args->value, &sfe->nameval[args->namelen], - args->valuelen); - return -EEXIST; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } return -ENOATTR; } @@ -782,7 +820,7 @@ xfs_attr_shortform_to_leaf( ifp = dp->i_afp; sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); - tmpbuffer = kmem_alloc(size, KM_SLEEP); + tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); sf = (xfs_attr_shortform_t *)tmpbuffer; @@ -985,7 +1023,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, 0); if (!tmpbuffer) return -ENOMEM; @@ -1448,7 +1486,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, 0); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; @@ -2167,7 +2205,7 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); + tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); /* * Copy the header into the temp leaf so that all the stuff @@ -2350,6 +2388,10 @@ 
xfs_attr3_leaf_lookup_int( /* * Get the value associated with an attribute name from a leaf attribute * list structure. + * + * If ATTR_KERNOVAL is specified, only the length needs to be returned. + * Unlike a lookup, we only return an error if the attribute does not + * exist or we can't retrieve the value. */ int xfs_attr3_leaf_getvalue( @@ -2361,7 +2403,6 @@ xfs_attr3_leaf_getvalue( struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_local *name_loc; struct xfs_attr_leaf_name_remote *name_rmt; - int valuelen; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); @@ -2373,36 +2414,19 @@ xfs_attr3_leaf_getvalue( name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); - valuelen = be16_to_cpu(name_loc->valuelen); - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; - return 0; - } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; - return -ERANGE; - } - args->valuelen = valuelen; - memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); - } else { - name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); - ASSERT(name_rmt->namelen == args->namelen); - ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); - args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - args->rmtvaluelen); - if (args->flags & ATTR_KERNOVAL) { - args->valuelen = args->rmtvaluelen; - return 0; - } - if (args->valuelen < args->rmtvaluelen) { - args->valuelen = args->rmtvaluelen; - return -ERANGE; - } - args->valuelen = args->rmtvaluelen; - } - return 0; + return xfs_attr_copy_value(args, + &name_loc->nameval[args->namelen], + be16_to_cpu(name_loc->valuelen)); + } + + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + args->rmtvaluelen); + return xfs_attr_copy_value(args, NULL, args->rmtvaluelen); } /*======================================================================== diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4eb30d357045..3e39b7d40f25 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -358,6 +358,8 @@ xfs_attr_rmtval_copyin( /* * Read the value associated with an attribute from the out-of-line buffer * that we stored it in. + * + * Returns 0 on successful retrieval, otherwise an error. 
*/ int xfs_attr_rmtval_get( diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 07aad70f3931..054b4ce30033 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -553,7 +553,7 @@ __xfs_bmap_add_free( #endif ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); new->xefi_startblock = bno; new->xefi_blockcount = (xfs_extlen_t)len; if (oinfo) @@ -1099,7 +1099,7 @@ xfs_bmap_add_attrfork( if (error) goto trans_cancel; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; switch (ip->i_d.di_format) { @@ -1985,11 +1985,8 @@ xfs_bmap_add_extent_delay_real( } /* add reverse mapping unless caller opted out */ - if (!(bma->flags & XFS_BMAPI_NORMAP)) { - error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); - if (error) - goto done; - } + if (!(bma->flags & XFS_BMAPI_NORMAP)) + xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -2471,9 +2468,7 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); - if (error) - goto done; + xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -2832,11 +2827,8 @@ xfs_bmap_add_extent_hole_real( } /* add reverse mapping unless caller opted out */ - if (!(flags & XFS_BMAPI_NORMAP)) { - error = xfs_rmap_map_extent(tp, ip, whichfork, new); - if (error) - goto done; - } + if (!(flags & XFS_BMAPI_NORMAP)) + xfs_rmap_map_extent(tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -4050,12 +4042,8 @@ xfs_bmapi_allocate( */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { - if (bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; - else - bma->datatype |= XFS_ALLOC_USERDATA; - } + if (whichfork == XFS_DATA_FORK && bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; if (bma->flags & XFS_BMAPI_ZERO) bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } @@ -4401,12 +4389,9 @@ xfs_bmapi_write( * If this is a CoW allocation, record the data in * the refcount btree for orphan recovery. */ - if (whichfork == XFS_COW_FORK) { - error = xfs_refcount_alloc_cow_extent(tp, - bma.blkno, bma.length); - if (error) - goto error0; - } + if (whichfork == XFS_COW_FORK) + xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); } /* Deal with the allocated space we found. 
*/ @@ -4530,7 +4515,7 @@ xfs_bmapi_convert_delalloc( if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) goto out_finish; error = -EFSCORRUPTED; - if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) goto out_finish; XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); @@ -4540,12 +4525,8 @@ xfs_bmapi_convert_delalloc( *imap = bma.got; *seq = READ_ONCE(ifp->if_seq); - if (whichfork == XFS_COW_FORK) { - error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); - if (error) - goto out_finish; - } + if (whichfork == XFS_COW_FORK) + xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -5149,18 +5130,14 @@ xfs_bmap_del_extent_real( } /* remove reverse mapping */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, del); - if (error) - goto done; + xfs_rmap_unmap_extent(tp, ip, whichfork, del); /* * If we need to, add to list of extents to delete. */ if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - error = xfs_refcount_decrease_extent(tp, del); - if (error) - goto done; + xfs_refcount_decrease_extent(tp, del); } else { __xfs_bmap_add_free(tp, del->br_startblock, del->br_blockcount, NULL, @@ -5651,12 +5628,11 @@ done: &new); /* update reverse mapping. rmap functions merge the rmaps for us */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, got); - if (error) - return error; + xfs_rmap_unmap_extent(tp, ip, whichfork, got); memcpy(&new, got, sizeof(new)); new.br_startoff = left->br_startoff + left->br_blockcount; - return xfs_rmap_map_extent(tp, ip, whichfork, &new); + xfs_rmap_map_extent(tp, ip, whichfork, &new); + return 0; } static int @@ -5695,10 +5671,9 @@ xfs_bmap_shift_update_extent( got); /* update reverse mapping */ - error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); - if (error) - return error; - return xfs_rmap_map_extent(tp, ip, whichfork, got); + xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); + xfs_rmap_map_extent(tp, ip, whichfork, got); + return 0; } int @@ -6094,7 +6069,7 @@ __xfs_bmap_add( bmap->br_blockcount, bmap->br_state); - bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; @@ -6106,29 +6081,29 @@ __xfs_bmap_add( } /* Map an extent into a file. */ -int +void xfs_bmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *PREV) { if (!xfs_bmap_is_update_needed(PREV)) - return 0; + return; - return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); } /* Unmap an extent out of a file. */ -int +void xfs_bmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *PREV) { if (!xfs_bmap_is_update_needed(PREV)) - return 0; + return; - return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); } /* diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8f597f9abdbe..5bb446d80542 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -171,6 +171,13 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) !isnullstartblock(irec->br_startblock); } +/* + * Check the mapping for obviously garbage allocations that could trash the + * filesystem immediately. 
+ */ +#define xfs_valid_startblock(ip, startblock) \ + ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) + void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); @@ -254,9 +261,9 @@ int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t *blockcount, xfs_exntst_t state); -int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); static inline int xfs_bmap_fork_to_state(int whichfork) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fbb18ba5d905..ffe608d2a2d9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys( union xfs_btree_key *k1, union xfs_btree_key *k2) { - return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - - be64_to_cpu(k2->bmbt.br_startoff); + uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + + /* + * Note: This routine previously casted a and b to int64 and subtracted + * them to generate a result. This lead to problems if b was the + * "maximum" key value (all ones) being signed incorrectly, hence this + * somewhat less efficient version. + */ + if (a > b) + return 1; + if (b > a) + return -1; + return 0; } static xfs_failaddr_t diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f1048efa4268..71de937f9e64 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4466,8 +4466,6 @@ xfs_btree_lblock_verify( * btree block * * @bp: buffer containing the btree block - * @max_recs: pointer to the m_*_mxr max records field in the xfs mount - * @pag_max_level: pointer to the per-ag max level field */ xfs_failaddr_t xfs_btree_sblock_v5hdr_verify( @@ -4600,7 +4598,7 @@ xfs_btree_simple_query_range( /* Callback */ error = fn(cur, recp, priv); - if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error) break; advloop: @@ -4702,8 +4700,7 @@ pop_up: */ if (ldiff >= 0 && hdiff >= 0) { error = fn(cur, recp, priv); - if (error < 0 || - error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error) break; } else if (hdiff < 0) { /* Record is larger than high key; pop. */ @@ -4774,8 +4771,7 @@ out: * Query a btree for all records overlapping a given interval of keys. The * supplied function will be called with each record found; return one of the * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error - * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a - * negative error code. + * code. This function returns -ECANCELED, zero, or a negative error code. */ int xfs_btree_query_range( @@ -4891,7 +4887,7 @@ xfs_btree_has_record_helper( union xfs_btree_rec *rec, void *priv) { - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* Is there a record covering a given range of keys? 
*/ @@ -4906,7 +4902,7 @@ xfs_btree_has_record( error = xfs_btree_query_range(cur, low, high, &xfs_btree_has_record_helper, NULL); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + if (error == -ECANCELED) { *exists = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index fa3cd8ab9aba..ced1e65d1483 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -464,9 +464,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); -/* return codes */ -#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ -#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ +/* + * Return codes for the query range iterator function are 0 to continue + * iterating, and non-zero to stop iterating. Any non-zero value will be + * passed up to the _query_range caller. The special value -ECANCELED can be + * used to stop iteration, because _query_range never generates that error + * code on its own. + */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 0bf56e94bfe9..4fd1223c1bd5 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2098,7 +2098,7 @@ xfs_da_grow_inode_int( * If we didn't get it and the block might work if fragmented, * try without the CONTIG flag. Loop until we get it all. */ - mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); @@ -2480,7 +2480,7 @@ xfs_buf_map_from_irec( if (nirecs > 1) { map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), - KM_SLEEP | KM_NOFS); + KM_NOFS); if (!map) return -ENOMEM; *mapp = map; @@ -2539,7 +2539,7 @@ xfs_dabuf_map( */ if (nfsb != 1) irecs = kmem_zalloc(sizeof(irec) * nfsb, - KM_SLEEP | KM_NOFS); + KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 84dd865b6c3d..ae0bbd20d9ca 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -81,13 +81,15 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ +#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ { XFS_DA_OP_RENAME, "RENAME" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ - { XFS_DA_OP_CILOOKUP, "CILOOKUP" } + { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } /* * Storage for holding state during Btree searches and split/join ops. 
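The xfs_bmbt_diff_two_keys() hunk above replaces a signed subtraction with an explicit three-way comparison, because subtracting two 64-bit startoff keys overflows when one of them is the all-ones "maximum" key. A minimal standalone demonstration of the failure mode (plain C, not kernel code; the names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Old approach: derive the ordering from a subtraction. */
static int cmp_subtract(uint64_t a, uint64_t b)
{
	int64_t d = (int64_t)(a - b);

	return d > 0 ? 1 : (d < 0 ? -1 : 0);
}

/* Fixed approach: compare explicitly, which cannot overflow. */
static int cmp_threeway(uint64_t a, uint64_t b)
{
	if (a > b)
		return 1;
	if (b > a)
		return -1;
	return 0;
}

int main(void)
{
	uint64_t a = 0, b = ~0ULL;	/* b plays the all-ones maximum key */

	/* a - b wraps to 1, so subtraction claims a > b; in truth a < b. */
	printf("subtract says %d, three-way says %d\n",
	       cmp_subtract(a, b), cmp_threeway(a, b));
	return 0;
}

The same reasoning applies to any comparator handed to generic btree code: returning the raw difference is only safe when the key space is known to fit in 63 bits.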
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index eb2be2a6a25a..22557527cfdb 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -517,7 +517,7 @@ xfs_defer_add( } if (!dfp) { dfp = kmem_alloc(sizeof(struct xfs_defer_pending), - KM_SLEEP | KM_NOFS); + KM_NOFS); dfp->dfp_type = type; dfp->dfp_intent = NULL; dfp->dfp_done = NULL; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 67840723edbb..867c5dee0751 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -110,9 +110,9 @@ xfs_da_mount( nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { kmem_free(mp->m_dir_geo); kmem_free(mp->m_attr_geo); @@ -217,7 +217,7 @@ xfs_dir_init( if (error) return error; - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -254,7 +254,7 @@ xfs_dir_createname( XFS_STATS_INC(dp->i_mount, xs_dir_create); } - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -353,7 +353,7 @@ xfs_dir_lookup( * lockdep Doing this avoids having to add a bunch of lockdep class * annotations into the reclaim path for the ilock. */ - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; @@ -422,7 +422,7 @@ xfs_dir_removename( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; @@ -483,7 +483,7 @@ xfs_dir_replace( if (rval) return rval; - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args = kmem_zalloc(sizeof(*args), KM_NOFS); if (!args) return -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index a6fb0cc2085e..9595ced393dc 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1092,7 +1092,7 @@ xfs_dir2_sf_to_block( * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ - sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); + sfp = kmem_alloc(ifp->if_bytes, 0); memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 1fc44efc344d..705c4f562758 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -32,8 +32,6 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int index, xfs_da_state_blk_t *dblk, int *rval); -static int xfs_dir2_node_addname_int(xfs_da_args_t *args, - xfs_da_state_blk_t *fblk); /* * Check internal consistency of a leafn block. @@ -1611,113 +1609,152 @@ xfs_dir2_leafn_unbalance( } /* - * Top-level node form directory addname routine. + * Add a new data block to the directory at the free space index that the caller + * has specified. 
*/ -int /* error */ -xfs_dir2_node_addname( - xfs_da_args_t *args) /* operation arguments */ +static int +xfs_dir2_node_add_datablk( + struct xfs_da_args *args, + struct xfs_da_state_blk *fblk, + xfs_dir2_db_t *dbno, + struct xfs_buf **dbpp, + struct xfs_buf **fbpp, + int *findex) { - xfs_da_state_blk_t *blk; /* leaf block for insert */ - int error; /* error return value */ - int rval; /* sub-return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; + struct xfs_dir2_data_hdr *hdr; + struct xfs_dir2_free *free = NULL; + xfs_dir2_db_t fbno; + struct xfs_buf *fbp; + struct xfs_buf *dbp; + __be16 *bests = NULL; + int error; - trace_xfs_dir2_node_addname(args); + /* Not allowed to allocate, return failure. */ + if (args->total == 0) + return -ENOSPC; + + /* Allocate and initialize the new data block. */ + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno); + if (error) + return error; + error = xfs_dir3_data_init(args, *dbno, &dbp); + if (error) + return error; /* - * Allocate and initialize the state (btree cursor). - */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; - /* - * Look up the name. We're not supposed to find it, but - * this gives us the insertion point. + * Get the freespace block corresponding to the data block + * that was just allocated. */ - error = xfs_da3_node_lookup_int(state, &rval); + fbno = dp->d_ops->db_to_fdb(args->geo, *dbno); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) - rval = error; - if (rval != -ENOENT) { - goto done; - } + return error; + /* - * Add the data entry to a data block. - * Extravalid is set to a freeblock found by lookup. + * If there wasn't a freespace block, the read will + * return a NULL fbp. Allocate and initialize a new one. */ - rval = xfs_dir2_node_addname_int(args, - state->extravalid ? &state->extrablk : NULL); - if (rval) { - goto done; + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno); + if (error) + return error; + + if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) { + xfs_alert(mp, +"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld", + __func__, (unsigned long long)dp->i_ino, + (long long)dp->d_ops->db_to_fdb(args->geo, *dbno), + (long long)*dbno, (long long)fbno); + if (fblk) { + xfs_alert(mp, + " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", + fblk, (unsigned long long)fblk->blkno, + fblk->index, fblk->magic); + } else { + xfs_alert(mp, " ... fblk is NULL"); + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + /* Get a buffer for the new block. */ + error = xfs_dir3_free_get_buf(args, fbno, &fbp); + if (error) + return error; + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* Remember the first slot as our empty slot. */ + freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); + } else { + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); } - blk = &state->path.blk[state->path.active - 1]; - ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + + /* Set the freespace block index from the data block number. 
*/ + *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno); + + /* Extend the freespace table if the new data block is off the end. */ + if (*findex >= freehdr.nvalid) { + ASSERT(*findex < dp->d_ops->free_max_bests(args->geo)); + freehdr.nvalid = *findex + 1; + bests[*findex] = cpu_to_be16(NULLDATAOFF); + } + /* - * Add the new leaf entry. + * If this entry was for an empty data block (this should always be + * true) then update the header. */ - rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); - if (rval == 0) { - /* - * It worked, fix the hash values up the btree. - */ - if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) - xfs_da3_fixhashpath(state, &state->path); - } else { - /* - * It didn't work, we need to split the leaf block. - */ - if (args->total == 0) { - ASSERT(rval == -ENOSPC); - goto done; - } - /* - * Split the leaf block and insert the new entry. - */ - rval = xfs_da3_split(state); + if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_header(args, fbp); } -done: - xfs_da_state_free(state); - return rval; + + /* Update the freespace value for the new block in the table. */ + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bests[*findex] = bf[0].length; + + *dbpp = dbp; + *fbpp = fbp; + return 0; } -/* - * Add the data entry for a node-format directory name addition. - * The leaf entry is added in xfs_dir2_leafn_add. - * We may enter with a freespace block that the lookup found. - */ -static int /* error */ -xfs_dir2_node_addname_int( - xfs_da_args_t *args, /* operation arguments */ - xfs_da_state_blk_t *fblk) /* optional freespace block */ +static int +xfs_dir2_node_find_freeblk( + struct xfs_da_args *args, + struct xfs_da_state_blk *fblk, + xfs_dir2_db_t *dbnop, + struct xfs_buf **fbpp, + int *findexp, + int length) { - xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dir2_db_t dbno; /* data block number */ - struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data entry pointer */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ - int error; /* error return value */ - xfs_dir2_db_t fbno; /* freespace block number */ - struct xfs_buf *fbp; /* freespace buffer */ - int findex; /* freespace entry index */ - xfs_dir2_free_t *free=NULL; /* freespace block structure */ - xfs_dir2_db_t ifbno; /* initial freespace block no */ - xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ - int length; /* length of the new entry */ - int logfree; /* need to log free entry */ - xfs_mount_t *mp; /* filesystem mount point */ - int needlog; /* need to log data header */ - int needscan; /* need to rescan data frees */ - __be16 *tagp; /* data entry tag pointer */ - xfs_trans_t *tp; /* transaction pointer */ - __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; - struct xfs_dir2_data_free *bf; - xfs_dir2_data_aoff_t aoff; + struct xfs_dir2_free *free = NULL; + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_buf *fbp = NULL; + xfs_dir2_db_t firstfbno; + xfs_dir2_db_t lastfbno; + xfs_dir2_db_t ifbno = -1; + xfs_dir2_db_t dbno = -1; + xfs_dir2_db_t fbno; + xfs_fileoff_t fo; + __be16 *bests = NULL; + int findex = 0; + int error; - dp = args->dp; - mp = dp->i_mount; - tp = args->trans; - length = dp->d_ops->data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. 
This is the freespace @@ -1725,288 +1762,157 @@ xfs_dir2_node_addname_int( */ if (fblk) { fbp = fblk->bp; - /* - * Remember initial freespace block number. - */ - ifbno = fblk->blkno; free = fbp->b_addr; findex = fblk->index; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - - /* - * This means the free entry showed that the data block had - * space for our entry, so we remembered it. - * Use that data block. - */ if (findex >= 0) { + /* caller already found the freespace for us. */ + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + ASSERT(findex < freehdr.nvalid); ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); ASSERT(be16_to_cpu(bests[findex]) >= length); dbno = freehdr.firstdb + findex; - } else { - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - dbno = -1; - findex = 0; + goto found_block; } - } else { + /* - * Didn't come in with a freespace block, so no data block. + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. */ - ifbno = dbno = -1; + ifbno = fblk->blkno; + xfs_trans_brelse(tp, fbp); fbp = NULL; - findex = 0; + fblk->bp = NULL; } /* - * If we don't have a data block yet, we're going to scan the - * freespace blocks looking for one. Figure out what the - * highest freespace block number is. - */ - if (dbno == -1) { - xfs_fileoff_t fo; /* freespace block number */ - - if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) - return error; - lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); - fbno = ifbno; - } - /* - * While we haven't identified a data block, search the freeblock - * data for a good data block. If we find a null freeblock entry, - * indicating a hole in the data blocks, remember that. + * If we don't have a data block yet, we're going to scan the freespace + * data for a data block with enough free space in it. */ - while (dbno == -1) { - /* - * If we don't have a freeblock in hand, get the next one. - */ - if (fbp == NULL) { - /* - * Happens the first time through unless lookup gave - * us a freespace block to start with. - */ - if (++fbno == 0) - fbno = xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET); - /* - * If it's ifbno we already looked at it. - */ - if (fbno == ifbno) - fbno++; - /* - * If it's off the end we're done. - */ - if (fbno >= lastfbno) - break; - /* - * Read the block. There can be holes in the - * freespace blocks, so this might not succeed. - * This should be really rare, so there's no reason - * to avoid it. - */ - error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fbno), - &fbp); - if (error) - return error; - if (!fbp) - continue; - free = fbp->b_addr; - findex = 0; - } - /* - * Look at the current free entry. Is it good enough? - * - * The bests initialisation should be where the bufer is read in - * the above branch. But gcc is too stupid to realise that bests - * and the freehdr are actually initialised if they are placed - * there, so we have to do it here to avoid warnings. Blech. - */ - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - if (be16_to_cpu(bests[findex]) != NULLDATAOFF && - be16_to_cpu(bests[findex]) >= length) - dbno = freehdr.firstdb + findex; - else { - /* - * Are we done with the freeblock? - */ - if (++findex == freehdr.nvalid) { - /* - * Drop the block. 
- */ - xfs_trans_brelse(tp, fbp); - fbp = NULL; - if (fblk && fblk->bp) - fblk->bp = NULL; - } - } - } - /* - * If we don't have a data block, we need to allocate one and make - * the freespace entries refer to it. - */ - if (unlikely(dbno == -1)) { - /* - * Not allowed to allocate, return failure. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) - return -ENOSPC; - - /* - * Allocate and initialize the new data block. - */ - if (unlikely((error = xfs_dir2_grow_inode(args, - XFS_DIR2_DATA_SPACE, - &dbno)) || - (error = xfs_dir3_data_init(args, dbno, &dbp)))) - return error; + error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK); + if (error) + return error; + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); + firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET); - /* - * If (somehow) we have a freespace block, get rid of it. - */ - if (fbp) - xfs_trans_brelse(tp, fbp); - if (fblk && fblk->bp) - fblk->bp = NULL; + for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) { + /* If it's ifbno we already looked at it. */ + if (fbno == ifbno) + continue; /* - * Get the freespace block corresponding to the data block - * that was just allocated. + * Read the block. There can be holes in the freespace blocks, + * so this might not succeed. This should be really rare, so + * there's no reason to avoid it. */ - fbno = dp->d_ops->db_to_fdb(args->geo, dbno); error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; + if (!fbp) + continue; - /* - * If there wasn't a freespace block, the read will - * return a NULL fbp. Allocate and initialize a new one. - */ - if (!fbp) { - error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno); - if (error) - return error; + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); - if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { - xfs_alert(mp, -"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", - __func__, (unsigned long long)dp->i_ino, - (long long)dp->d_ops->db_to_fdb( - args->geo, dbno), - (long long)dbno, (long long)fbno, - (unsigned long long)ifbno, lastfbno); - if (fblk) { - xfs_alert(mp, - " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", - fblk, - (unsigned long long)fblk->blkno, - fblk->index, - fblk->magic); - } else { - xfs_alert(mp, " ... fblk is NULL"); - } - XFS_ERROR_REPORT("xfs_dir2_node_addname_int", - XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; + /* Scan the free entry array for a large enough free space. */ + for (findex = freehdr.nvalid - 1; findex >= 0; findex--) { + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) { + dbno = freehdr.firstdb + findex; + goto found_block; } - - /* - * Get a buffer for the new block. - */ - error = xfs_dir3_free_get_buf(args, fbno, &fbp); - if (error) - return error; - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - - /* - * Remember the first slot as our empty slot. - */ - freehdr.firstdb = - (fbno - xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET)) * - dp->d_ops->free_max_bests(args->geo); - } else { - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); } - /* - * Set the freespace block index from the data block number. 
- */ - findex = dp->d_ops->db_to_fdindex(args->geo, dbno); - /* - * If it's after the end of the current entries in the - * freespace block, extend that table. - */ - if (findex >= freehdr.nvalid) { - ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); - freehdr.nvalid = findex + 1; - /* - * Tag new entry so nused will go up. - */ - bests[findex] = cpu_to_be16(NULLDATAOFF); - } - /* - * If this entry was for an empty data block - * (this should always be true) then update the header. - */ - if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { - freehdr.nused++; - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_header(args, fbp); - } - /* - * Update the real value in the table. - * We haven't allocated the data entry yet so this will - * change again. - */ - hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); - bests[findex] = bf[0].length; - logfree = 1; + /* Didn't find free space, go on to next free block */ + xfs_trans_brelse(tp, fbp); } + +found_block: + *dbnop = dbno; + *fbpp = fbp; + *findexp = findex; + return 0; +} + + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. + */ +static int +xfs_dir2_node_addname_int( + struct xfs_da_args *args, /* operation arguments */ + struct xfs_da_state_blk *fblk) /* optional freespace block */ +{ + struct xfs_dir2_data_unused *dup; /* data unused entry pointer */ + struct xfs_dir2_data_entry *dep; /* data entry pointer */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_free *free = NULL; /* freespace block structure */ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_buf *dbp; /* data block buffer */ + struct xfs_buf *fbp; /* freespace buffer */ + xfs_dir2_data_aoff_t aoff; + xfs_dir2_db_t dbno; /* data block number */ + int error; /* error return value */ + int findex; /* freespace entry index */ + int length; /* length of the new entry */ + int logfree = 0; /* need to log free entry */ + int needlog = 0; /* need to log data header */ + int needscan = 0; /* need to rescan data frees */ + __be16 *tagp; /* data entry tag pointer */ + __be16 *bests; + + length = dp->d_ops->data_entsize(args->namelen); + error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex, + length); + if (error) + return error; + /* - * We had a data block so we don't have to make a new one. + * Now we know if we must allocate blocks, so if we are checking whether + * we can insert without allocation then we can return now. */ - else { - /* - * If just checking, we succeeded. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + if (dbno == -1) + return -ENOSPC; + return 0; + } - /* - * Read the data block in. - */ + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (dbno == -1) { + /* we're going to have to log the free block index later */ + logfree = 1; + error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp, + &findex); + } else { + /* Read the data block in. 
*/ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, dbno), -1, &dbp); - if (error) - return error; - hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); - logfree = 0; } + if (error) + return error; + + /* setup for data block up now */ + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); ASSERT(be16_to_cpu(bf[0].length) >= length); - /* - * Point to the existing unused space. - */ + + /* Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); - needscan = needlog = 0; - /* - * Mark the first part of the unused space, inuse for us. - */ + + /* Mark the first part of the unused space, inuse for us. */ aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, &needlog, &needscan); @@ -2014,9 +1920,8 @@ xfs_dir2_node_addname_int( xfs_trans_brelse(tp, dbp); return error; } - /* - * Fill in the new entry and log it. - */ + + /* Fill in the new entry and log it. */ dep = (xfs_dir2_data_entry_t *)dup; dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; @@ -2025,38 +1930,101 @@ xfs_dir2_node_addname_int( tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(args, dbp, dep); - /* - * Rescan the block for bestfree if needed. - */ + + /* Rescan the freespace and log the data block if needed. */ if (needscan) xfs_dir2_data_freescan(dp, hdr, &needlog); - /* - * Log the data block header if needed. - */ if (needlog) xfs_dir2_data_log_header(args, dbp); - /* - * If the freespace entry is now wrong, update it. - */ - bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ - if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + + /* If the freespace block entry is now wrong, update it. */ + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + if (bests[findex] != bf[0].length) { bests[findex] = bf[0].length; logfree = 1; } - /* - * Log the freespace entry if needed. - */ + + /* Log the freespace entry if needed. */ if (logfree) xfs_dir2_free_log_bests(args, fbp, findex, findex); - /* - * Return the data block and offset in args, then drop the data block. - */ + + /* Return the data block and offset in args. */ args->blkno = (xfs_dablk_t)dbno; args->index = be16_to_cpu(*tagp); return 0; } /* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_addname(args); + + /* + * Allocate and initialize the state (btree cursor). + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != -ENOENT) { + goto done; + } + /* + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. + */ + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + /* + * Add the new leaf entry. 
+ */ + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da3_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == -ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da3_split(state); + } +done: + xfs_da_state_free(state); + return rval; +} + +/* * Lookup an entry in a node-format directory. * All the real work happens in xfs_da3_node_lookup_int. * The only real output is the inode number of the entry. diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 033589257f54..85f14fc2a8da 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -164,7 +164,7 @@ xfs_dir2_block_to_sf( * can free the block and copy the formatted data into the inode literal * area. */ - dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + dst = kmem_alloc(mp->m_sb.sb_inodesize, 0); hdr = bp->b_addr; /* @@ -436,7 +436,7 @@ xfs_dir2_sf_addname_hard( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; - buf = kmem_alloc(old_isize, KM_SLEEP); + buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, sfp, old_isize); /* @@ -1096,7 +1096,7 @@ xfs_dir2_sf_toino4( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, KM_SLEEP); + buf = kmem_alloc(oldsize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); @@ -1169,7 +1169,7 @@ xfs_dir2_sf_toino8( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, KM_SLEEP); + buf = kmem_alloc(oldsize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 52d03a3a02a4..39dd2b908106 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -287,7 +287,7 @@ struct xfs_ag_geometry { uint32_t ag_ifree; /* o: inodes free */ uint32_t ag_sick; /* o: sick things in ag */ uint32_t ag_checked; /* o: checked metadata in ag */ - uint32_t ag_reserved32; /* o: zero */ + uint32_t ag_flags; /* i/o: flags for this ag */ uint64_t ag_reserved[12];/* o: zero */ }; #define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 04377ab75863..588d44613094 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2787,8 +2787,13 @@ xfs_ialloc_setup_geometry( igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, inodes); - /* Set the maximum inode count for this filesystem. */ - if (sbp->sb_imax_pct) { + /* + * Set the maximum inode count for this filesystem, being careful not + * to use obviously garbage sb_inopblog/sb_inopblock values. Regular + * users should never get here due to failing sb verification, but + * certain users (xfs_db) need to be usable even with corrupt metadata. + */ + if (sbp->sb_imax_pct && igeo->ialloc_blks) { /* * Make sure the maximum inode count is a multiple * of the units we allocate inodes in. 
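The xfs_ialloc_setup_geometry() hunk above computes the maximum inode count only when both sb_imax_pct and igeo->ialloc_blks are non-zero, because tools like xfs_db must keep working on superblocks that would fail verification. A sketch of why the guard matters, with an illustrative formula and helper names (not the kernel's exact arithmetic):

#include <stdint.h>

/* (x / y) * y, mirroring the kernel's rounddown(); y must be non-zero. */
static uint64_t rounddown_mult(uint64_t x, uint64_t y)
{
	return (x / y) * y;
}

static uint64_t max_inode_count(uint64_t dblocks, uint8_t imax_pct,
				uint64_t ialloc_inos, uint64_t ialloc_blks)
{
	uint64_t icount;

	/*
	 * Corrupt geometry can make ialloc_blks (and with it the inode
	 * chunk size) zero; rounding down to a multiple of zero divides
	 * by zero, so refuse to enforce a maximum at all in that case.
	 */
	if (!imax_pct || !ialloc_blks)
		return 0;

	/* imax_pct percent of the data blocks, converted to inodes... */
	icount = dblocks * imax_pct / 100 * ialloc_inos / ialloc_blks;

	/* ...rounded down to whole inode allocation chunks. */
	return rounddown_mult(icount, ialloc_inos);
}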
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 27aa3f2bc4bc..7bc87408f1a0 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -616,7 +616,7 @@ xfs_iext_realloc_root( * sequence counter is seen before the modifications to the extent tree itself * take effect. */ -static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) +static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) { WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } @@ -633,7 +633,7 @@ xfs_iext_insert( struct xfs_iext_leaf *new = NULL; int nr_entries, i; - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); if (ifp->if_height == 0) xfs_iext_alloc_root(ifp, cur); @@ -875,7 +875,7 @@ xfs_iext_remove( ASSERT(ifp->if_u1.if_root != NULL); ASSERT(xfs_iext_valid(ifp, cur)); - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; for (i = cur->pos; i < nr_entries; i++) @@ -983,7 +983,7 @@ xfs_iext_update_extent( { struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); - xfs_iext_inc_seq(ifp, state); + xfs_iext_inc_seq(ifp); if (cur->pos == 0) { struct xfs_bmbt_irec old; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bf3e04018246..c643beeb5a24 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -94,7 +94,7 @@ xfs_iformat_fork( return 0; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: @@ -147,7 +147,7 @@ xfs_init_local_fork( if (size) { real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -302,7 +302,7 @@ xfs_iformat_btree( } ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ifp->if_broot = kmem_alloc(size, KM_NOFS); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -367,7 +367,7 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot = kmem_alloc(new_size, KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; } @@ -382,7 +382,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - KM_SLEEP | KM_NOFS); + KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -408,7 +408,7 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + new_broot = kmem_alloc(new_size, KM_NOFS); /* * First copy over the btree block header. */ @@ -492,7 +492,7 @@ xfs_idata_realloc( * We enforce that here. 
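 */

/*
 * Editor's illustration (hypothetical, not patch text): the if_seq counter
 * bumped by xfs_iext_inc_seq() above lets a reader detect concurrent extent
 * tree changes without holding the tree stable. A sketch of the reader side:
 */
	unsigned int	seq = READ_ONCE(ifp->if_seq);

	/* ... sample an extent mapping from the tree ... */

	if (READ_ONCE(ifp->if_seq) != seq) {
		/* the tree changed underneath us; revalidate the mapping */
	}

/* Round the new size up to a multiple of 4 bytes and reallocate in place.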
*/ ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_SLEEP | KM_NOFS); + roundup(new_size, 4), KM_NOFS); ifp->if_bytes = new_size; } @@ -683,7 +683,7 @@ xfs_ifork_init_cow( return; ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, - KM_SLEEP | KM_NOFS); + KM_NOFS); ip->i_cowfp->if_flags = XFS_IFEXTENTS; ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 51bb9bdb0e84..9a7fadb1361c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1174,7 +1174,7 @@ out_cur: /* * Record a refcount intent for later processing. */ -static int +static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, @@ -1189,44 +1189,43 @@ __xfs_refcount_add( blockcount); ri = kmem_alloc(sizeof(struct xfs_refcount_intent), - KM_SLEEP | KM_NOFS); + KM_NOFS); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); - return 0; } /* * Increase the reference count of the blocks backing a file's extent. */ -int +void xfs_refcount_increase_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) - return 0; + return; - return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, - PREV->br_startblock, PREV->br_blockcount); + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + PREV->br_blockcount); } /* * Decrease the reference count of the blocks backing a file's extent. */ -int +void xfs_refcount_decrease_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) - return 0; + return; - return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, - PREV->br_startblock, PREV->br_blockcount); + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + PREV->br_blockcount); } /* @@ -1541,47 +1540,40 @@ __xfs_refcount_cow_free( } /* Record a CoW staging extent in the refcount btree. */ -int +void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { struct xfs_mount *mp = tp->t_mountp; - int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return 0; + return; - error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); - if (error) - return error; + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); /* Add rmap entry */ - return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), + xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. 
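 */

/*
 * Editor's aside (sketch, not patch text): with __xfs_refcount_add() no
 * longer able to fail, the callers above lose a round of error plumbing.
 * Illustrative before/after:
 */
	/* before: every intent add had to be checked */
	error = xfs_refcount_alloc_cow_extent(tp, fsb, len);
	if (error)
		return error;

	/* after: queueing a deferred intent cannot fail */
	xfs_refcount_alloc_cow_extent(tp, fsb, len);

/* Queue removal of the rmap entry, then the refcount update itself.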
*/ -int +void xfs_refcount_free_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { struct xfs_mount *mp = tp->t_mountp; - int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return 0; + return; /* Remove rmap entry */ - error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), + xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); - if (error) - return error; - - return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); } struct xfs_refcount_recovery { @@ -1602,7 +1594,7 @@ xfs_refcount_recover_extent( if (be32_to_cpu(rec->refc.rc_refcount) != 1) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); list_add_tail(&rr->rr_list, debris); @@ -1679,10 +1671,8 @@ xfs_refcount_recover_cow_leftovers( /* Free the orphan record */ agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; fsb = XFS_AGB_TO_FSB(mp, agno, agbno); - error = xfs_refcount_free_cow_extent(tp, fsb, + xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); - if (error) - goto out_trans; /* Free the block. */ xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 1d9c518575e7..209795539c8d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -29,9 +29,9 @@ struct xfs_refcount_intent { xfs_extlen_t ri_blockcount; }; -extern int xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); -extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, @@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, - xfs_fsblock_t fsb, xfs_extlen_t len); -extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, - xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, + xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, + xfs_extlen_t len); extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index e6aeb390b2fb..38e9414878b3 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -168,7 +168,6 @@ xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) { - irec->rm_flags = 0; irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); @@ -254,15 +253,15 @@ xfs_rmap_find_left_neighbor_helper( rec->rm_flags); if (rec->rm_owner != info->high.rm_owner) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; *info->irec = *rec; *info->stat = 1; - return 
XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -305,7 +304,7 @@ xfs_rmap_find_left_neighbor( error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, @@ -330,16 +329,16 @@ xfs_rmap_lookup_le_range_helper( rec->rm_flags); if (rec->rm_owner != info->high.rm_owner) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && (rec->rm_offset > info->high.rm_offset || rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; *info->irec = *rec; *info->stat = 1; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -377,7 +376,7 @@ xfs_rmap_lookup_le_range( cur->bc_private.a.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, @@ -2268,7 +2267,7 @@ xfs_rmap_update_is_needed( * Record a rmap intent; the list is kept sorted first by AG and then by * increasing age. */ -static int +static void __xfs_rmap_add( struct xfs_trans *tp, enum xfs_rmap_intent_type type, @@ -2287,7 +2286,7 @@ __xfs_rmap_add( bmap->br_blockcount, bmap->br_state); - ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); + ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; @@ -2295,11 +2294,10 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); - return 0; } /* Map an extent into a file. */ -int +void xfs_rmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, @@ -2307,15 +2305,15 @@ xfs_rmap_map_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, whichfork, PREV); } /* Unmap an extent out of a file. */ -int +void xfs_rmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, @@ -2323,9 +2321,9 @@ xfs_rmap_unmap_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, whichfork, PREV); } @@ -2336,7 +2334,7 @@ xfs_rmap_unmap_extent( * Note that tp can be NULL here as no transaction is used for COW fork * unwritten conversion. */ -int +void xfs_rmap_convert_extent( struct xfs_mount *mp, struct xfs_trans *tp, @@ -2345,15 +2343,15 @@ xfs_rmap_convert_extent( struct xfs_bmbt_irec *PREV) { if (!xfs_rmap_update_is_needed(mp, whichfork)) - return 0; + return; - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. 
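 */

/*
 * Editor's illustration (hypothetical helper, not patch text) of the query
 * convention changed above: range query callbacks now return 0 to keep
 * iterating and -ECANCELED to stop early, and callers squash -ECANCELED
 * back to success:
 */
static int
xfs_example_count_helper(
	struct xfs_btree_cur	*cur,
	struct xfs_rmap_irec	*rec,
	void			*priv)
{
	unsigned int		*nr = priv;

	if (++(*nr) >= 16)
		return -ECANCELED;	/* stop the walk, not a failure */
	return 0;			/* keep going */
}

	unsigned int		nr = 0;

	error = xfs_rmap_query_all(cur, xfs_example_count_helper, &nr);
	if (error == -ECANCELED)
		error = 0;		/* an early stop is not an error */

/* The non-file data rmap helpers below get the same void treatment.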
*/ -int +void xfs_rmap_alloc_extent( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2364,18 +2362,18 @@ xfs_rmap_alloc_extent( struct xfs_bmbt_irec bmap; if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) - return 0; + return; bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); } /* Schedule the deletion of an rmap for non-file data. */ -int +void xfs_rmap_free_extent( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2386,14 +2384,14 @@ xfs_rmap_free_extent( struct xfs_bmbt_irec bmap; if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) - return 0; + return; bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); } /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ @@ -2511,7 +2509,7 @@ xfs_rmap_has_other_keys_helper( ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; rks->has_rmap = true; - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; } /* @@ -2540,8 +2538,11 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); + if (error < 0) + return error; + *has_rmap = rks.has_rmap; - return error; + return 0; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index e21ed0294e5c..abe633403fd1 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack( if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) return -EFSCORRUPTED; irec->rm_offset = XFS_RMAP_OFF(offset); + irec->rm_flags = 0; if (offset & XFS_RMAP_OFF_ATTR_FORK) irec->rm_flags |= XFS_RMAP_ATTR_FORK; if (offset & XFS_RMAP_OFF_BMBT_BLOCK) @@ -161,16 +162,16 @@ struct xfs_rmap_intent { }; /* functions for updating the rmapbt based on bmbt map/unmap operations */ -int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, +void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, +void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index e0641b7337b3..c45acbd3add9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -177,10 +177,4 @@ struct xfs_ino_geometry { unsigned int agino_log; /* #bits for agino in 
inum */ }; -/* Keep iterating the data structure. */ -#define XFS_ITER_CONTINUE (0) - -/* Stop iterating the data structure. */ -#define XFS_ITER_ABORT (1) - #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 802b34cd10fe..300b3e91ca3a 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -169,6 +169,14 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +/* per-AG block reservation types */ +enum xfs_ag_resv_type { + XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_AGFL, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_RMAPBT, +}; + /* * Type verifier functions */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 16b09b941441..ba0f747c82e8 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -639,7 +639,7 @@ xchk_agfl_block( xchk_agfl_block_xref(sc, agbno); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return XFS_ITER_ABORT; + return -ECANCELED; return 0; } @@ -730,7 +730,7 @@ xchk_agfl( /* Check the blocks in the AGFL. */ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), sc->sa.agfl_bp, xchk_agfl_block, &sai); - if (error == XFS_ITER_ABORT) { + if (error == -ECANCELED) { error = 0; goto out_free; } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 1afc58bf71dd..0edc7f8eb96e 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -80,7 +80,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); if (error) return error; } @@ -163,8 +163,6 @@ xchk_xattr_listent( args.valuelen = valuelen; error = xfs_attr_get_ilocked(context->dp, &args); - if (error == -EEXIST) - error = 0; if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, &error)) goto fail_xref; @@ -173,7 +171,7 @@ xchk_xattr_listent( args.blkno); fail_xref: if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - context->seen_enough = XFS_ITER_ABORT; + context->seen_enough = 1; return; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 1bd29fdc2ab5..fa6ea6407992 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -75,6 +75,7 @@ struct xchk_bmap_info { xfs_fileoff_t lastoff; bool is_rt; bool is_shared; + bool was_loaded; int whichfork; }; @@ -213,25 +214,20 @@ xchk_bmap_xref_rmap( /* Cross-reference a single rtdev extent record. */ STATIC void -xchk_bmap_rt_extent_xref( - struct xchk_bmap_info *info, +xchk_bmap_rt_iextent_xref( struct xfs_inode *ip, - struct xfs_btree_cur *cur, + struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return; - xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); } /* Cross-reference a single datadev extent record. */ STATIC void -xchk_bmap_extent_xref( - struct xchk_bmap_info *info, +xchk_bmap_iextent_xref( struct xfs_inode *ip, - struct xfs_btree_cur *cur, + struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; @@ -240,9 +236,6 @@ xchk_bmap_extent_xref( xfs_extlen_t len; int error; - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return; - agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); len = irec->br_blockcount; @@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent( /* Scrub a single extent record. 
*/ STATIC int -xchk_bmap_extent( +xchk_bmap_iextent( struct xfs_inode *ip, - struct xfs_btree_cur *cur, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - struct xfs_buf *bp = NULL; xfs_filblks_t end; int error = 0; - if (cur) - xfs_btree_get_block(cur, 0, &bp); - /* * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. @@ -364,10 +352,13 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + if (info->is_rt) - xchk_bmap_rt_extent_xref(info, ip, cur, irec); + xchk_bmap_rt_iextent_xref(ip, info, irec); else - xchk_bmap_extent_xref(info, ip, cur, irec); + xchk_bmap_iextent_xref(ip, info, irec); info->lastoff = irec->br_startoff + irec->br_blockcount; return error; @@ -380,10 +371,13 @@ xchk_bmapbt_rec( union xfs_btree_rec *rec) { struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec iext_irec; + struct xfs_iext_cursor icur; struct xchk_bmap_info *info = bs->private; struct xfs_inode *ip = bs->cur->bc_private.b.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); uint64_t owner; int i; @@ -402,9 +396,26 @@ xchk_bmapbt_rec( } } - /* Set up the in-core record and scrub it. */ + /* + * Check that the incore extent tree contains an extent that matches + * this one exactly. We validate those cached bmaps later, so we don't + * need to check them here. If the incore extent tree was just loaded + * from disk by the scrubber, we assume that its contents match what's + * on disk (we still hold the ILOCK) and skip the equivalence check. + */ + if (!info->was_loaded) + return 0; + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); - return xchk_bmap_extent(ip, bs->cur, info, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, + &iext_irec) || + irec.br_startoff != iext_irec.br_startoff || + irec.br_startblock != iext_irec.br_startblock || + irec.br_blockcount != iext_irec.br_blockcount || + irec.br_state != iext_irec.br_state) + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; } /* Scan the btree records. */ @@ -415,15 +426,26 @@ xchk_bmap_btree( struct xchk_bmap_info *info) { struct xfs_owner_info oinfo; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; struct xfs_btree_cur *cur; int error; + /* Load the incore bmap cache if it's not loaded. */ + info->was_loaded = ifp->if_flags & XFS_IFEXTENTS; + if (!info->was_loaded) { + error = xfs_iread_extents(sc->tp, ip, whichfork); + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + } + + /* Check the btree structure. 
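 */

/*
 * Editor's note on the equivalence check added above: the incore lookup
 * finds the extent covering the requested offset, or the next one after it,
 * which is why a record only counts as a match when every field agrees.
 * Sketch (hypothetical, assuming the ILOCK is held and extents are loaded):
 */
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;

	if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, &got) ||
	    got.br_startoff != irec.br_startoff ||
	    got.br_startblock != irec.br_startblock ||
	    got.br_blockcount != irec.br_blockcount ||
	    got.br_state != irec.br_state) {
		/* no exact incore twin for the on-disk record: corruption */
	}

/* With the incore extent cache populated, walk the on-disk btree blocks.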
*/ cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); xfs_btree_del_cursor(cur, error); +out: return error; } @@ -500,7 +522,7 @@ xchk_bmap_check_rmap( out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; return 0; } @@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps( sbcri.sc = sc; sbcri.whichfork = whichfork; error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == -ECANCELED) error = 0; xfs_btree_del_cursor(cur, error); @@ -671,13 +693,6 @@ xchk_bmap( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; - /* Now try to scrub the in-memory extent list. */ - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(sc->tp, ip, whichfork); - if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) - goto out; - } - /* Find the offset of the last extent in the mapping. */ error = xfs_bmap_last_offset(ip, &endoff, whichfork); if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) @@ -689,7 +704,7 @@ xchk_bmap( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - break; + goto out; if (isnullstartblock(irec.br_startblock)) continue; if (irec.br_startoff >= endoff) { @@ -697,7 +712,7 @@ xchk_bmap( irec.br_startoff); goto out; } - error = xchk_bmap_extent(ip, NULL, &info, &irec); + error = xchk_bmap_iextent(ip, &info, &irec); if (error) goto out; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index fc3f510c9034..98f82d7c8b40 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -125,7 +125,7 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); if (!sc->buf) return -ENOMEM; fsc = sc->buf; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 4cfeec57fb05..b70a88bc975e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -351,7 +351,7 @@ xrep_init_btblock( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); - xfs_trans_log_buf(tp, bp, 0, bp->b_length); + xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); bp->b_ops = ops; *bpp = bp; @@ -664,7 +664,7 @@ xrep_findroot_agfl_walk( { xfs_agblock_t *agbno = priv; - return (*agbno == bno) ? XFS_ITER_ABORT : 0; + return (*agbno == bno) ? -ECANCELED : 0; } /* Does this block match the btree information passed in? */ @@ -694,7 +694,7 @@ xrep_findroot_block( if (owner == XFS_RMAP_OWN_AG) { error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, xrep_findroot_agfl_walk, &agbno); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) return 0; if (error) return error; diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 99c0b1234c3c..5641ae512c9e 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -22,7 +22,7 @@ xchk_setup_symlink( struct xfs_inode *ip) { /* Allocate the buffer without the inode lock held. 
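 */

/*
 * Editor's note on the xrep_init_btblock() change above: xfs_trans_log_buf()
 * takes an inclusive byte range, while bp->b_length counts 512-byte basic
 * blocks, so the old call logged only the first few bytes. Units sketch:
 */
	/* BBTOB() converts basic blocks to bytes; the last byte is inclusive */
	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);

/* The symlink scrub buffer, sized for the largest possible target and
 * allocated before the inode lock is taken: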
*/ - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); + sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cbda40d40326..96d7071cfa46 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -112,7 +112,7 @@ xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl = NULL; - struct xfs_acl *xfs_acl; + struct xfs_acl *xfs_acl = NULL; unsigned char *ea_name; int error; int len; @@ -135,12 +135,8 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. */ len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); - if (!xfs_acl) - return ERR_PTR(-ENOMEM); - - error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, - &len, ATTR_ROOT); + error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, + ATTR_ALLOC | ATTR_ROOT); if (error) { /* * If the attribute doesn't exist make sure we have a negative @@ -151,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type) } else { acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + kmem_free(xfs_acl); } - kmem_free(xfs_acl); return acl; } @@ -180,7 +176,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct xfs_acl *xfs_acl; int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + xfs_acl = kmem_zalloc_large(len, 0); if (!xfs_acl) return -ENOMEM; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index dc93c51c17de..a640a285cc52 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -147,7 +147,7 @@ xfs_attr3_leaf_inactive( * Allocate storage for a list of all the "remote" value extents. */ size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, KM_SLEEP); + list = kmem_alloc(size, 0); /* * Identify each of the "remote" value extents. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 58fc820a70c6..00758fdc2fec 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -109,7 +109,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * It didn't all fit, so we have to sort everything on hashval. 
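 */

/*
 * Editor's sketch of the ATTR_ALLOC convention that xfs_get_acl() switches
 * to above: pass a pointer to a NULL buffer and let xfs_attr_get() size and
 * allocate it; the caller frees it afterwards (illustrative only):
 */
	struct xfs_acl	*xfs_acl = NULL;
	int		len = XFS_ACL_MAX_SIZE(ip->i_mount);

	error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
			ATTR_ALLOC | ATTR_ROOT);
	if (!error) {
		/* ... decode up to len bytes of ACL data ... */
		kmem_free(xfs_acl);
	}

/* Sort the shortform attribute entries by hash value: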
*/ sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); /* * Scan the attribute list for the rest of the entries, storing diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9fa4a7ee8cfc..83d24e983d4c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -141,7 +141,7 @@ xfs_bui_init( { struct xfs_bui_log_item *buip; - buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + buip = kmem_zone_zalloc(xfs_bui_zone, 0); xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; @@ -218,7 +218,7 @@ xfs_trans_get_bud( { struct xfs_bud_log_item *budp; - budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + budp = kmem_zone_zalloc(xfs_bud_zone, 0); xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); budp->bud_buip = buip; @@ -542,9 +542,7 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp, ip, &irec); - if (error) - goto err_inode; + xfs_bmap_unmap_extent(tp, ip, &irec); } set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 98c6a7a71427..0910cb75b65d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -39,9 +39,9 @@ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_BB(ip->i_mount, fsb); + return XFS_FSB_TO_DADDR(ip->i_mount, fsb); } /* @@ -1532,24 +1532,16 @@ xfs_swap_extent_rmap( trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); /* Remove the mapping from the donor file. */ - error = xfs_bmap_unmap_extent(tp, tip, &uirec); - if (error) - goto out; + xfs_bmap_unmap_extent(tp, tip, &uirec); /* Remove the mapping from the source file. */ - error = xfs_bmap_unmap_extent(tp, ip, &irec); - if (error) - goto out; + xfs_bmap_unmap_extent(tp, ip, &irec); /* Map the donor file's blocks into the source file. */ - error = xfs_bmap_map_extent(tp, ip, &uirec); - if (error) - goto out; + xfs_bmap_map_extent(tp, ip, &uirec); /* Map the source file's blocks into the donor file. 
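 */

/*
 * Editor's aside (sketch, not patch text): after the conversion above none
 * of the four remap steps can fail individually; any error now surfaces
 * when the queued intents are processed:
 */
	xfs_bmap_unmap_extent(tp, tip, &uirec);	/* donor drops its mapping */
	xfs_bmap_unmap_extent(tp, ip, &irec);	/* source drops its mapping */
	xfs_bmap_map_extent(tp, ip, &uirec);	/* source takes donor blocks */
	xfs_bmap_map_extent(tp, tip, &irec);	/* donor takes source blocks */
	error = xfs_defer_finish(tpp);		/* intents run; errors land here */

/* Finally, map the source file's old blocks into the donor file: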
*/ - error = xfs_bmap_map_extent(tp, tip, &irec); - if (error) - goto out; + xfs_bmap_map_extent(tp, tip, &irec); error = xfs_defer_finish(tpp); tp = *tpp; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ca0849043f54..120ef99d09e8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -353,7 +353,8 @@ xfs_buf_allocate_memory( */ size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { - bp->b_addr = kmem_alloc(size, KM_NOFS); + int align_mask = xfs_buftarg_dma_alignment(bp->b_target); + bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; @@ -368,7 +369,7 @@ xfs_buf_allocate_memory( } bp->b_offset = offset_in_page(bp->b_addr); bp->b_pages = bp->b_page_array; - bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_pages[0] = kmem_to_page(bp->b_addr); bp->b_page_count = 1; bp->b_flags |= _XBF_KMEM; return 0; @@ -1741,7 +1742,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); + btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c6e57a3f409e..f6ce17d8d848 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -350,6 +350,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) +static inline int +xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) +{ + return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); +} + int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7dcaec54a20b..d74fbd1e9d3e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -702,7 +702,7 @@ xfs_buf_item_get_format( } bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), - KM_SLEEP); + 0); if (!bip->bli_formats) return -ENOMEM; return 0; @@ -747,7 +747,7 @@ xfs_buf_item_init( return 0; } - bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); + bip = kmem_zone_zalloc(xfs_buf_item_zone, 0); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index fb1ad4483081..aeb95e7391c1 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -440,7 +440,7 @@ xfs_dquot_alloc( { struct xfs_dquot *dqp; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); + dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); dqp->dq_flags = type; dqp->q_core.d_id = cpu_to_be32(id); @@ -1239,7 +1239,7 @@ xfs_qm_exit(void) /* * Iterate every dquot of a particular type. The caller must ensure that the * particular quota type is active. iter_fn can return negative error codes, - * or XFS_ITER_ABORT to indicate that it wants to stop iterating. + * or -ECANCELED to indicate that it wants to stop iterating. */ int xfs_qm_dqiterate( diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 282ec5af293e..d60647d7197b 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init( { struct xfs_qoff_logitem *qf; - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 
&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 544c9482a0ef..849fd4476950 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -213,7 +213,7 @@ xfs_errortag_init( struct xfs_mount *mp) { mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 0ed68379e551..2183d87be4cf 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -33,7 +33,7 @@ xfs_extent_busy_insert( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); + new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); new->agno = agno; new->bno = bno; new->length = len; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 86f6512d6864..e44efc41a041 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -163,9 +163,9 @@ xfs_efi_init( if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efi_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, KM_SLEEP); + efip = kmem_zalloc(size, 0); } else { - efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); + efip = kmem_zone_zalloc(xfs_efi_zone, 0); } xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); @@ -333,9 +333,9 @@ xfs_trans_get_efd( if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + (nextents - 1) * sizeof(struct xfs_extent), - KM_SLEEP); + 0); } else { - efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + efdp = kmem_zone_zalloc(xfs_efd_zone, 0); } xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 28101bbc0b78..d952d5962e93 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -28,6 +28,7 @@ #include <linux/falloc.h> #include <linux/backing-dev.h> #include <linux/mman.h> +#include <linux/fadvise.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -933,6 +934,30 @@ out_unlock: return error; } +STATIC int +xfs_file_fadvise( + struct file *file, + loff_t start, + loff_t end, + int advice) +{ + struct xfs_inode *ip = XFS_I(file_inode(file)); + int ret; + int lockflags = 0; + + /* + * Operations creating pages in page cache need protection from hole + * punching and similar ops + */ + if (advice == POSIX_FADV_WILLNEED) { + lockflags = XFS_IOLOCK_SHARED; + xfs_ilock(ip, lockflags); + } + ret = generic_fadvise(file, start, end, advice); + if (lockflags) + xfs_iunlock(ip, lockflags); + return ret; +} STATIC loff_t xfs_file_remap_range( @@ -1232,6 +1257,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, + .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, }; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 5a8f9641562a..d082143feb5a 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -250,7 +250,7 @@ xfs_getfsmap_helper( rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* Are we just counting mappings? 
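 */

/*
 * Editor's illustration of the race xfs_file_fadvise() (added above)
 * closes: WILLNEED readahead instantiates page cache pages, so it must
 * exclude hole punching, which runs under the exclusive IOLOCK. A
 * hypothetical userspace view of the two racing operations:
 */
	/* thread A: populate the page cache for an upcoming read */
	posix_fadvise(fd, 0, len, POSIX_FADV_WILLNEED);

	/* thread B: concurrently punch a hole over the same range */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, sz);

/* Back in the fsmap helper: are we just counting mappings?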
*/ @@ -259,14 +259,14 @@ xfs_getfsmap_helper( info->head->fmh_entries++; if (info->last) - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; info->head->fmh_entries++; rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* @@ -276,7 +276,7 @@ xfs_getfsmap_helper( */ if (rec_daddr > info->next_daddr) { if (info->head->fmh_entries >= info->head->fmh_count) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; fmr.fmr_device = info->dev; fmr.fmr_physical = info->next_daddr; @@ -295,7 +295,7 @@ xfs_getfsmap_helper( /* Fill out the extent we found */ if (info->head->fmh_entries >= info->head->fmh_count) - return XFS_BTREE_QUERY_RANGE_ABORT; + return -ECANCELED; trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); @@ -328,7 +328,7 @@ out: rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; - return XFS_BTREE_QUERY_RANGE_CONTINUE; + return 0; } /* Transform a rmapbt irec into a fsmap */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0b0fd10a36d4..944add5ff8e0 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -40,7 +40,7 @@ xfs_inode_alloc( * KM_MAYFAIL and return NULL here on ENOMEM. Set the * code up to do this anyway. */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + ip = kmem_zone_alloc(xfs_inode_zone, 0); if (!ip) return NULL; if (inode_init_always(mp->m_super, VFS_I(ip))) { diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d99a0a3e5f40..3ebd1b7f49d8 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -89,7 +89,7 @@ xfs_icreate_log( { struct xfs_icreate_item *icp; - icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + icp = kmem_zone_zalloc(xfs_icreate_zone, 0); xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, &xfs_icreate_item_ops); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6467d5e1df2d..18f4b262e61c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2018,7 +2018,7 @@ xfs_iunlink_add_backref( if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) return 0; - iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu = kmem_zalloc(sizeof(*iu), KM_NOFS); iu->iu_agino = prev_agino; iu->iu_next_unlinked = this_agino; @@ -3282,7 +3282,8 @@ xfs_rename( spaceres); /* - * Set up the target. + * Check for expected errors before we dirty the transaction + * so we can return an error without a transaction abort. */ if (target_ip == NULL) { /* @@ -3294,6 +3295,46 @@ xfs_rename( if (error) goto out_trans_cancel; } + } else { + /* + * If target exists and it's a directory, check that whether + * it can be destroyed. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode) && + (!xfs_dir_isempty(target_ip) || + (VFS_I(target_ip)->i_nlink > 2))) { + error = -EEXIST; + goto out_trans_cancel; + } + } + + /* + * Directory entry creation below may acquire the AGF. Remove + * the whiteout from the unlinked list first to preserve correct + * AGI/AGF locking order. This dirties the transaction so failures + * after this point will abort and log recovery will clean up the + * mess. + * + * For whiteouts, we need to bump the link count on the whiteout + * inode. After this point, we have a real link, clear the tmpfile + * state flag from the inode so it doesn't accidentally get misused + * in future. 
+ */ + if (wip) { + ASSERT(VFS_I(wip)->i_nlink == 0); + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_cancel; + + xfs_bumplink(tp, wip); + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); + VFS_I(wip)->i_state &= ~I_LINKABLE; + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { /* * If target does not exist and the rename crosses * directories, adjust the target directory link count @@ -3312,22 +3353,6 @@ xfs_rename( } } else { /* target_ip != NULL */ /* - * If target exists and it's a directory, check that both - * target and source are directories and that target can be - * destroyed, or that neither is a directory. - */ - if (S_ISDIR(VFS_I(target_ip)->i_mode)) { - /* - * Make sure target dir is empty. - */ - if (!(xfs_dir_isempty(target_ip)) || - (VFS_I(target_ip)->i_nlink > 2)) { - error = -EEXIST; - goto out_trans_cancel; - } - } - - /* * Link the source inode under the target name. * If the source inode is a directory and we are moving * it across directories, its ".." entry will be @@ -3417,30 +3442,6 @@ xfs_rename( if (error) goto out_trans_cancel; - /* - * For whiteouts, we need to bump the link count on the whiteout inode. - * This means that failures all the way up to this point leave the inode - * on the unlinked list and so cleanup is a simple matter of dropping - * the remaining reference to it. If we fail here after bumping the link - * count, we're shutting down the filesystem so we'll never see the - * intermediate state on disk. - */ - if (wip) { - ASSERT(VFS_I(wip)->i_nlink == 0); - xfs_bumplink(tp, wip); - error = xfs_iunlink_remove(tp, wip); - if (error) - goto out_trans_cancel; - xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); - - /* - * Now we have a real link, clear the "I'm a tmpfile" state - * flag from the inode so it doesn't accidentally get misused in - * future. - */ - VFS_I(wip)->i_state &= ~I_LINKABLE; - } - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c9a502eed204..bb8f076805b9 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -651,7 +651,7 @@ xfs_inode_item_init( struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index affa557c2337..d58f0d6a699e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -396,7 +396,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); if (!kbuf) goto out_dput; @@ -434,11 +434,11 @@ xfs_attrmulti_attr_get( if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; - kbuf = kmem_zalloc_large(*len, KM_SLEEP); + kbuf = kmem_zalloc_large(*len, 0); if (!kbuf) return -ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); + error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); if (error) goto out_kfree; @@ -831,7 +831,7 @@ xfs_bulkstat_fmt( /* * Check the incoming bulk request @hdr from userspace and initialize the * internal @breq bulk request appropriately. 
Returns 0 if the bulk request - * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual + * should proceed; -ECANCELED if there's nothing to do; or the usual * negative error code. */ static int @@ -889,13 +889,13 @@ xfs_bulk_ireq_setup( /* Asking for an inode past the end of the AG? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) - return XFS_ITER_ABORT; + return -ECANCELED; } else if (hdr->agno) return -EINVAL; /* Asking for an inode past the end of the FS? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) - return XFS_ITER_ABORT; + return -ECANCELED; return 0; } @@ -936,7 +936,7 @@ xfs_ioc_bulkstat( return -EFAULT; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; @@ -986,7 +986,7 @@ xfs_ioc_inumbers( return -EFAULT; error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); - if (error == XFS_ITER_ABORT) + if (error == -ECANCELED) goto out_teardown; if (error < 0) return error; @@ -1038,6 +1038,10 @@ xfs_ioc_ag_geometry( if (copy_from_user(&ageo, arg, sizeof(ageo))) return -EFAULT; + if (ageo.ag_flags) + return -EINVAL; + if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) + return -EINVAL; error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); if (error) @@ -1309,8 +1313,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (S_ISREG(inode->i_mode) && - !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), sb->s_blocksize)) return -EINVAL; } @@ -1881,7 +1884,7 @@ xfs_ioc_getfsmap( info.mp = ip->i_mount; info.data = arg; error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + if (error == -ECANCELED) { error = 0; aborted = true; } else if (error) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7bd7534f5051..1e08bf79b478 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); if (!kbuf) goto out_dput; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3a4310d7cb59..f780e223b118 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -58,7 +58,7 @@ xfs_bmbt_to_iomap( { struct xfs_mount *mp = ip->i_mount; - if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); if (imap->br_startblock == HOLESTARTBLOCK) { @@ -297,7 +297,7 @@ xfs_iomap_write_direct( goto out_unlock; } - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) error = xfs_alert_fsblock_zero(ip, imap); out_unlock: @@ -814,7 +814,7 @@ xfs_iomap_write_unwritten( if (error) return error; - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) return xfs_alert_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f5c955d35be4..884950adbd16 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -137,7 +137,7 @@ xfs_bulkstat_one_int( xfs_irele(ip); error = 
bc->formatter(bc->breq, buf); - if (error == XFS_IBULK_ABORT) + if (error == -ECANCELED) goto out_advance; if (error) goto out; @@ -169,7 +169,7 @@ xfs_bulkstat_one( ASSERT(breq->icount == 1); bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -181,7 +181,7 @@ xfs_bulkstat_one( * If we reported one inode to userspace then we abort because we hit * the end of the buffer. Don't leak that back to userspace. */ - if (error == XFS_IWALK_ABORT) + if (error == -ECANCELED) error = 0; return error; @@ -243,7 +243,7 @@ xfs_bulkstat( return 0; bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_SLEEP | KM_MAYFAIL); + KM_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -342,7 +342,7 @@ xfs_inumbers_walk( int error; error = ic->formatter(ic->breq, &inogrp); - if (error && error != XFS_IBULK_ABORT) + if (error && error != -ECANCELED) return error; ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index e90c1fc5b981..96a1e2a9be3f 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -18,9 +18,6 @@ struct xfs_ibulk { /* Only iterate within the same AG as startino */ #define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) -/* Return value that means we want to abort the walk. */ -#define XFS_IBULK_ABORT (XFS_IWALK_ABORT) - /* * Advance the user buffer pointer by one record of the given size. If the * buffer is now full, return the appropriate error code. @@ -34,13 +31,21 @@ xfs_ibulk_advance( breq->ubuffer = b + bytes; breq->ocount++; - return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; + return breq->ocount == breq->icount ? -ECANCELED : 0; } /* * Return stat information in bulk (by-inode) for the filesystem. */ +/* + * Return codes for the formatter function are 0 to continue iterating, and + * non-zero to stop iterating. Any non-zero value will be passed up to the + * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop + * iteration, as neither bulkstat nor inumbers will ever generate that error + * code on their own. + */ + typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat); diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 8c7d727149ea..aa375cf53021 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -31,7 +31,7 @@ * inode it finds, it calls a walk function with the relevant inode number and * a pointer to caller-provided data. The walk function can return the usual * negative error code to stop the iteration; 0 to continue the iteration; or - * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the + * -ECANCELED to stop the iteration. This return value is returned to the * caller. * * Internally, we allow the walk function to do anything, which means that we @@ -616,7 +616,7 @@ xfs_iwalk_threaded( if (xfs_pwork_ctl_want_abort(&pctl)) break; - iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); + iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); iwag->mp = mp; iwag->iwalk_fn = iwalk_fn; iwag->data = data; diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 6c960e10ed4d..37a795f03267 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -6,12 +6,17 @@ #ifndef __XFS_IWALK_H__ #define __XFS_IWALK_H__ +/* + * Return codes for the inode/inobt walk function are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iwalk or inobt_walk caller. 
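As an editor's
 * illustration, here is a hypothetical walk function that merely counts
 * inodes (a sketch matching the xfs_iwalk_fn signature below):
 *
 *	static int
 *	count_inodes(struct xfs_mount *mp, struct xfs_trans *tp,
 *			xfs_ino_t ino, void *data)
 *	{
 *		unsigned int	*count = data;
 *
 *		(*count)++;
 *		return 0;	/* 0 keeps the walk going */
 *	}
 *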
The special value -ECANCELED can be used to + * stop iteration, as neither iwalk nor inobt_walk will ever generate that + * error code on their own. + */ + /* Walk all inodes in the filesystem starting from @startino. */ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, void *data); -/* Return values for xfs_iwalk_fn. */ -#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE) -#define XFS_IWALK_ABORT (XFS_ITER_ABORT) int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, xfs_iwalk_fn iwalk_fn, @@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, const struct xfs_inobt_rec_incore *irec, void *data); -/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */ -#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT) int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7fc3c1ad36bc..a2beee9f74da 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -214,15 +214,42 @@ xlog_grant_head_wake( { struct xlog_ticket *tic; int need_bytes; + bool woken_task = false; list_for_each_entry(tic, &head->waiters, t_queue) { + + /* + * There is a chance that the size of the CIL checkpoints in + * progress at the last AIL push target calculation resulted in + * limiting the target to the log head (l_last_sync_lsn) at the + * time. This may not reflect where the log head is now as the + * CIL checkpoints may have completed. + * + * Hence when we are woken here, it may be that the head of the + * log that has moved rather than the tail. As the tail didn't + * move, there still won't be space available for the + * reservation we require. However, if the AIL has already + * pushed to the target defined by the old log head location, we + * will hang here waiting for something else to update the AIL + * push target. + * + * Therefore, if there isn't space to wake the first waiter on + * the grant head, we need to push the AIL again to ensure the + * target reflects both the current log tail and log head + * position before we wait for the tail to move again. + */ + need_bytes = xlog_ticket_reservation(log, head, tic); - if (*free_bytes < need_bytes) + if (*free_bytes < need_bytes) { + if (!woken_task) + xlog_grant_push_ail(log, need_bytes); return false; + } *free_bytes -= need_bytes; trace_xfs_log_grant_wake_up(log, tic); wake_up_process(tic->t_task); + woken_task = true; } return true; @@ -428,8 +455,7 @@ xfs_log_reserve( XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, - KM_SLEEP); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt @@ -1404,6 +1430,7 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { + int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); @@ -1415,8 +1442,8 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kmem_alloc_large(log->l_iclog_size, - KM_MAYFAIL); + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, + KM_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG @@ -2496,21 +2523,35 @@ next_lv: ***************************************************************************** */ -/* Clean iclogs starting from the head. This ordering must be - * maintained, so an iclog doesn't become ACTIVE beyond one that - * is SYNCING. This is also required to maintain the notion that we use - * a ordered wait queue to hold off would be writers to the log when every - * iclog is trying to sync to disk. +/* + * An iclog has just finished IO completion processing, so we need to update + * the iclog state and propagate that up into the overall log state. Hence we + * prepare the iclog for cleaning, and then clean all the pending dirty iclogs + * starting from the head, and then wake up any threads that are waiting for the + * iclog to be marked clean. + * + * The ordering of marking iclogs ACTIVE must be maintained, so an iclog + * doesn't become ACTIVE beyond one that is SYNCING. This is also required to + * maintain the notion that we use a ordered wait queue to hold off would be + * writers to the log when every iclog is trying to sync to disk. + * + * Caller must hold the icloglock before calling us. * - * State Change: DIRTY -> ACTIVE + * State Change: !IOERROR -> DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log( - struct xlog *log) +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) { - xlog_in_core_t *iclog; - int changed = 0; + struct xlog_in_core *iclog; + int changed = 0; + /* Prepare the completed iclog. */ + if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR)) + dirty_iclog->ic_state = XLOG_STATE_DIRTY; + + /* Walk all the iclogs to update the ordered active state. */ iclog = log->l_iclog; do { if (iclog->ic_state == XLOG_STATE_DIRTY) { @@ -2548,7 +2589,13 @@ xlog_state_clean_log( iclog = iclog->ic_next; } while (iclog != log->l_iclog); - /* log is locked when we are called */ + + /* + * Wake up threads waiting in xfs_log_force() for the dirty iclog + * to be cleaned. + */ + wake_up_all(&dirty_iclog->ic_force_wait); + /* * Change state for the dummy log recording. * We usually go to NEED. But we go to NEED2 if the changed indicates @@ -2582,7 +2629,7 @@ xlog_state_clean_log( ASSERT(0); } } -} /* xlog_state_clean_log */ +} STATIC xfs_lsn_t xlog_get_lowest_lsn( @@ -2603,30 +2650,205 @@ xlog_get_lowest_lsn( return lowest_lsn; } +/* + * Completion of a iclog IO does not imply that a transaction has completed, as + * transactions can be large enough to span many iclogs. We cannot change the + * tail of the log half way through a transaction as this may be the only + * transaction in the log and moving the tail to point to the middle of it + * will prevent recovery from finding the start of the transaction. Hence we + * should only update the last_sync_lsn if this iclog contains transaction + * completion callbacks on it. + * + * We have to do this before we drop the icloglock to ensure we are the only one + * that can update it. 
+ * + * If we are moving the last_sync_lsn forwards, we also need to ensure we kick + * the reservation grant head pushing. This is because the push + * target is bound by the current last_sync_lsn value. Hence if we have a large + * amount of log space bound up in this committing transaction then the + * last_sync_lsn value may be the limiting factor preventing tail pushing from + * freeing space in the log. Hence once we've updated the last_sync_lsn we + * should push the AIL to ensure the push target (and hence the grant head) is + * no longer bound by the old log head location and can move forwards and make + * progress again. + */ +static void +xlog_state_set_callback( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t header_lsn) +{ + iclog->ic_state = XLOG_STATE_CALLBACK; + + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + header_lsn) <= 0); + + if (list_empty_careful(&iclog->ic_callbacks)) + return; + + atomic64_set(&log->l_last_sync_lsn, header_lsn); + xlog_grant_push_ail(log, 0); +} + +/* + * Return true if we need to stop processing, false to continue to the next + * iclog. The caller will need to run callbacks if the iclog is returned in the + * XLOG_STATE_CALLBACK state. + */ +static bool +xlog_state_iodone_process_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + struct xlog_in_core *completed_iclog, + bool *ioerror) +{ + xfs_lsn_t lowest_lsn; + xfs_lsn_t header_lsn; + + /* Skip all iclogs in the ACTIVE & DIRTY states */ + if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) + return false; + + /* + * Between marking a filesystem SHUTDOWN and stopping the log, we do + * flush all iclogs to disk (if there wasn't a log I/O error). So, we do + * want things to go smoothly in case of just a SHUTDOWN w/o a + * LOG_IO_ERROR. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + *ioerror = true; + return false; + } + + /* + * Can only perform callbacks in order. Since this iclog is not in the + * DONE_SYNC/DO_CALLBACK state, we skip the rest and just try to clean + * up. If we set our iclog to DO_CALLBACK, we will not process it when + * we retry since a previous iclog is in the CALLBACK state, and that + * state cannot change while we are holding the l_icloglock. + */ + if (!(iclog->ic_state & + (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) { + if (completed_iclog && + (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) { + completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK; + } + return true; + } + + /* + * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC + * states. The other states (WANT_SYNC, SYNCING, or CALLBACK) were + * caught by the checks above and are going to be cleaned (i.e. we + * aren't doing their callbacks). + * + * We will do one more check here to see if we have chased our tail + * around. If this is not the lowest lsn iclog, then we will leave it + * for another completion to process. + */ + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) + return false; + + xlog_state_set_callback(log, iclog, header_lsn); + return false; +} + +/* + * Keep processing entries in the iclog callback list until we come around and + * it is empty. We need to atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more callbacks being added. + * + * This function is called with the icloglock held and returns with it held.
We + * drop it while running callbacks, however, as holding it over thousands of + * callbacks is unnecessary and causes excessive contention if we do. + */ +static void +xlog_state_do_iclog_callbacks( + struct xlog *log, + struct xlog_in_core *iclog, + bool aborted) +{ + spin_unlock(&log->l_icloglock); + spin_lock(&iclog->ic_callback_lock); + while (!list_empty(&iclog->ic_callbacks)) { + LIST_HEAD(tmp); + + list_splice_init(&iclog->ic_callbacks, &tmp); + + spin_unlock(&iclog->ic_callback_lock); + xlog_cil_process_committed(&tmp, aborted); + spin_lock(&iclog->ic_callback_lock); + } + + /* + * Pick up the icloglock while still holding the callback lock so we + * serialise against anyone trying to add more callbacks to this iclog + * now we've finished processing. + */ + spin_lock(&log->l_icloglock); + spin_unlock(&iclog->ic_callback_lock); +} + +#ifdef DEBUG +/* + * Make one last gasp attempt to see if iclogs are being left in limbo. If the + * above loop finds an iclog earlier than the current iclog and in one of the + * syncing states, the current iclog is put into DO_CALLBACK and the callbacks + * are deferred to the completion of the earlier iclog. Walk the iclogs in order + * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in + * one of the syncing states. + * + * Note that SYNCING|IOERROR is a valid state so we cannot just check for + * ic_state == SYNCING. + */ +static void +xlog_state_callback_check_state( + struct xlog *log) +{ + struct xlog_in_core *first_iclog = log->l_iclog; + struct xlog_in_core *iclog = first_iclog; + + do { + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); + /* + * Terminate the loop if iclogs are found in states + * which will cause other threads to clean up iclogs. + * + * SYNCING - i/o completion will go through logs + * DONE_SYNC - interrupt thread should be waiting for + * l_icloglock + * IOERROR - give up hope all ye who enter here + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state & XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_DONE_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR ) + break; + iclog = iclog->ic_next; + } while (first_iclog != iclog); +} +#else +#define xlog_state_callback_check_state(l) ((void)0) +#endif + STATIC void xlog_state_do_callback( struct xlog *log, bool aborted, struct xlog_in_core *ciclog) { - xlog_in_core_t *iclog; - xlog_in_core_t *first_iclog; /* used to know when we've - * processed all iclogs once */ - int flushcnt = 0; - xfs_lsn_t lowest_lsn; - int ioerrors; /* counter: iclogs with errors */ - int loopdidcallbacks; /* flag: inner loop did callbacks*/ - int funcdidcallbacks; /* flag: function did callbacks */ - int repeats; /* for issuing console warnings if - * looping too many times */ - int wake = 0; + struct xlog_in_core *iclog; + struct xlog_in_core *first_iclog; + bool did_callbacks = false; + bool cycled_icloglock; + bool ioerror; + int flushcnt = 0; + int repeats = 0; spin_lock(&log->l_icloglock); - first_iclog = iclog = log->l_iclog; - ioerrors = 0; - funcdidcallbacks = 0; - repeats = 0; - do { /* * Scan all iclogs starting with the one pointed to by the @@ -2638,137 +2860,34 @@ xlog_state_do_callback( */ first_iclog = log->l_iclog; iclog = log->l_iclog; - loopdidcallbacks = 0; + cycled_icloglock = false; + ioerror = false; repeats++; do { + if (xlog_state_iodone_process_iclog(log, iclog, + ciclog, &ioerror)) + break; - /* skip all iclogs in the ACTIVE & DIRTY states */ - if (iclog->ic_state & - (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + if 
(!(iclog->ic_state & + (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) { iclog = iclog->ic_next; continue; } /* - * Between marking a filesystem SHUTDOWN and stopping - * the log, we do flush all iclogs to disk (if there - * wasn't a log I/O error). So, we do want things to - * go smoothly in case of just a SHUTDOWN w/o a - * LOG_IO_ERROR. - */ - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { - /* - * Can only perform callbacks in order. Since - * this iclog is not in the DONE_SYNC/ - * DO_CALLBACK state, we skip the rest and - * just try to clean up. If we set our iclog - * to DO_CALLBACK, we will not process it when - * we retry since a previous iclog is in the - * CALLBACK and the state cannot change since - * we are holding the l_icloglock. - */ - if (!(iclog->ic_state & - (XLOG_STATE_DONE_SYNC | - XLOG_STATE_DO_CALLBACK))) { - if (ciclog && (ciclog->ic_state == - XLOG_STATE_DONE_SYNC)) { - ciclog->ic_state = XLOG_STATE_DO_CALLBACK; - } - break; - } - /* - * We now have an iclog that is in either the - * DO_CALLBACK or DONE_SYNC states. The other - * states (WANT_SYNC, SYNCING, or CALLBACK were - * caught by the above if and are going to - * clean (i.e. we aren't doing their callbacks) - * see the above if. - */ - - /* - * We will do one more check here to see if we - * have chased our tail around. - */ - - lowest_lsn = xlog_get_lowest_lsn(log); - if (lowest_lsn && - XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { - iclog = iclog->ic_next; - continue; /* Leave this iclog for - * another thread */ - } - - iclog->ic_state = XLOG_STATE_CALLBACK; - - - /* - * Completion of a iclog IO does not imply that - * a transaction has completed, as transactions - * can be large enough to span many iclogs. We - * cannot change the tail of the log half way - * through a transaction as this may be the only - * transaction in the log and moving th etail to - * point to the middle of it will prevent - * recovery from finding the start of the - * transaction. Hence we should only update the - * last_sync_lsn if this iclog contains - * transaction completion callbacks on it. - * - * We have to do this before we drop the - * icloglock to ensure we are the only one that - * can update it. - */ - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - if (!list_empty_careful(&iclog->ic_callbacks)) - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); - - } else - ioerrors++; - - spin_unlock(&log->l_icloglock); - - /* - * Keep processing entries in the callback list until - * we come around and it is empty. We need to - * atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more - * callbacks being added. - */ - spin_lock(&iclog->ic_callback_lock); - while (!list_empty(&iclog->ic_callbacks)) { - LIST_HEAD(tmp); - - list_splice_init(&iclog->ic_callbacks, &tmp); - - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); - spin_lock(&iclog->ic_callback_lock); - } - - loopdidcallbacks++; - funcdidcallbacks++; - - spin_lock(&log->l_icloglock); - spin_unlock(&iclog->ic_callback_lock); - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) - iclog->ic_state = XLOG_STATE_DIRTY; - - /* - * Transition from DIRTY to ACTIVE if applicable. - * NOP if STATE_IOERROR. + * Running callbacks will drop the icloglock which means + * we'll have to run at least one more complete loop. 
*/ - xlog_state_clean_log(log); - - /* wake up threads waiting in xfs_log_force() */ - wake_up_all(&iclog->ic_force_wait); + cycled_icloglock = true; + xlog_state_do_iclog_callbacks(log, iclog, aborted); + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); + did_callbacks |= cycled_icloglock; + if (repeats > 5000) { flushcnt += repeats; repeats = 0; @@ -2776,50 +2895,15 @@ xlog_state_do_callback( "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerrors && loopdidcallbacks); + } while (!ioerror && cycled_icloglock); -#ifdef DEBUG - /* - * Make one last gasp attempt to see if iclogs are being left in limbo. - * If the above loop finds an iclog earlier than the current iclog and - * in one of the syncing states, the current iclog is put into - * DO_CALLBACK and the callbacks are deferred to the completion of the - * earlier iclog. Walk the iclogs in order and make sure that no iclog - * is in DO_CALLBACK unless an earlier iclog is in one of the syncing - * states. - * - * Note that SYNCING|IOABORT is a valid state so we cannot just check - * for ic_state == SYNCING. - */ - if (funcdidcallbacks) { - first_iclog = iclog = log->l_iclog; - do { - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); - /* - * Terminate the loop if iclogs are found in states - * which will cause other threads to clean up iclogs. - * - * SYNCING - i/o completion will go through logs - * DONE_SYNC - interrupt thread should be waiting for - * l_icloglock - * IOERROR - give up hope all ye who enter here - */ - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state & XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_DONE_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR ) - break; - iclog = iclog->ic_next; - } while (first_iclog != iclog); - } -#endif + if (did_callbacks) + xlog_state_callback_check_state(log); if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) - wake = 1; - spin_unlock(&log->l_icloglock); - - if (wake) wake_up_all(&log->l_flush_wait); + + spin_unlock(&log->l_icloglock); } @@ -3919,7 +4003,9 @@ xfs_log_force_umount( * item committed callback functions will do this again under lock to * avoid races. 
*/ + spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); + spin_unlock(&log->l_cilp->xc_push_lock); xlog_state_do_callback(log, true, NULL); #ifdef XFSERRORDEBUG diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index fa5602d0fd7f..ef652abd112c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -38,7 +38,7 @@ xlog_cil_ticket_alloc( struct xlog_ticket *tic; tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, - KM_SLEEP|KM_NOFS); + KM_NOFS); /* * set the current reservation to zero so we know to steal the basic @@ -186,7 +186,7 @@ xlog_cil_alloc_shadow_bufs( */ kmem_free(lip->li_lv_shadow); - lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); + lv = kmem_alloc_large(buf_size, KM_NOFS); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; @@ -660,7 +660,7 @@ xlog_cil_push( if (!cil) return 0; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); down_write(&cil->xc_ctx_lock); @@ -1179,11 +1179,11 @@ xlog_cil_init( struct xfs_cil *cil; struct xfs_cil_ctx *ctx; - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) return -ENOMEM; - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); if (!ctx) { kmem_free(cil); return -ENOMEM; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13d1d3e95b88..508319039dce 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -97,6 +97,8 @@ xlog_alloc_buffer( struct xlog *log, int nbblks) { + int align_mask = xfs_buftarg_dma_alignment(log->l_targ); + /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. 
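The splice-and-drop construct in xlog_state_do_iclog_callbacks() above is the key to running thousands of completion callbacks without holding the icloglock across them. A minimal standalone sketch of that pattern follows; struct example_cb, cb_func and drain_callbacks() are hypothetical names used purely for illustration, not part of this patch.

#include <linux/list.h>
#include <linux/spinlock.h>

/*
 * Drain a callback list without holding its lock across the callbacks.
 * Entries added while the lock is dropped are picked up on the next pass
 * around the loop; the list is only ever seen empty under the lock.
 */
struct example_cb {
	struct list_head	cb_list;
	void			(*cb_func)(struct example_cb *cb); /* may free cb */
};

static void
drain_callbacks(
	spinlock_t		*lock,
	struct list_head	*callbacks)
{
	spin_lock(lock);
	while (!list_empty(callbacks)) {
		LIST_HEAD(tmp);
		struct example_cb	*cb, *next;

		/* Take the whole pending list in one go... */
		list_splice_init(callbacks, &tmp);
		spin_unlock(lock);

		/* ...and run the potentially slow callbacks unlocked. */
		list_for_each_entry_safe(cb, next, &tmp, cb_list)
			cb->cb_func(cb);

		/* Retake the lock and re-check for new arrivals. */
		spin_lock(lock);
	}
	spin_unlock(lock);
}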
@@ -125,7 +127,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL); } /* @@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1( } } - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); bcp->bc_blkno = buf_f->blf_blkno; bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; @@ -2930,7 +2932,7 @@ xlog_recover_inode_pass2( if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -4161,7 +4163,7 @@ xlog_recover_add_item( { xlog_recover_item_t *item; - item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -4201,7 +4203,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); + ptr = kmem_realloc(old_ptr, len + old_len, 0); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -4261,7 +4263,7 @@ xlog_recover_add_to_trans( return 0; } - ptr = kmem_alloc(len, KM_SLEEP); + ptr = kmem_alloc(len, 0); memcpy(ptr, dp, len); in_f = (struct xfs_inode_log_format *)ptr; @@ -4289,7 +4291,7 @@ xlog_recover_add_to_trans( item->ri_total = in_f->ilf_size; item->ri_buf = kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - KM_SLEEP); + 0); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ @@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans( * This is a new transaction so allocate a new recovery container to * hold the recovery ops that will follow. */ - trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); + trans = kmem_zalloc(sizeof(struct xlog_recover), 0); trans->r_log_tid = tid; trans->r_lsn = be64_to_cpu(rhead->h_lsn); INIT_LIST_HEAD(&trans->r_itemq); @@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink( } /* - * xlog_iunlink_recover + * Recover AGI unlinked lists + * + * This is called during recovery to process any inodes which we unlinked but + * not freed when the system crashed. These inodes will be on the lists in the + * AGI blocks. What we do here is scan all the AGIs and fully truncate and free + * any inodes found on the lists. Each inode is removed from the lists when it + * has been fully truncated and is freed. The freeing of the inode and its + * removal from the list must be atomic. + * + * If everything we touch in the AGI processing loop is already in memory, this + * loop can hold the CPU for a long time. It runs without lock contention, + * memory allocation contention, the need to wait for IO, etc, and so will run + * until we either run out of inodes to process, run low on memory, or run out + * of log space. + * - * This is called during recovery to process any inodes which - * we unlinked but not freed when the system crashed. These - * inodes will be on the lists in the AGI blocks. What we do - * here is scan all the AGIs and fully truncate and free any - * inodes found on the lists.
Each inode is removed from the - * lists when it has been fully truncated and is freed. The - * freeing of the inode and its removal from the list must be - * atomic. + * This behaviour is bad for latency on single CPU and non-preemptible kernels, + * and can prevent other filesystem work (such as CIL pushes) from running. This + * can lead to deadlocks if the recovery process runs out of log reservation + * space. Hence we need to yield the CPU when there is other kernel work + * scheduled on this CPU to ensure other scheduled work can run without undue + * latency. */ STATIC void xlog_recover_process_iunlinks( @@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks( while (agino != NULLAGINO) { agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); + cond_resched(); } } xfs_buf_rele(agibp); @@ -5527,7 +5541,7 @@ xlog_do_log_recovery( */ log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * sizeof(struct list_head), - KM_SLEEP); + 0); for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 322da6909290..ba5b6f3b2b88 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -82,7 +82,7 @@ xfs_uuid_mount( if (hole < 0) { xfs_uuid_table = kmem_realloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - KM_SLEEP); + 0); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; @@ -214,7 +214,7 @@ xfs_initialize_perag( spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - BUG(); + WARN_ON_ONCE(1); spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4adb6837439a..fdb60e09a9c5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -327,13 +327,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* per-AG block reservation data structures*/ -enum xfs_ag_resv_type { - XFS_AG_RESV_NONE = 0, - XFS_AG_RESV_AGFL, - XFS_AG_RESV_METADATA, - XFS_AG_RESV_RMAPBT, -}; - struct xfs_ag_resv { /* number of blocks originally reserved here */ xfs_extlen_t ar_orig_reserved; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 74738813f60d..a06661dac5be 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -333,12 +333,12 @@ xfs_mru_cache_create( if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) return -EINVAL; - if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + if (!(mru = kmem_zalloc(sizeof(*mru), 0))) return -ENOMEM; /* An extra list is needed to avoid reaping up to a grp_time early.
*/ mru->grp_count = grp_count + 1; - mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0); if (!mru->lists) { err = -ENOMEM; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5e7a37f0cf84..ecd8ce152ab1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -642,7 +642,7 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0); error = list_lru_init(&qinf->qi_lru); if (error) @@ -978,7 +978,7 @@ xfs_qm_reset_dqcounts_buf( if (qip->i_d.di_nblocks == 0) return 0; - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d8288aa0670a..2328268e6245 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -144,9 +144,9 @@ xfs_cui_init( ASSERT(nextents > 0); if (nextents > XFS_CUI_MAX_FAST_EXTENTS) cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), - KM_SLEEP); + 0); else - cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + cuip = kmem_zone_zalloc(xfs_cui_zone, 0); xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; @@ -223,7 +223,7 @@ xfs_trans_get_cud( { struct xfs_cud_log_item *cudp; - cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + cudp = kmem_zone_zalloc(xfs_cud_zone, 0); xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); cudp->cud_cuip = cuip; @@ -555,26 +555,24 @@ xfs_cui_recover( irec.br_blockcount = new_len; switch (type) { case XFS_REFCOUNT_INCREASE: - error = xfs_refcount_increase_extent(tp, &irec); + xfs_refcount_increase_extent(tp, &irec); break; case XFS_REFCOUNT_DECREASE: - error = xfs_refcount_decrease_extent(tp, &irec); + xfs_refcount_decrease_extent(tp, &irec); break; case XFS_REFCOUNT_ALLOC_COW: - error = xfs_refcount_alloc_cow_extent(tp, + xfs_refcount_alloc_cow_extent(tp, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: - error = xfs_refcount_free_cow_extent(tp, + xfs_refcount_free_cow_extent(tp, irec.br_startblock, irec.br_blockcount); break; default: ASSERT(0); } - if (error) - goto abort_error; requeue_only = true; } } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index edbe37b7f636..0f08153b4994 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -495,10 +495,8 @@ xfs_reflink_cancel_cow_blocks( ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(*tpp, - del.br_startblock, del.br_blockcount); - if (error) - break; + xfs_refcount_free_cow_extent(*tpp, del.br_startblock, + del.br_blockcount); xfs_bmap_add_free(*tpp, del.br_startblock, del.br_blockcount, NULL); @@ -675,15 +673,10 @@ xfs_reflink_end_cow_extent( trace_xfs_reflink_cow_remap(ip, &del); /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(tp, del.br_startblock, - del.br_blockcount); - if (error) - goto out_cancel; + xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &del); - if (error) - goto out_cancel; + xfs_bmap_map_extent(tp, ip, &del); /* Charge this new data fork mapping to the on-disk quota. 
*/ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, @@ -1070,14 +1063,10 @@ xfs_reflink_remap_extent( uirec.br_blockcount, uirec.br_startblock); /* Update the refcount tree */ - error = xfs_refcount_increase_extent(tp, &uirec); - if (error) - goto out_cancel; + xfs_refcount_increase_extent(tp, &uirec); /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &uirec); - if (error) - goto out_cancel; + xfs_bmap_map_extent(tp, ip, &uirec); /* Update quota accounting. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 77ed557b6127..8939e0ea09cd 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -142,9 +142,9 @@ xfs_rui_init( ASSERT(nextents > 0); if (nextents > XFS_RUI_MAX_FAST_EXTENTS) - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); else - ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); + ruip = kmem_zone_zalloc(xfs_rui_zone, 0); xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -244,7 +244,7 @@ xfs_trans_get_rud( { struct xfs_rud_log_item *rudp; - rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); + rudp = kmem_zone_zalloc(xfs_rud_zone, 0); xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); rudp->rud_ruip = ruip; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5fa4db3c3e32..4a48a8c75b4f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -865,7 +865,7 @@ xfs_alloc_rsum_cache( * lower bound on the minimum level with any free extents. We can * continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); if (!mp->m_rsum_cache) xfs_warn(mp, "could not allocate realtime summary cache"); } @@ -963,7 +963,7 @@ xfs_growfs_rt( /* * Allocate a new (fake) mount/sb. */ - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + nmp = kmem_alloc(sizeof(*nmp), 0); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. 
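All of the KM_SLEEP removals in these hunks rely on the same convention: with KM_SLEEP and KM_NOSLEEP gone, a flags value of 0 means what KM_SLEEP used to, i.e. block and retry until the allocation succeeds. A hedged sketch of the resulting call-site idiom is below; struct xfs_example_item and xfs_example_item_alloc() are made-up names for illustration only, and the sketch assumes the XFS kmem.h helpers.

#include "kmem.h"

/* Hypothetical structure, not part of this patch. */
struct xfs_example_item {
	struct list_head	ei_list;
	int			ei_value;
};

/*
 * Post-KM_SLEEP calling convention (illustrative):
 *   0          - blocks until success, never returns NULL
 *   KM_MAYFAIL - may return NULL under memory pressure
 *   KM_NOFS    - blocks, but forbids reclaim from recursing into the fs
 */
static struct xfs_example_item *
xfs_example_item_alloc(
	bool			can_fail)
{
	xfs_km_flags_t		flags = can_fail ? KM_MAYFAIL : 0;

	/* Callers passing 0 do not need a NULL check on the result. */
	return kmem_zalloc(sizeof(struct xfs_example_item), flags);
}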
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f9450235533c..391b4748cae3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -818,7 +818,8 @@ xfs_init_mount_workqueues( goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, + 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8094b1920eef..eaae275ed430 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -23,6 +23,7 @@ struct xlog; struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; +struct xlog_rec_header; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; @@ -30,6 +31,10 @@ struct xfs_btree_cur; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_rmap_irec; +struct xfs_icreate_log; +struct xfs_owner_info; +struct xfs_trans_res; +struct xfs_inobt_rec_incore; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -3575,6 +3580,35 @@ TRACE_EVENT(xfs_pwork_init, __entry->nr_threads, __entry->pid) ) +DECLARE_EVENT_CLASS(xfs_kmem_class, + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), + TP_ARGS(size, flags, caller_ip), + TP_STRUCT__entry( + __field(ssize_t, size) + __field(int, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->size = size; + __entry->flags = flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("size %zd flags 0x%x caller %pS", + __entry->size, + __entry->flags, + (char *)__entry->caller_ip) +) + +#define DEFINE_KMEM_EVENT(name) \ +DEFINE_EVENT(xfs_kmem_class, name, \ + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ + TP_ARGS(size, flags, caller_ip)) +DEFINE_KMEM_EVENT(kmem_alloc); +DEFINE_KMEM_EVENT(kmem_alloc_io); +DEFINE_KMEM_EVENT(kmem_alloc_large); +DEFINE_KMEM_EVENT(kmem_realloc); +DEFINE_KMEM_EVENT(kmem_zone_alloc); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index d42a68d8313b..f4795fdb7389 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -90,7 +90,7 @@ xfs_trans_dup( trace_xfs_trans_dup(tp, _RET_IP_); - ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + ntp = kmem_zone_zalloc(xfs_trans_zone, 0); /* * Initialize the new transaction structure. @@ -263,7 +263,7 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 
*/ - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + tp = kmem_zone_zalloc(xfs_trans_zone, 0); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 1027c9ca6eb8..16457465833b 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -863,7 +863,7 @@ STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) { - tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0); } void diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 3123b5aaad2a..cb895b1df5e4 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -30,7 +30,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, value = NULL; } - error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); if (error) return error; return asize; diff --git a/include/linux/fs.h b/include/linux/fs.h index ae6648145d18..ffe35d97afcb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3543,6 +3543,8 @@ extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, + int advice); #if defined(CONFIG_IO_URING) extern struct sock *io_uring_get_socket(struct file *file); diff --git a/mm/fadvise.c b/mm/fadvise.c index 467bcd032037..4f17c83db575 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -27,8 +27,7 @@ * deactivate the pages and clear PG_Referenced. */ -static int generic_fadvise(struct file *file, loff_t offset, loff_t len, - int advice) +int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { struct inode *inode; struct address_space *mapping; @@ -178,6 +177,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len, } return 0; } +EXPORT_SYMBOL(generic_fadvise); int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { diff --git a/mm/madvise.c b/mm/madvise.c index 968df3aa069f..bac973b9f2cc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -14,6 +14,7 @@ #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> +#include <linux/fadvise.h> #include <linux/sched.h> #include <linux/ksm.h> #include <linux/fs.h> @@ -275,6 +276,7 @@ static long madvise_willneed(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; + loff_t offset; *prev = vma; #ifdef CONFIG_SWAP @@ -298,12 +300,20 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - if (end > vma->vm_end) - end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - - force_page_cache_readahead(file->f_mapping, file, start, end - start); + /* + * Filesystem's fadvise may need to take various locks. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + get_file(file); + up_read(&current->mm->mmap_sem); + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); + fput(file); + down_read(&current->mm->mmap_sem); return 0; }
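generic_fadvise() is exported above precisely so that a filesystem ->fadvise() method can layer extra behaviour on top of the generic page cache advice instead of reimplementing it. The sketch below shows what such a wrapper could look like; example_file_fadvise() and example_fops are hypothetical, only generic_fadvise() and the file_operations ->fadvise() hook are real kernel interfaces.

#include <linux/fs.h>

/* Hypothetical ->fadvise() instance layered on the generic helper. */
static int
example_file_fadvise(
	struct file		*file,
	loff_t			offset,
	loff_t			len,
	int			advice)
{
	int			ret;

	/* Let the generic code handle the page cache side first. */
	ret = generic_fadvise(file, offset, len, advice);
	if (ret)
		return ret;

	/*
	 * A filesystem-specific hook would go here, e.g. kicking off its
	 * own readahead in response to POSIX_FADV_WILLNEED.
	 */
	return 0;
}

static const struct file_operations example_fops = {
	.fadvise	= example_file_fadvise,
};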