diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.c | 52 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 4 | ||||
-rw-r--r-- | fs/btrfs/super.c | 4 | ||||
-rw-r--r-- | fs/btrfs/transaction.h | 4 | ||||
-rw-r--r-- | fs/erofs/decompressor_lzma.c | 5 | ||||
-rw-r--r-- | fs/erofs/super.c | 2 | ||||
-rw-r--r-- | fs/nfs/nfs4proc.c | 4 | ||||
-rw-r--r-- | fs/nfs/nfs4state.c | 45 | ||||
-rw-r--r-- | fs/nfs/write.c | 2 | ||||
-rw-r--r-- | fs/overlayfs/export.c | 2 | ||||
-rw-r--r-- | fs/overlayfs/file.c | 2 | ||||
-rw-r--r-- | fs/overlayfs/ovl_entry.h | 10 | ||||
-rw-r--r-- | fs/overlayfs/params.c | 17 | ||||
-rw-r--r-- | fs/overlayfs/super.c | 27 | ||||
-rw-r--r-- | fs/smb/client/connect.c | 26 | ||||
-rw-r--r-- | fs/xfs/xfs_discard.c | 266 | ||||
-rw-r--r-- | fs/xfs/xfs_discard.h | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_extent_busy.c | 34 | ||||
-rw-r--r-- | fs/xfs/xfs_extent_busy.h | 24 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 93 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 5 |
21 files changed, 441 insertions, 193 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a4cb4b642987..da519c1b6ad0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -682,18 +682,30 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (test_bit(BTRFS_ROOT_DELETING, &root->state)) - btrfs_err(fs_info, - "COW'ing blocks on a fs root that's being dropped"); - - if (trans->transaction != fs_info->running_transaction) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, - fs_info->running_transaction->transid); + if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, + "attempt to COW block %llu on root %llu that is being deleted", + buf->start, btrfs_root_id(root)); + return -EUCLEAN; + } - if (trans->transid != fs_info->generation) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", + buf->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -805,8 +817,22 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, int progress_passed = 0; struct btrfs_disk_key disk_key; - WARN_ON(trans->transaction != fs_info->running_transaction); - WARN_ON(trans->transid != fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", + parent->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } parent_nritems = btrfs_header_nritems(parent); blocksize = fs_info->nodesize; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 75ab766fe156..8e7d03bc1b56 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2978,7 +2978,7 @@ static void get_block_group_info(struct list_head *groups_list, static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; @@ -4338,7 +4338,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_send_args_32 args32; + struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1a093ec0f7e3..f49e597e197f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -954,6 +954,10 @@ static int btrfs_parse_subvol_options(const char *options, char **subvol_name, *subvol_objectid = subvolid; break; + case Opt_err: + btrfs_err(NULL, "unrecognized mount option '%s'", p); + error = -EINVAL; + goto out; default: break; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6b309f8a99a8..93869cda6af9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -219,8 +219,8 @@ do { \ (errno))) { \ /* Stack trace printed. */ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ (errno)); \ } \ } \ diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 73091fbe3ea4..dee10d22ada9 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -217,9 +217,12 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; - if (!rq->out[no] && rq->fillgaps) /* deduped */ + if (!rq->out[no] && rq->fillgaps) { /* deduped */ rq->out[no] = erofs_allocpage(pagepool, GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 44a24d573f1f..3700af9ee173 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -235,7 +235,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(ptr); dis = ptr + erofs_blkoff(sb, *pos); - if (!dif->path) { + if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3508d8238826..7016eaadf555 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10622,7 +10622,9 @@ static void nfs4_disable_swap(struct inode *inode) */ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e079987af4a3..9a5d911a7edc 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; - if (clp->cl_rpcclient->cl_shutdown) + if (clnt->cl_shutdown) return; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { - wake_up_var(&clp->cl_state); - return; + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } } - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) __func__, PTR_ERR(task)); if (!nfs_client_init_is_complete(clp)) nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2703,6 +2714,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); + if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, + &clp->cl_state)) { + memflags = memalloc_nofs_save(); + continue; + } + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); @@ -2741,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); again: - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); - if (atomic_read(&cl->cl_swapper)) { + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { wait_var_event_interruptible(&clp->cl_state, test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)); - if (atomic_read(&cl->cl_swapper) && - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); } - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); if (refcount_read(&clp->cl_count) > 1 && !signalled() && test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; nfs_put_client(clp); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8c1ee1a1a28f..7720b5e43014 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -802,8 +802,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { - nfs_release_request(req); atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests); + nfs_release_request(req); } } diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index c8c8588bd98c..26b782c53910 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -188,7 +188,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) /* Lower file handle for non-upper non-decodable */ if (!ovl_dentry_upper(dentry) && !decodable) - return 0; + return 1; /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 693971d20280..8be4dc050d1e 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -341,7 +341,6 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - real.flags = 0; aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_complete = ovl_aio_rw_complete; @@ -413,7 +412,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - real.flags = 0; aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index e9539f98e86a..d82d2a043da2 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -8,6 +8,7 @@ struct ovl_config { char *upperdir; char *workdir; + char **lowerdirs; bool default_permissions; int redirect_mode; int verity_mode; @@ -39,17 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; - char *name; }; -/* - * ovl_free_fs() relies on @mnt being the first member when unmounting - * the private mounts created for each layer. Let's check both the - * offset and type. - */ -static_assert(offsetof(struct ovl_layer, mnt) == 0); -static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *)); - struct ovl_path { const struct ovl_layer *layer; struct dentry *dentry; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index b9355bb6d75a..95b751507ac8 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -752,12 +752,12 @@ void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); - /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ - mounts = (struct vfsmount **) ofs->layers; + /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->config.lowerdirs; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); + kfree(ofs->config.lowerdirs[i]); mounts[i] = ofs->layers[i].mnt; - kfree(ofs->layers[i].name); } kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); @@ -765,6 +765,7 @@ void ovl_free_fs(struct ovl_fs *ofs) free_anon_bdev(ofs->fs[i].pseudo_dev); kfree(ofs->fs); + kfree(ofs->config.lowerdirs); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); if (ofs->creator_cred) @@ -949,16 +950,16 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct super_block *sb = dentry->d_sb; struct ovl_fs *ofs = OVL_FS(sb); size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer; - const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower]; + char **lowerdatadirs = &ofs->config.lowerdirs[nr_merged_lower]; - /* ofs->layers[0] is the upper layer */ - seq_printf(m, ",lowerdir=%s", ofs->layers[1].name); + /* lowerdirs[] starts from offset 1 */ + seq_printf(m, ",lowerdir=%s", ofs->config.lowerdirs[1]); /* dump regular lower layers */ for (nr = 2; nr < nr_merged_lower; nr++) - seq_printf(m, ":%s", ofs->layers[nr].name); + seq_printf(m, ":%s", ofs->config.lowerdirs[nr]); /* dump data lower layers */ for (nr = 0; nr < ofs->numdatalayer; nr++) - seq_printf(m, "::%s", data_layers[nr].name); + seq_printf(m, "::%s", lowerdatadirs[nr]); if (ofs->config.upperdir) { seq_show_option(m, "upperdir", ofs->config.upperdir); seq_show_option(m, "workdir", ofs->config.workdir); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index def266b5e2a3..3fa2416264a4 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -104,8 +104,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { - struct ovl_entry *oe = OVL_E(dentry); - struct ovl_path *lowerstack = ovl_lowerstack(oe); + struct ovl_entry *oe; + struct ovl_path *lowerstack; struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; @@ -115,6 +115,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, if (!inode) return -ECHILD; + oe = OVL_I_E(inode); + lowerstack = ovl_lowerstack(oe); upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); @@ -167,6 +169,7 @@ static void ovl_free_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); kfree(oi->redirect); + kfree(oi->oe); mutex_destroy(&oi->lock); kmem_cache_free(ovl_inode_cachep, oi); } @@ -176,7 +179,7 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); - ovl_free_entry(oi->oe); + ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe)); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else @@ -569,11 +572,6 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_layer->idx = 0; upper_layer->fsid = 0; - err = -ENOMEM; - upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL); - if (!upper_layer->name) - goto out; - /* * Inherit SB_NOSEC flag from upperdir. * @@ -1122,7 +1120,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, layers[ofs->numlayer].idx = ofs->numlayer; layers[ofs->numlayer].fsid = fsid; layers[ofs->numlayer].fs = &ofs->fs[fsid]; - layers[ofs->numlayer].name = l->name; + /* Store for printing lowerdir=... in ovl_show_options() */ + ofs->config.lowerdirs[ofs->numlayer] = l->name; l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; @@ -1367,8 +1366,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (!layers) goto out_err; + ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL); + if (!ofs->config.lowerdirs) { + kfree(layers); + goto out_err; + } ofs->layers = layers; - /* Layer 0 is reserved for upper even if there's no upper */ + /* + * Layer 0 is reserved for upper even if there's no upper. + * For consistency, config.lowerdirs[0] is NULL. + */ ofs->numlayer = 1; sb->s_stack_depth = 0; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 3902e90dca6b..7b923e36501b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2474,8 +2474,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) static struct cifs_tcon * cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) { - int rc, xid; struct cifs_tcon *tcon; + bool nohandlecache; + int rc, xid; tcon = cifs_find_tcon(ses, ctx); if (tcon) { @@ -2493,14 +2494,17 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } - if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) - tcon = tcon_info_alloc(true); + if (ses->server->dialect >= SMB20_PROT_ID && + (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)) + nohandlecache = ctx->nohandlecache; else - tcon = tcon_info_alloc(false); + nohandlecache = true; + tcon = tcon_info_alloc(!nohandlecache); if (tcon == NULL) { rc = -ENOMEM; goto out_fail; } + tcon->nohandlecache = nohandlecache; if (ctx->snapshot_time) { if (ses->server->vals->protocol_id == 0) { @@ -2662,10 +2666,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->nocase = ctx->nocase; tcon->broken_sparse_sup = ctx->no_sparse; tcon->max_cached_dirs = ctx->max_cached_dirs; - if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) - tcon->nohandlecache = ctx->nohandlecache; - else - tcon->nohandlecache = true; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -2895,9 +2895,9 @@ bind_socket(struct TCP_Server_Info *server) if (server->srcaddr.ss_family != AF_UNSPEC) { /* Bind to the specified local IP address */ struct socket *socket = server->ssocket; - rc = socket->ops->bind(socket, - (struct sockaddr *) &server->srcaddr, - sizeof(server->srcaddr)); + rc = kernel_bind(socket, + (struct sockaddr *) &server->srcaddr, + sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; struct sockaddr_in6 *saddr6; @@ -3046,8 +3046,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - rc = socket->ops->connect(socket, saddr, slen, - server->noblockcnt ? O_NONBLOCK : 0); + rc = kernel_connect(socket, saddr, slen, + server->noblockcnt ? O_NONBLOCK : 0); /* * When mounting SMB root file systems, we do not want to block in * connect. Otherwise bail out and then let cifs_reconnect() perform diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index afc4c78b9eed..d5787991bb5b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2010, 2023 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" @@ -19,21 +19,147 @@ #include "xfs_log.h" #include "xfs_ag.h" -STATIC int -xfs_trim_extents( +/* + * Notes on an efficient, low latency fstrim algorithm + * + * We need to walk the filesystem free space and issue discards on the free + * space that meet the search criteria (size and location). We cannot issue + * discards on extents that might be in use, or are so recently in use they are + * still marked as busy. To serialise against extent state changes whilst we are + * gathering extents to trim, we must hold the AGF lock to lock out other + * allocations and extent free operations that might change extent state. + * + * However, we cannot just hold the AGF for the entire AG free space walk whilst + * we issue discards on each free space that is found. Storage devices can have + * extremely slow discard implementations (e.g. ceph RBD) and so walking a + * couple of million free extents and issuing synchronous discards on each + * extent can take a *long* time. Whilst we are doing this walk, nothing else + * can access the AGF, and we can stall transactions and hence the log whilst + * modifications wait for the AGF lock to be released. This can lead hung tasks + * kicking the hung task timer and rebooting the system. This is bad. + * + * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI + * lock, gathers a range of inode cluster buffers that are allocated, drops the + * AGI lock and then reads all the inode cluster buffers and processes them. It + * loops doing this, using a cursor to keep track of where it is up to in the AG + * for each iteration to restart the INOBT lookup from. + * + * We can't do this exactly with free space - once we drop the AGF lock, the + * state of the free extent is out of our control and we cannot run a discard + * safely on it in this situation. Unless, of course, we've marked the free + * extent as busy and undergoing a discard operation whilst we held the AGF + * locked. + * + * This is exactly how online discard works - free extents are marked busy when + * they are freed, and once the extent free has been committed to the journal, + * the busy extent record is marked as "undergoing discard" and the discard is + * then issued on the free extent. Once the discard completes, the busy extent + * record is removed and the extent is able to be allocated again. + * + * In the context of fstrim, if we find a free extent we need to discard, we + * don't have to discard it immediately. All we need to do it record that free + * extent as being busy and under discard, and all the allocation routines will + * now avoid trying to allocate it. Hence if we mark the extent as busy under + * the AGF lock, we can safely discard it without holding the AGF lock because + * nothing will attempt to allocate that free space until the discard completes. + * + * This also allows us to issue discards asynchronously like we do with online + * discard, and so for fast devices fstrim will run much faster as we can have + * multiple discard operations in flight at once, as well as pipeline the free + * extent search so that it overlaps in flight discard IO. + */ + +struct workqueue_struct *xfs_discard_wq; + +static void +xfs_discard_endio_work( + struct work_struct *work) +{ + struct xfs_busy_extents *extents = + container_of(work, struct xfs_busy_extents, endio_work); + + xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); + kmem_free(extents->owner); +} + +/* + * Queue up the actual completion to a thread to avoid IRQ-safe locking for + * pagb_lock. + */ +static void +xfs_discard_endio( + struct bio *bio) +{ + struct xfs_busy_extents *extents = bio->bi_private; + + INIT_WORK(&extents->endio_work, xfs_discard_endio_work); + queue_work(xfs_discard_wq, &extents->endio_work); + bio_put(bio); +} + +/* + * Walk the discard list and issue discards on all the busy extents in the + * list. We plug and chain the bios so that we only need a single completion + * call to clear all the busy extents once the discards are complete. + */ +int +xfs_discard_extents( + struct xfs_mount *mp, + struct xfs_busy_extents *extents) +{ + struct xfs_extent_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + int error = 0; + + blk_start_plug(&plug); + list_for_each_entry(busyp, &extents->extent_list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, &bio); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llx,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + break; + } + } + + if (bio) { + bio->bi_private = extents; + bio->bi_end_io = xfs_discard_endio; + submit_bio(bio); + } else { + xfs_discard_endio_work(&extents->endio_work); + } + blk_finish_plug(&plug); + + return error; +} + + +static int +xfs_trim_gather_extents( struct xfs_perag *pag, xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, + struct xfs_alloc_rec_incore *tcur, + struct xfs_busy_extents *extents, uint64_t *blocks_trimmed) { struct xfs_mount *mp = pag->pag_mount; - struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - struct xfs_agf *agf; int error; int i; + int batch = 100; /* * Force out the log. This means any transactions that might have freed @@ -45,20 +171,28 @@ xfs_trim_extents( error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) return error; - agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); /* - * Look up the longest btree in the AGF and start with it. + * Look up the extent length requested in the AGF and start with it. */ - error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); + if (tcur->ar_startblock == NULLAGBLOCK) + error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); + else + error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, + tcur->ar_blockcount, &i); if (error) goto out_del_cursor; + if (i == 0) { + /* nothing of that length left in the AG, we are done */ + tcur->ar_blockcount = 0; + goto out_del_cursor; + } /* * Loop until we are done with all extents that are large - * enough to be worth discarding. + * enough to be worth discarding or we hit batch limits. */ while (i) { xfs_agblock_t fbno; @@ -73,7 +207,16 @@ xfs_trim_extents( error = -EFSCORRUPTED; break; } - ASSERT(flen <= be32_to_cpu(agf->agf_longest)); + + if (--batch <= 0) { + /* + * Update the cursor to point at this extent so we + * restart the next batch from this extent. + */ + tcur->ar_startblock = fbno; + tcur->ar_blockcount = flen; + break; + } /* * use daddr format for all range/len calculations as that is @@ -88,6 +231,7 @@ xfs_trim_extents( */ if (dlen < minlen) { trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + tcur->ar_blockcount = 0; break; } @@ -110,29 +254,103 @@ xfs_trim_extents( goto next_extent; } - trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); - if (error) - break; + xfs_extent_busy_insert_discard(pag, fbno, flen, + &extents->extent_list); *blocks_trimmed += flen; - next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) break; - if (fatal_signal_pending(current)) { - error = -ERESTARTSYS; - break; - } + /* + * If there's no more records in the tree, we are done. Set the + * cursor block count to 0 to indicate to the caller that there + * is no more extents to search. + */ + if (i == 0) + tcur->ar_blockcount = 0; } + /* + * If there was an error, release all the gathered busy extents because + * we aren't going to issue a discard on them any more. + */ + if (error) + xfs_extent_busy_clear(mp, &extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); xfs_buf_relse(agbp); return error; } +static bool +xfs_trim_should_stop(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + +/* + * Iterate the free list gathering extents and discarding them. We need a cursor + * for the repeated iteration of gather/discard loop, so use the longest extent + * we found in the last batch as the key to start the next. + */ +static int +xfs_trim_extents( + struct xfs_perag *pag, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + uint64_t *blocks_trimmed) +{ + struct xfs_alloc_rec_incore tcur = { + .ar_blockcount = pag->pagf_longest, + .ar_startblock = NULLAGBLOCK, + }; + int error = 0; + + do { + struct xfs_busy_extents *extents; + + extents = kzalloc(sizeof(*extents), GFP_KERNEL); + if (!extents) { + error = -ENOMEM; + break; + } + + extents->mount = pag->pag_mount; + extents->owner = extents; + INIT_LIST_HEAD(&extents->extent_list); + + error = xfs_trim_gather_extents(pag, start, end, minlen, + &tcur, extents, blocks_trimmed); + if (error) { + kfree(extents); + break; + } + + /* + * We hand the extent list to the discard function here so the + * discarded extents can be removed from the busy extent list. + * This allows the discards to run asynchronously with gathering + * the next round of extents to discard. + * + * However, we must ensure that we do not reference the extent + * list after this function call, as it may have been freed by + * the time control returns to us. + */ + error = xfs_discard_extents(pag->pag_mount, extents); + if (error) + break; + + if (xfs_trim_should_stop()) + break; + + } while (tcur.ar_blockcount != 0); + + return error; + +} + /* * trim a range of the filesystem. * @@ -195,12 +413,12 @@ xfs_ioc_trim( for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { error = xfs_trim_extents(pag, start, end, minlen, &blocks_trimmed); - if (error) { + if (error) last_error = error; - if (error == -ERESTARTSYS) { - xfs_perag_rele(pag); - break; - } + + if (xfs_trim_should_stop()) { + xfs_perag_rele(pag); + break; } } diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index de92d9cc958f..2b1a85223a56 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -3,8 +3,10 @@ #define XFS_DISCARD_H 1 struct fstrim_range; -struct list_head; +struct xfs_mount; +struct xfs_busy_extents; -extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy); +int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 7c2fdc71e42d..746814815b1d 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -19,13 +19,13 @@ #include "xfs_log.h" #include "xfs_ag.h" -void -xfs_extent_busy_insert( - struct xfs_trans *tp, +static void +xfs_extent_busy_insert_list( struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, - unsigned int flags) + unsigned int flags, + struct list_head *busy_list) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; @@ -40,7 +40,7 @@ xfs_extent_busy_insert( new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len); + trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; @@ -62,10 +62,32 @@ xfs_extent_busy_insert( rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); - list_add(&new->list, &tp->t_busy); + list_add(&new->list, busy_list); spin_unlock(&pag->pagb_lock); } +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy); +} + +void +xfs_extent_busy_insert_discard( + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + struct list_head *busy_list) +{ + xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED, + busy_list); +} + /* * Search for a busy extent within the range of the extent we are about to * allocate. You need to be holding the busy extent tree lock when calling diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index c37bf87e6781..0639aab336f3 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -16,9 +16,6 @@ struct xfs_alloc_arg; /* * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_extent_busy_insert() for details. */ struct xfs_extent_busy { struct rb_node rb_node; /* ag by-bno indexed search tree */ @@ -31,11 +28,32 @@ struct xfs_extent_busy { #define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; +/* + * List used to track groups of related busy extents all the way through + * to discard completion. + */ +struct xfs_busy_extents { + struct xfs_mount *mount; + struct list_head extent_list; + struct work_struct endio_work; + + /* + * Owner is the object containing the struct xfs_busy_extents to free + * once the busy extents have been processed. If only the + * xfs_busy_extents object needs freeing, then point this at itself. + */ + void *owner; +}; + void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void +xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno, + xfs_extlen_t len, struct list_head *busy_list); + +void xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, bool do_discard); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ebc70aaa299c..67a99d94701e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -16,8 +16,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_trace.h" - -struct workqueue_struct *xfs_discard_wq; +#include "xfs_discard.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); @@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate( if (!list_empty(&cilpcp->busy_extents)) { list_splice_init(&cilpcp->busy_extents, - &ctx->busy_extents); + &ctx->busy_extents.extent_list); } if (!list_empty(&cilpcp->log_items)) list_splice_init(&cilpcp->log_items, &ctx->log_items); @@ -708,76 +707,6 @@ xlog_cil_free_logvec( } } -static void -xlog_discard_endio_work( - struct work_struct *work) -{ - struct xfs_cil_ctx *ctx = - container_of(work, struct xfs_cil_ctx, discard_endio_work); - struct xfs_mount *mp = ctx->cil->xc_log->l_mp; - - xfs_extent_busy_clear(mp, &ctx->busy_extents, false); - kmem_free(ctx); -} - -/* - * Queue up the actual completion to a thread to avoid IRQ-safe locking for - * pagb_lock. Note that we need a unbounded workqueue, otherwise we might - * get the execution delayed up to 30 seconds for weird reasons. - */ -static void -xlog_discard_endio( - struct bio *bio) -{ - struct xfs_cil_ctx *ctx = bio->bi_private; - - INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); - queue_work(xfs_discard_wq, &ctx->discard_endio_work); - bio_put(bio); -} - -static void -xlog_discard_busy_extents( - struct xfs_mount *mp, - struct xfs_cil_ctx *ctx) -{ - struct list_head *list = &ctx->busy_extents; - struct xfs_extent_busy *busyp; - struct bio *bio = NULL; - struct blk_plug plug; - int error = 0; - - ASSERT(xfs_has_discard(mp)); - - blk_start_plug(&plug); - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, &bio); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - break; - } - } - - if (bio) { - bio->bi_private = ctx; - bio->bi_end_io = xlog_discard_endio; - submit_bio(bio); - } else { - xlog_discard_endio_work(&ctx->discard_endio_work); - } - blk_finish_plug(&plug); -} - /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -807,8 +736,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); - xfs_extent_busy_sort(&ctx->busy_extents); - xfs_extent_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents.extent_list); + xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); @@ -817,10 +746,14 @@ xlog_cil_committed( xlog_cil_free_logvec(&ctx->lv_chain); - if (!list_empty(&ctx->busy_extents)) - xlog_discard_busy_extents(mp, ctx); - else - kmem_free(ctx); + if (!list_empty(&ctx->busy_extents.extent_list)) { + ctx->busy_extents.mount = mp; + ctx->busy_extents.owner = ctx; + xfs_discard_extents(mp, &ctx->busy_extents); + return; + } + + kmem_free(ctx); } void diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index af87648331d5..fa3ad1d7b31c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -6,6 +6,8 @@ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ +#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */ + struct xfs_buf; struct xlog; struct xlog_ticket; @@ -223,12 +225,11 @@ struct xfs_cil_ctx { struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ atomic_t space_used; /* aggregate size of regions */ - struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_busy_extents busy_extents; struct list_head log_items; /* log items in chkpt */ struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ - struct work_struct discard_endio_work; struct work_struct push_work; atomic_t order_id; |