From e1e71c168813564be0f6ea3d6740a059ca42d177 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 4 Aug 2021 13:22:58 +0200 Subject: fuse: fix use after free in fuse_read_interrupt() There is a potential race between fuse_read_interrupt() and fuse_request_end(). TASK1 in fuse_read_interrupt(): delete req->intr_entry (while holding fiq->lock) TASK2 in fuse_request_end(): req->intr_entry is empty -> skip fiq->lock wake up TASK3 TASK3 request is freed TASK1 in fuse_read_interrupt(): dereference req->in.h.unique ***BAM*** Fix by always grabbing fiq->lock if the request was ever interrupted (FR_INTERRUPTED set) thereby serializing with concurrent fuse_read_interrupt() calls. FR_INTERRUPTED is set before the request is queued on fiq->interrupts. Dequeing the request is done with list_del_init() but FR_INTERRUPTED is not cleared in this case. Reported-by: lijiazi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1c8f79b3dd06..dde341a6388a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -288,10 +288,10 @@ void fuse_request_end(struct fuse_req *req) /* * test_and_set_bit() implies smp_mb() between bit - * changing and below intr_entry check. Pairs with + * changing and below FR_INTERRUPTED check. Pairs with * smp_mb() from queue_interrupt(). */ - if (!list_empty(&req->intr_entry)) { + if (test_bit(FR_INTERRUPTED, &req->flags)) { spin_lock(&fiq->lock); list_del_init(&req->intr_entry); spin_unlock(&fiq->lock); -- cgit v1.2.3-58-ga151 From 84c215075b5723ab946708a6c74c26bd3c51114c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 4 Aug 2021 13:22:58 +0200 Subject: fuse: name fs_context consistently Naming convention under fs/fuse/: struct fuse_conn *fc; struct fs_context *fsc; Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 10 ++++----- fs/fuse/inode.c | 60 ++++++++++++++++++++++++++--------------------------- fs/fuse/virtio_fs.c | 12 +++++------ 3 files changed, 41 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index cc7e94d73c6c..000d2e5627e9 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -328,7 +328,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) drop_nlink(d_inode(fuse_control_sb->s_root)); } -static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) +static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) { static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; @@ -354,18 +354,18 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) return 0; } -static int fuse_ctl_get_tree(struct fs_context *fc) +static int fuse_ctl_get_tree(struct fs_context *fsc) { - return get_tree_single(fc, fuse_ctl_fill_super); + return get_tree_single(fsc, fuse_ctl_fill_super); } static const struct fs_context_operations fuse_ctl_context_ops = { .get_tree = fuse_ctl_get_tree, }; -static int fuse_ctl_init_fs_context(struct fs_context *fc) +static int fuse_ctl_init_fs_context(struct fs_context *fsc) { - fc->ops = &fuse_ctl_context_ops; + fsc->ops = &fuse_ctl_context_ops; return 0; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b9beb39a4a18..d1b1b17b321c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -138,12 +138,12 @@ static void fuse_evict_inode(struct inode *inode) } } -static int fuse_reconfigure(struct fs_context *fc) +static int fuse_reconfigure(struct fs_context *fsc) { - struct super_block *sb = fc->root->d_sb; + struct super_block *sb = fsc->root->d_sb; sync_filesystem(sb); - if (fc->sb_flags & SB_MANDLOCK) + if (fsc->sb_flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -573,38 +573,38 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { {} }; -static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) +static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; int opt; - if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* * Ignore options coming from mount(MS_REMOUNT) for backward * compatibility. */ - if (fc->oldapi) + if (fsc->oldapi) return 0; - return invalfc(fc, "No changes allowed in reconfigure"); + return invalfc(fsc, "No changes allowed in reconfigure"); } - opt = fs_parse(fc, fuse_fs_parameters, param, &result); + opt = fs_parse(fsc, fuse_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case OPT_SOURCE: - if (fc->source) - return invalfc(fc, "Multiple sources specified"); - fc->source = param->string; + if (fsc->source) + return invalfc(fsc, "Multiple sources specified"); + fsc->source = param->string; param->string = NULL; break; case OPT_SUBTYPE: if (ctx->subtype) - return invalfc(fc, "Multiple subtypes specified"); + return invalfc(fsc, "Multiple subtypes specified"); ctx->subtype = param->string; param->string = NULL; return 0; @@ -616,22 +616,22 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_ROOTMODE: if (!fuse_valid_type(result.uint_32)) - return invalfc(fc, "Invalid rootmode"); + return invalfc(fsc, "Invalid rootmode"); ctx->rootmode = result.uint_32; ctx->rootmode_present = true; break; case OPT_USER_ID: - ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); if (!uid_valid(ctx->user_id)) - return invalfc(fc, "Invalid user_id"); + return invalfc(fsc, "Invalid user_id"); ctx->user_id_present = true; break; case OPT_GROUP_ID: - ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); if (!gid_valid(ctx->group_id)) - return invalfc(fc, "Invalid group_id"); + return invalfc(fsc, "Invalid group_id"); ctx->group_id_present = true; break; @@ -649,7 +649,7 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_BLKSIZE: if (!ctx->is_bdev) - return invalfc(fc, "blksize only supported for fuseblk"); + return invalfc(fsc, "blksize only supported for fuseblk"); ctx->blksize = result.uint_32; break; @@ -660,9 +660,9 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void fuse_free_fc(struct fs_context *fc) +static void fuse_free_fsc(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; if (ctx) { kfree(ctx->subtype); @@ -1566,9 +1566,9 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; } -static int fuse_get_tree(struct fs_context *fc) +static int fuse_get_tree(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; if (!ctx->fd_present || !ctx->rootmode_present || !ctx->user_id_present || !ctx->group_id_present) @@ -1576,14 +1576,14 @@ static int fuse_get_tree(struct fs_context *fc) #ifdef CONFIG_BLOCK if (ctx->is_bdev) - return get_tree_bdev(fc, fuse_fill_super); + return get_tree_bdev(fsc, fuse_fill_super); #endif - return get_tree_nodev(fc, fuse_fill_super); + return get_tree_nodev(fsc, fuse_fill_super); } static const struct fs_context_operations fuse_context_ops = { - .free = fuse_free_fc, + .free = fuse_free_fsc, .parse_param = fuse_parse_param, .reconfigure = fuse_reconfigure, .get_tree = fuse_get_tree, @@ -1592,7 +1592,7 @@ static const struct fs_context_operations fuse_context_ops = { /* * Set up the filesystem mount context. */ -static int fuse_init_fs_context(struct fs_context *fc) +static int fuse_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; @@ -1605,14 +1605,14 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK - if (fc->fs_type == &fuseblk_fs_type) { + if (fsc->fs_type == &fuseblk_fs_type) { ctx->is_bdev = true; ctx->destroy = true; } #endif - fc->fs_private = ctx; - fc->ops = &fuse_context_ops; + fsc->fs_private = ctx; + fsc->ops = &fuse_context_ops; return 0; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8f52cdaa8445..0ad89c6629d7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -97,14 +97,14 @@ static const struct fs_parameter_spec virtio_fs_parameters[] = { {} }; -static int virtio_fs_parse_param(struct fs_context *fc, +static int virtio_fs_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; int opt; - opt = fs_parse(fc, virtio_fs_parameters, param, &result); + opt = fs_parse(fsc, virtio_fs_parameters, param, &result); if (opt < 0) return opt; @@ -119,9 +119,9 @@ static int virtio_fs_parse_param(struct fs_context *fc, return 0; } -static void virtio_fs_free_fc(struct fs_context *fc) +static void virtio_fs_free_fsc(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; kfree(ctx); } @@ -1488,7 +1488,7 @@ out_err: } static const struct fs_context_operations virtio_fs_context_ops = { - .free = virtio_fs_free_fc, + .free = virtio_fs_free_fsc, .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, }; -- cgit v1.2.3-58-ga151 From badc741459f42f51e244533ce1df1cd9ac5ac6d7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 4 Aug 2021 13:22:58 +0200 Subject: fuse: move option checking into fuse_fill_super() Checking whether the "fd=", "rootmode=", "user_id=" and "group_id=" mount options are present can be moved from fuse_get_tree() into fuse_fill_super() where the value of the options are consumed. This relaxes semantics of reusing a fuse blockdev mount using the device name. Before this patch presence of these options were enforced but values ignored, after this patch these options are completely ignored in this case. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d1b1b17b321c..54379a0c86d3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1514,6 +1514,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct fuse_conn *fc; struct fuse_mount *fm; + if (!ctx->fd_present || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; + err = -EINVAL; file = fget(ctx->fd); if (!file) @@ -1570,14 +1574,9 @@ static int fuse_get_tree(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; - if (!ctx->fd_present || !ctx->rootmode_present || - !ctx->user_id_present || !ctx->group_id_present) - return -EINVAL; - -#ifdef CONFIG_BLOCK - if (ctx->is_bdev) + if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { return get_tree_bdev(fsc, fuse_fill_super); -#endif + } return get_tree_nodev(fsc, fuse_fill_super); } -- cgit v1.2.3-58-ga151 From 62dd1fc8cc6b22e3e568be46ebdb817e66f5d6a5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 5 Aug 2021 05:57:27 +0200 Subject: fuse: move fget() to fuse_get_tree() Affected call chains: fuse_get_tree -> get_tree_(bdev|nodev) -> fuse_fill_super Needed for following patch. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 44 +++++++++++++++++++++----------------------- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 07829ce78695..a78480933ebe 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -489,6 +489,7 @@ struct fuse_dev { struct fuse_fs_context { int fd; + struct file *file; unsigned int rootmode; kuid_t user_id; kgid_t group_id; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 54379a0c86d3..3d64a68c52f7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1509,38 +1509,33 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; - struct file *file; int err; struct fuse_conn *fc; struct fuse_mount *fm; - if (!ctx->fd_present || !ctx->rootmode_present || + if (!ctx->file || !ctx->rootmode_present || !ctx->user_id_present || !ctx->group_id_present) return -EINVAL; - err = -EINVAL; - file = fget(ctx->fd); - if (!file) - goto err; - /* * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. */ - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != sb->s_user_ns)) - goto err_fput; - ctx->fudptr = &file->private_data; + err = -EINVAL; + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + goto err; + ctx->fudptr = &ctx->file->private_data; fc = kmalloc(sizeof(*fc), GFP_KERNEL); err = -ENOMEM; if (!fc) - goto err_fput; + goto err; fm = kzalloc(sizeof(*fm), GFP_KERNEL); if (!fm) { kfree(fc); - goto err_fput; + goto err; } fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); @@ -1551,12 +1546,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) err = fuse_fill_super_common(sb, ctx); if (err) goto err_put_conn; - /* - * atomic_dec_and_test() in fput() provides the necessary - * memory barrier for file->private_data to be visible on all - * CPUs after this - */ - fput(file); + /* file->private_data shall be visible on all CPUs after this */ + smp_mb(); fuse_send_init(get_fuse_mount_super(sb)); return 0; @@ -1564,8 +1555,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) fuse_conn_put(fc); kfree(fm); sb->s_fs_info = NULL; - err_fput: - fput(file); err: return err; } @@ -1573,12 +1562,21 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) static int fuse_get_tree(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + int err; + + if (ctx->fd_present) + ctx->file = fget(ctx->fd); if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { - return get_tree_bdev(fsc, fuse_fill_super); + err = get_tree_bdev(fsc, fuse_fill_super); + goto out_fput; } - return get_tree_nodev(fsc, fuse_fill_super); + err = get_tree_nodev(fsc, fuse_fill_super); +out_fput: + if (ctx->file) + fput(ctx->file); + return err; } static const struct fs_context_operations fuse_context_ops = { -- cgit v1.2.3-58-ga151 From 5d5b74aa9c766f0dd37d5cc1a2a7a94586130501 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 5 Aug 2021 05:57:27 +0200 Subject: fuse: allow sharing existing sb Make it possible to create a new mount from a already working server. Here's a detailed description of the problem from Jakob: "The background for this question is occasional problems we see with our fuse filesystem [1] and mount namespaces. On a usual client, we have system-wide, autofs managed mountpoints. When a new mount namespace is created (which can be done unprivileged in combination with user namespaces), it can happen that a mountpoint is used inside the new namespace but idle in the root mount namespace. So autofs unmounts the parent, system-wide mountpoint. But the fuse module stays active and still serves mountpoint in the child mount namespace. Because the fuse daemon also blocks other system wide resources corresponding to the mountpoint, this situation effectively prevents new mounts until the child mount namespaces closes. [1] https://github.com/cvmfs/cvmfs" Reported-by: Jakob Blomer Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3d64a68c52f7..a3e7fb484938 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1559,9 +1559,26 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; } +/* + * This is the path where user supplied an already initialized fuse dev. In + * this case never create a new super if the old one is gone. + */ +static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc) +{ + return -ENOTCONN; +} + +static int fuse_test_super(struct super_block *sb, struct fs_context *fsc) +{ + + return fsc->sget_key == get_fuse_conn_super(sb); +} + static int fuse_get_tree(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_dev *fud; + struct super_block *sb; int err; if (ctx->fd_present) @@ -1571,8 +1588,27 @@ static int fuse_get_tree(struct fs_context *fsc) err = get_tree_bdev(fsc, fuse_fill_super); goto out_fput; } + /* + * While block dev mount can be initialized with a dummy device fd + * (found by device name), normal fuse mounts can't + */ + if (!ctx->file) + return -EINVAL; - err = get_tree_nodev(fsc, fuse_fill_super); + /* + * Allow creating a fuse mount with an already initialized fuse + * connection + */ + fud = READ_ONCE(ctx->file->private_data); + if (ctx->file->f_op == &fuse_dev_operations && fud) { + fsc->sget_key = fud->fc; + sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); + err = PTR_ERR_OR_ZERO(sb); + if (!IS_ERR(sb)) + fsc->root = dget(sb->s_root); + } else { + err = get_tree_nodev(fsc, fuse_fill_super); + } out_fput: if (ctx->file) fput(ctx->file); -- cgit v1.2.3-58-ga151 From 76224355db7570cbe6b6f75c8929a1558828dd55 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 17 Aug 2021 21:05:16 +0200 Subject: fuse: truncate pagecache on atomic_o_trunc fuse_finish_open() will be called with FUSE_NOWRITE in case of atomic O_TRUNC. This can deadlock with fuse_wait_on_page_writeback() in fuse_launder_page() triggered by invalidate_inode_pages2(). Fix by replacing invalidate_inode_pages2() in fuse_finish_open() with a truncate_pagecache() call. This makes sense regardless of FOPEN_KEEP_CACHE or fc->writeback cache, so do it unconditionally. Reported-by: Xie Yongji Reported-and-tested-by: syzbot+bea44a5189836d956894@syzkaller.appspotmail.com Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 97f860cfc195..5e5efb66c7d7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); - if (!(ff->open_flags & FOPEN_KEEP_CACHE)) - invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); + truncate_pagecache(inode, 0); fuse_invalidate_attr(inode); if (fc->writeback_cache) file_update_time(file); + } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { + invalidate_inode_pages2(inode->i_mapping); } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } -- cgit v1.2.3-58-ga151 From 59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 31 Aug 2021 14:18:08 +0200 Subject: fuse: flush extending writes Callers of fuse_writeback_range() assume that the file is ready for modification by the server in the supplied byte range after the call returns. If there's a write that extends the file beyond the end of the supplied range, then the file needs to be extended to at least the end of the range, but currently that's not done. There are at least two cases where this can cause problems: - copy_file_range() will return short count if the file is not extended up to end of the source range. - FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE will not extend the file, hence the region may not be fully allocated. Fix by flushing writes from the start of the range up to the end of the file. This could be optimized if the writes are non-extending, etc, but it's probably not worth the trouble. Fixes: a2bc92362941 ("fuse: fix copy_file_range() in the writeback case") Fixes: 6b1bdb56b17c ("fuse: allow fallocate(FALLOC_FL_ZERO_RANGE)") Cc: # v5.2 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5e5efb66c7d7..88be26e5866b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2884,7 +2884,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); if (!err) fuse_sync_writes(inode); -- cgit v1.2.3-58-ga151 From 660585b56e63ca034ad506ea53c807c5cdca3196 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 1 Sep 2021 12:39:02 +0200 Subject: fuse: wait for writepages in syncfs In case of fuse the MM subsystem doesn't guarantee that page writeback completes by the time ->sync_fs() is called. This is because fuse completes page writeback immediately to prevent DoS of memory reclaim by the userspace file server. This means that fuse itself must ensure that writes are synced before sending the SYNCFS request to the server. Introduce sync buckets, that hold a counter for the number of outstanding write requests. On syncfs replace the current bucket with a new one and wait until the old bucket's counter goes down to zero. It is possible to have multiple syncfs calls in parallel, in which case there could be more than one waited-on buckets. Descendant buckets must not complete until the parent completes. Add a count to the child (new) bucket until the (parent) old bucket completes. Use RCU protection to dereference the current bucket and to wake up an emptied bucket. Use fc->lock to protect against parallel assignments to the current bucket. This leaves just the counter to be a possible scalability issue. The fc->num_waiting counter has a similar issue, so both should be addressed at the same time. Reported-by: Amir Goldstein Fixes: 2d82ab251ef0 ("virtiofs: propagate sync() to file server") Cc: # v5.14 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 21 ++++++++++++++++++++ fs/fuse/fuse_i.h | 19 ++++++++++++++++++ fs/fuse/inode.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 88be26e5866b..2bca7edfc9f6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -392,6 +392,7 @@ struct fuse_writepage_args { struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; + struct fuse_sync_bucket *bucket; }; static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, @@ -1611,6 +1612,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; int i; + if (wpa->bucket) + fuse_sync_bucket_dec(wpa->bucket); + for (i = 0; i < ap->num_pages; i++) __free_page(ap->pages[i]); @@ -1874,6 +1878,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) } +static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) +{ + if (!fc->sync_fs) + return; + + rcu_read_lock(); + /* Prevent resurrection of dead bucket in unlikely race with syncfs */ + do { + wpa->bucket = rcu_dereference(fc->curr_bucket); + } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); + rcu_read_unlock(); +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1901,6 +1919,7 @@ static int fuse_writepage_locked(struct page *page) if (!wpa->ia.ff) goto err_nofile; + fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); @@ -2151,6 +2170,8 @@ static int fuse_writepages_fill(struct page *page, __free_page(tmp_page); goto out_unlock; } + fuse_writepage_add_to_bucket(fc, wpa); + data->max_pages = 1; ap = &wpa->ia.ap; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a78480933ebe..f166e24dd48b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -516,6 +516,13 @@ struct fuse_fs_context { void **fudptr; }; +struct fuse_sync_bucket { + /* count is a possible scalability bottleneck */ + atomic_t count; + wait_queue_head_t waitq; + struct rcu_head rcu; +}; + /** * A Fuse connection. * @@ -808,6 +815,9 @@ struct fuse_conn { /** List of filesystems using this connection */ struct list_head mounts; + + /* New writepages go into this bucket */ + struct fuse_sync_bucket __rcu *curr_bucket; }; /* @@ -911,6 +921,15 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, descs[i].length = PAGE_SIZE - descs[i].offset; } +static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) +{ + /* Need RCU protection to prevent use after free after the decrement */ + rcu_read_lock(); + if (atomic_dec_and_test(&bucket->count)) + wake_up(&bucket->waitq); + rcu_read_unlock(); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a3e7fb484938..2187211893ff 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) +{ + struct fuse_sync_bucket *bucket; + + bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); + if (bucket) { + init_waitqueue_head(&bucket->waitq); + /* Initial active count */ + atomic_set(&bucket->count, 1); + } + return bucket; +} + +static void fuse_sync_fs_writes(struct fuse_conn *fc) +{ + struct fuse_sync_bucket *bucket, *new_bucket; + int count; + + new_bucket = fuse_sync_bucket_alloc(); + spin_lock(&fc->lock); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + count = atomic_read(&bucket->count); + WARN_ON(count < 1); + /* No outstanding writes? */ + if (count == 1) { + spin_unlock(&fc->lock); + kfree(new_bucket); + return; + } + + /* + * Completion of new bucket depends on completion of this bucket, so add + * one more count. + */ + atomic_inc(&new_bucket->count); + rcu_assign_pointer(fc->curr_bucket, new_bucket); + spin_unlock(&fc->lock); + /* + * Drop initial active count. At this point if all writes in this and + * ancestor buckets complete, the count will go to zero and this task + * will be woken up. + */ + atomic_dec(&bucket->count); + + wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); + + /* Drop temp count on descendant bucket */ + fuse_sync_bucket_dec(new_bucket); + kfree_rcu(bucket, rcu); +} + static int fuse_sync_fs(struct super_block *sb, int wait) { struct fuse_mount *fm = get_fuse_mount_super(sb); @@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait) if (!fc->sync_fs) return 0; + fuse_sync_fs_writes(fc); + memset(&inarg, 0, sizeof(inarg)); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); @@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); @@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); + } fc->release(fc); } } @@ -1418,6 +1477,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; + rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); fuse_sb_defaults(sb); if (ctx->is_bdev) { -- cgit v1.2.3-58-ga151 From a9667ac88e2b20f6426e09945e9dbf555fb86ff0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 1 Sep 2021 12:39:02 +0200 Subject: fuse: remove unused arg in fuse_write_file_get() The struct fuse_conn argument is not used and can be removed. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2bca7edfc9f6..f196680c390a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1820,8 +1820,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_writepage_free(wpa); } -static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { struct fuse_file *ff = NULL; @@ -1836,22 +1835,20 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, return ff; } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = __fuse_write_file_get(fc, fi); + struct fuse_file *ff = __fuse_write_file_get(fi); WARN_ON(!ff); return ff; } int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff; int err; - ff = __fuse_write_file_get(fc, fi); + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, false, false); @@ -1915,7 +1912,7 @@ static int fuse_writepage_locked(struct page *page) goto err_free; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fc, fi); + wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) goto err_nofile; @@ -2135,7 +2132,7 @@ static int fuse_writepages_fill(struct page *page, if (!data->ff) { err = -EIO; - data->ff = fuse_write_file_get(fc, fi); + data->ff = fuse_write_file_get(fi); if (!data->ff) goto out_unlock; } -- cgit v1.2.3-58-ga151