author    Miklos Szeredi <mszeredi@redhat.com>    2021-09-01 12:39:02 +0200
committer Miklos Szeredi <mszeredi@redhat.com>    2021-09-06 13:37:02 +0200
commit    660585b56e63ca034ad506ea53c807c5cdca3196 (patch)
tree      a3ebc2af64f6e82c70ef1aaac95343b2ea443023 /fs/fuse/inode.c
parent    59bda8ecee2ffc6a602b7bf2b9e43ca669cdbdcd (diff)
fuse: wait for writepages in syncfs
In case of fuse the MM subsystem doesn't guarantee that page writeback
completes by the time ->sync_fs() is called. This is because fuse
completes page writeback immediately to prevent DoS of memory reclaim
by the userspace file server.

This means that fuse itself must ensure that writes are synced before
sending the SYNCFS request to the server.

Introduce sync buckets, which hold a counter for the number of
outstanding write requests. On syncfs replace the current bucket with a
new one and wait until the old bucket's counter goes down to zero.

It is possible to have multiple syncfs calls in parallel, in which case
there could be more than one waited-on bucket. Descendant buckets must
not complete until the parent completes, so add a count to the child
(new) bucket until the (parent) old bucket completes.

Use RCU protection to dereference the current bucket and to wake up an
emptied bucket. Use fc->lock to protect against parallel assignments to
the current bucket.

This leaves just the counter as a possible scalability issue. The
fc->num_waiting counter has a similar issue, so both should be
addressed at the same time.

Reported-by: Amir Goldstein <amir73il@gmail.com>
Fixes: 2d82ab251ef0 ("virtiofs: propagate sync() to file server")
Cc: <stable@vger.kernel.org> # v5.14
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
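The bucket structure itself is added to fs/fuse/fuse_i.h by the same
commit and therefore does not appear in the inode.c-only diff below; an
approximate sketch of it, for reference:

    /* Sketch of the bucket added to fs/fuse/fuse_i.h (not in this diff) */
    struct fuse_sync_bucket {
            /* outstanding writes accounted to this bucket, plus one
             * initial active count held until syncfs retires the bucket;
             * noted above as a possible scalability bottleneck */
            atomic_t count;
            /* syncfs sleeps here until count drops to zero */
            wait_queue_head_t waitq;
            /* emptied buckets are freed via kfree_rcu() */
            struct rcu_head rcu;
    };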
Diffstat (limited to 'fs/fuse/inode.c')
-rw-r--r--   fs/fuse/inode.c   60
1 file changed, 60 insertions, 0 deletions
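The diff below calls fuse_sync_bucket_dec(), a helper which the same
commit adds to fs/fuse/fuse_i.h; it looks approximately like this:

    /* Approximate sketch of the helper from fs/fuse/fuse_i.h: drop one
     * count on a bucket and wake the syncfs waiter when it empties. */
    static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
    {
            /* The RCU read-side lock guards against use-after-free: once
             * count hits zero the waiter may free the bucket, but only
             * after a grace period (kfree_rcu). */
            rcu_read_lock();
            if (atomic_dec_and_test(&bucket->count))
                    wake_up(&bucket->waitq);
            rcu_read_unlock();
    }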
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a3e7fb484938..2187211893ff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
return err;
}

+static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+{
+ struct fuse_sync_bucket *bucket;
+
+ bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
+ if (bucket) {
+ init_waitqueue_head(&bucket->waitq);
+ /* Initial active count */
+ atomic_set(&bucket->count, 1);
+ }
+ return bucket;
+}
+
+static void fuse_sync_fs_writes(struct fuse_conn *fc)
+{
+ struct fuse_sync_bucket *bucket, *new_bucket;
+ int count;
+
+ new_bucket = fuse_sync_bucket_alloc();
+ spin_lock(&fc->lock);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ count = atomic_read(&bucket->count);
+ WARN_ON(count < 1);
+ /* No outstanding writes? */
+ if (count == 1) {
+ spin_unlock(&fc->lock);
+ kfree(new_bucket);
+ return;
+ }
+
+ /*
+ * Completion of new bucket depends on completion of this bucket, so add
+ * one more count.
+ */
+ atomic_inc(&new_bucket->count);
+ rcu_assign_pointer(fc->curr_bucket, new_bucket);
+ spin_unlock(&fc->lock);
+ /*
+ * Drop initial active count. At this point if all writes in this and
+ * ancestor buckets complete, the count will go to zero and this task
+ * will be woken up.
+ */
+ atomic_dec(&bucket->count);
+
+ wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+
+ /* Drop temp count on descendant bucket */
+ fuse_sync_bucket_dec(new_bucket);
+ kfree_rcu(bucket, rcu);
+}
+
static int fuse_sync_fs(struct super_block *sb, int wait)
{
struct fuse_mount *fm = get_fuse_mount_super(sb);
@@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait)
if (!fc->sync_fs)
return 0;

+ fuse_sync_fs_writes(fc);
+
memset(&inarg, 0, sizeof(inarg));
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
@@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_sync_bucket *bucket;

if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_conn_free(fc);
@@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ if (bucket) {
+ WARN_ON(atomic_read(&bucket->count) != 1);
+ kfree(bucket);
+ }
fc->release(fc);
}
}
@@ -1418,6 +1477,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_flags & SB_MANDLOCK)
goto err;

+ rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
fuse_sb_defaults(sb);

if (ctx->is_bdev) {
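The producer side of the scheme, which accounts each page-writeback
request to the current bucket, lives in fs/fuse/file.c and is likewise
outside this file's diff. A rough sketch of what the commit adds there
(the bucket pointer in struct fuse_writepage_args is also introduced by
this commit):

    /* Rough sketch of the fs/fuse/file.c side: a writepage request
     * takes a count on the current bucket before being sent. */
    static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
                                             struct fuse_writepage_args *wpa)
    {
            if (!fc->sync_fs)
                    return;

            rcu_read_lock();
            /* Retry if the count already dropped to zero: that bucket
             * is dead and must not be resurrected, since the syncfs
             * waiter may be about to free it. */
            do {
                    wpa->bucket = rcu_dereference(fc->curr_bucket);
            } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
            rcu_read_unlock();
    }

When writeback of the request completes, a matching
fuse_sync_bucket_dec(wpa->bucket) drops the count, so a bucket reaches
zero only after every write accounted to it has been flushed to the
server.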