gfs2: Rework freeze / thaw logic

So far, at mount time, gfs2 would take the freeze glock in shared mode and then immediately drop it again, turning it into a cached glock that can be reclaimed at any time. To freeze the filesystem cluster-wide, the node initiating the freeze would take the freeze glock in exclusive mode, which would cause the freeze glock's freeze_go_sync() callback to run on each node. There, gfs2 would freeze the filesystem and schedule gfs2_freeze_func() to run. gfs2_freeze_func() would re-acquire the freeze glock in shared mode, thaw the filesystem, and drop the freeze glock again. The initiating node would keep the freeze glock held in exclusive mode. To thaw the filesystem, the initiating node would drop the freeze glock again, which would allow gfs2_freeze_func() to resume on all nodes, leaving the filesystem in the thawed state. It turns out that in freeze_go_sync(), we cannot reliably and safely freeze the filesystem. This is primarily because the final unmount of a filesystem takes a write lock on the s_umount rw semaphore before calling into gfs2_put_super(), and freeze_go_sync() needs to call freeze_super() which also takes a write lock on the same semaphore, causing a deadlock. We could work around this by trying to take an active reference on the super block first, which would prevent unmount from running at the same time. But that can fail, and freeze_go_sync() isn't actually allowed to fail. To get around this, this patch changes the freeze glock locking scheme as follows: At mount time, each node takes the freeze glock in shared mode. To freeze a filesystem, the initiating node first freezes the filesystem locally and then drops and re-acquires the freeze glock in exclusive mode. All other nodes notice that there is contention on the freeze glock in their go_callback callbacks, and they schedule gfs2_freeze_func() to run. There, they freeze the filesystem locally and drop and re-acquire the freeze glock before re-thawing the filesystem. This is happening outside of the glock state engine, so there, we are allowed to fail. From a cluster point of view, taking and immediately dropping a glock is indistinguishable from taking the glock and only dropping it upon contention, so this new scheme is compatible with the old one. Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in gfs2_freeze_func() in a previous version of this commit. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
author: Andreas Gruenbacher <agruenba@redhat.com> 2022-11-14 23:34:50 +0100
committer: Andreas Gruenbacher <agruenba@redhat.com> 2023-07-03 22:25:02 +0200
commit: b77b4a4815a9651d1d6e07b8e6548eee9531a5eb (patch)
tree: a03c5c47deba377022ca106ea42c99f5765bccde /fs/gfs2/super.c
parent: cad1e15804a83afd9a5c1d95a428d60d1f9c0340 (diff)
1 files changed, 135 insertions, 37 deletions
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9a428e09d38c..81c8d07a4540 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -332,7 +332,12 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
 	struct lfcc *lfcc;
 	LIST_HEAD(list);
 	struct gfs2_log_header_host lh;
-	int error;
+	int error, error2;
+
+	/*
+	 * Grab all the journal glocks in SH mode.  We are *probably* doing
+	 * that to prevent recovery.
+	 */
 
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
@@ -349,11 +354,13 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
 		list_add(&lfcc->list, &list);
 	}
 
+	gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+
 	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
 				   LM_FLAG_NOEXP | GL_NOPID,
 				   &sdp->sd_freeze_gh);
 	if (error)
-		goto out;
+		goto relock_shared;
 
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		error = gfs2_jdesc_check(jd);
@@ -368,8 +375,14 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
 		}
 	}
 
-	if (error)
-		gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+	if (!error)
+		goto out;  /* success */
+
+	gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+
+relock_shared:
+	error2 = gfs2_freeze_lock_shared(sdp, &sdp->sd_freeze_gh, 0);
+	gfs2_assert_withdraw(sdp, !error2);
 
 out:
 	while (!list_empty(&list)) {
@@ -615,6 +628,8 @@ restart:
 
 	/*  Release stuff  */
 
+	gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+
 	iput(sdp->sd_jindex);
 	iput(sdp->sd_statfs_inode);
 	iput(sdp->sd_rindex);
@@ -669,31 +684,82 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
 	return sdp->sd_log_error;
 }
 
-void gfs2_freeze_func(struct work_struct *work)
+static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
 {
-	int error;
-	struct gfs2_holder freeze_gh;
-	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work);
 	struct super_block *sb = sdp->sd_vfs;
+	int error;
 
-	atomic_inc(&sb->s_active);
-	error = gfs2_freeze_lock_shared(sdp, &freeze_gh, 0);
-	if (error) {
-		gfs2_assert_withdraw(sdp, 0);
-	} else {
-		atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
-		error = thaw_super(sb);
-		if (error) {
-			fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n",
-				error);
-			gfs2_assert_withdraw(sdp, 0);
+	atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
+
+	error = freeze_super(sb);
+	if (error)
+		goto fail;
+
+	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
+			       GFS2_LFC_FREEZE_GO_SYNC);
+		if (gfs2_withdrawn(sdp)) {
+			thaw_super(sb);
+			error = -EIO;
+			goto fail;
 		}
-		gfs2_freeze_unlock(&freeze_gh);
 	}
+	return 0;
+
+fail:
+	atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+	return error;
+}
+
+static int gfs2_do_thaw(struct gfs2_sbd *sdp)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	int error;
+
+	error = gfs2_freeze_lock_shared(sdp, &sdp->sd_freeze_gh, 0);
+	if (error)
+		goto fail;
+	error = thaw_super(sb);
+	if (!error)
+		return 0;
+
+fail:
+	fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n", error);
+	gfs2_assert_withdraw(sdp, 0);
+	return error;
+}
+
+void gfs2_freeze_func(struct work_struct *work)
+{
+	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work);
+	struct super_block *sb = sdp->sd_vfs;
+	int error;
+
+	mutex_lock(&sdp->sd_freeze_mutex);
+	error = -EBUSY;
+	if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
+		goto freeze_failed;
+
+	error = gfs2_freeze_locally(sdp);
+	if (error)
+		goto freeze_failed;
+
+	gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+	atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
+
+	error = gfs2_do_thaw(sdp);
+	if (error)
+		goto out;
+
+	atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+	goto out;
+
+freeze_failed:
+	fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error);
+
+out:
+	mutex_unlock(&sdp->sd_freeze_mutex);
 	deactivate_super(sb);
-	clear_bit_unlock(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
-	wake_up_bit(&sdp->sd_flags, SDF_FREEZE_INITIATOR);
-	return;
 }
 
 /**
@@ -707,21 +773,27 @@ static int gfs2_freeze_super(struct super_block *sb)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
 
-	mutex_lock(&sdp->sd_freeze_mutex);
-	if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
-		error = -EBUSY;
+	if (!mutex_trylock(&sdp->sd_freeze_mutex))
+		return -EBUSY;
+	error = -EBUSY;
+	if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
 		goto out;
-	}
 
 	for (;;) {
-		if (gfs2_withdrawn(sdp)) {
-			error = -EINVAL;
+		error = gfs2_freeze_locally(sdp);
+		if (error) {
+			fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
+				error);
 			goto out;
 		}
 
 		error = gfs2_lock_fs_check_clean(sdp);
 		if (!error)
-			break;
+			break;  /* success */
+
+		error = gfs2_do_thaw(sdp);
+		if (error)
+			goto out;
 
 		if (error == -EBUSY)
 			fs_err(sdp, "waiting for recovery before freeze\n");
@@ -735,8 +807,12 @@ static int gfs2_freeze_super(struct super_block *sb)
 		fs_err(sdp, "retrying...\n");
 		msleep(1000);
 	}
-	set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
+
 out:
+	if (!error) {
+		set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
+		atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
+	}
 	mutex_unlock(&sdp->sd_freeze_mutex);
 	return error;
 }
@@ -750,17 +826,39 @@ out:
 static int gfs2_thaw_super(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
 
-	mutex_lock(&sdp->sd_freeze_mutex);
-	if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
-	    !gfs2_holder_initialized(&sdp->sd_freeze_gh)) {
-		mutex_unlock(&sdp->sd_freeze_mutex);
-		return -EINVAL;
+	if (!mutex_trylock(&sdp->sd_freeze_mutex))
+		return -EBUSY;
+	error = -EINVAL;
+	if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
+		goto out;
+
+	gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+
+	error = gfs2_do_thaw(sdp);
+
+	if (!error) {
+		clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
+		atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
 	}
+out:
+	mutex_unlock(&sdp->sd_freeze_mutex);
+	return error;
+}
+
+void gfs2_thaw_freeze_initiator(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	mutex_lock(&sdp->sd_freeze_mutex);
+	if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
+		goto out;
 
 	gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+
+out:
 	mutex_unlock(&sdp->sd_freeze_mutex);
-	return wait_on_bit(&sdp->sd_flags, SDF_FREEZE_INITIATOR, TASK_INTERRUPTIBLE);
 }
 
 /**
author	Andreas Gruenbacher <agruenba@redhat.com>	2022-11-14 23:34:50 +0100
committer	Andreas Gruenbacher <agruenba@redhat.com>	2023-07-03 22:25:02 +0200
commit	b77b4a4815a9651d1d6e07b8e6548eee9531a5eb (patch)
tree	a03c5c47deba377022ca106ea42c99f5765bccde /fs/gfs2/super.c
parent	cad1e15804a83afd9a5c1d95a428d60d1f9c0340 (diff)