bcachefs: gc now operates on second set of bucket marks

This means we can now use gc to verify the allocation information - important for testing persistent alloc info

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
author: Kent Overstreet <kent.overstreet@gmail.com> 2018-07-23 05:32:01 -0400
committer: Kent Overstreet <kent.overstreet@linux.dev> 2023-10-22 17:08:12 -0400
commit: 9ca53b55f7415783c6cc8b751c99f2af6cc0a932 (patch)
tree: cef41ef35075c7bfaa765faf6ab9c5d58f0a56b4 /fs/bcachefs
parent: e647369168e02a06ff5ee229cc14ad72b2f5ddfd (diff)
11 files changed, 495 insertions, 383 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c17fba1eae96..3f0e2dd29fde 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -930,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
 		pr_debug("free_inc now empty");
 
 		do {
-			if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
-				up_read(&c->gc_lock);
-				bch_err(ca, "gc failure");
-				goto stop;
-			}
-
 			/*
 			 * Find some buckets that we can invalidate, either
 			 * they're completely unused, or only contain clean data
@@ -1293,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 	bool invalidating_data = false;
 	int ret = 0;
 
-	if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-		return -1;
-
 	if (test_alloc_startup(c)) {
 		invalidating_data = true;
 		goto not_enough;
@@ -1321,9 +1312,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 				continue;
 
 			bch2_mark_alloc_bucket(c, ca, bu, true,
-					gc_pos_alloc(c, NULL),
-					BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-					BCH_BUCKET_MARK_GC_LOCK_HELD);
+					gc_pos_alloc(c, NULL), 0);
 
 			fifo_push(&ca->free_inc, bu);
 
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index cdea3a1d9176..eaa2055000b6 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -347,7 +347,6 @@ enum gc_phase {
 
 	GC_PHASE_PENDING_DELETE,
 	GC_PHASE_ALLOC,
-	GC_PHASE_DONE
 };
 
 struct gc_pos {
@@ -392,15 +391,14 @@ struct bch_dev {
 	 * gc_lock, for device resize - holding any is sufficient for access:
 	 * Or rcu_read_lock(), but only for ptr_stale():
 	 */
-	struct bucket_array __rcu *buckets;
+	struct bucket_array __rcu *buckets[2];
 	unsigned long		*buckets_dirty;
 	unsigned long		*buckets_written;
 	/* most out of date gen in the btree */
 	u8			*oldest_gens;
 	struct rw_semaphore	bucket_lock;
 
-	struct bch_dev_usage __percpu *usage_percpu;
-	struct bch_dev_usage	usage_cached;
+	struct bch_dev_usage __percpu *usage[2];
 
 	/* Allocator: */
 	struct task_struct __rcu *alloc_thread;
@@ -478,7 +476,6 @@ enum {
 
 	/* errors: */
 	BCH_FS_ERROR,
-	BCH_FS_GC_FAILURE,
 
 	/* misc: */
 	BCH_FS_BDEV_MOUNTED,
@@ -614,8 +611,8 @@ struct bch_fs {
 
 	atomic64_t		sectors_available;
 
-	struct bch_fs_usage __percpu *usage_percpu;
-	struct bch_fs_usage	usage_cached;
+	struct bch_fs_usage __percpu *usage[2];
+
 	struct percpu_rw_semaphore usage_lock;
 
 	struct closure_waitlist	freelist_wait;
@@ -656,9 +653,6 @@ struct bch_fs {
 	 *
 	 * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
 	 *
-	 * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
-	 * currently running, and gc marks are currently valid
-	 *
 	 * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
 	 * can read without a lock.
 	 */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index e900fd4ffd06..6eba65fcb52c 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -260,8 +260,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
 {
 	struct gc_pos pos = { 0 };
 	unsigned flags =
-		BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-		BCH_BUCKET_MARK_GC_LOCK_HELD|
+		BCH_BUCKET_MARK_GC|
 		(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
 	int ret = 0;
 
@@ -484,9 +483,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
 				      BCH_DATA_SB, flags);
 	}
 
-	if (c)
-		spin_lock(&c->journal.lock);
-
 	for (i = 0; i < ca->journal.nr; i++) {
 		b = ca->journal.buckets[i];
 		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
@@ -495,7 +491,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
 	}
 
 	if (c) {
-		spin_unlock(&c->journal.lock);
 		percpu_up_read(&c->usage_lock);
 	} else {
 		preempt_enable();
@@ -511,9 +506,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
 	gc_pos_set(c, gc_phase(GC_PHASE_SB));
 
 	for_each_online_member(ca, c, i)
-		bch2_mark_dev_superblock(c, ca,
-					 BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-					 BCH_BUCKET_MARK_GC_LOCK_HELD);
+		bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
 	mutex_unlock(&c->sb_lock);
 }
 
@@ -521,7 +514,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 {
 	struct gc_pos pos = { 0 };
-	struct bch_fs_usage stats = { 0 };
 	struct btree_update *as;
 	struct pending_btree_node_free *d;
 
@@ -533,13 +525,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 			bch2_mark_key(c, BKEY_TYPE_BTREE,
 				      bkey_i_to_s_c(&d->key),
 				      true, 0,
-				      pos, &stats, 0,
-				      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-				      BCH_BUCKET_MARK_GC_LOCK_HELD);
-	/*
-	 * Don't apply stats - pending deletes aren't tracked in
-	 * bch_alloc_stats:
-	 */
+				      pos, NULL, 0,
+				      BCH_BUCKET_MARK_GC);
 
 	mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -560,8 +547,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 		fifo_for_each_entry(i, &ca->free_inc, iter)
 			bch2_mark_alloc_bucket(c, ca, i, true,
 					       gc_pos_alloc(c, NULL),
-					       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-					       BCH_BUCKET_MARK_GC_LOCK_HELD);
+					       BCH_BUCKET_MARK_GC);
 
 
 
@@ -569,8 +555,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 			fifo_for_each_entry(i, &ca->free[j], iter)
 				bch2_mark_alloc_bucket(c, ca, i, true,
 						       gc_pos_alloc(c, NULL),
-						       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-						       BCH_BUCKET_MARK_GC_LOCK_HELD);
+						       BCH_BUCKET_MARK_GC);
 	}
 
 	spin_unlock(&c->freelist_lock);
@@ -584,8 +569,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 			ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 			bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
 					       gc_pos_alloc(c, ob),
-					       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-					       BCH_BUCKET_MARK_GC_LOCK_HELD);
+					       BCH_BUCKET_MARK_GC);
 		}
 		spin_unlock(&ob->lock);
 	}
@@ -593,122 +577,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 	percpu_up_read(&c->usage_lock);
 }
 
-static void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_free(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	for_each_member_device(ca, c, i) {
+		kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+			sizeof(struct bucket_array) +
+			ca->mi.nbuckets * sizeof(struct bucket));
+		ca->buckets[1] = NULL;
+
+		free_percpu(ca->usage[1]);
+		ca->usage[1] = NULL;
+	}
+
+	free_percpu(c->usage[1]);
+	c->usage[1] = NULL;
+}
+
+static void bch2_gc_done_nocheck(struct bch_fs *c)
 {
 	struct bch_dev *ca;
-	struct bucket_array *buckets;
-	struct bucket_mark new;
 	unsigned i;
-	size_t b;
 	int cpu;
 
-	percpu_down_write(&c->usage_lock);
+	for_each_member_device(ca, c, i) {
+		struct bucket_array *src = __bucket_array(ca, 1);
 
-	/*
-	 * Indicates to buckets code that gc is now in progress - done under
-	 * usage_lock to avoid racing with bch2_mark_key():
-	 */
-	__gc_pos_set(c, gc_phase(GC_PHASE_START));
+		memcpy(__bucket_array(ca, 0), src,
+		       sizeof(struct bucket_array) +
+		       sizeof(struct bucket) * src->nbuckets);
+	};
 
-	/* Save a copy of the existing bucket stats while we recompute them: */
 	for_each_member_device(ca, c, i) {
-		ca->usage_cached = __bch2_dev_usage_read(ca);
+		struct bch_dev_usage *p;
+
 		for_each_possible_cpu(cpu) {
-			struct bch_dev_usage *p =
-				per_cpu_ptr(ca->usage_percpu, cpu);
+			p = per_cpu_ptr(ca->usage[0], cpu);
 			memset(p, 0, sizeof(*p));
 		}
+
+		preempt_disable();
+		*this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
+		preempt_enable();
 	}
 
-	c->usage_cached = __bch2_fs_usage_read(c);
-	for_each_possible_cpu(cpu) {
-		struct bch_fs_usage *p =
-			per_cpu_ptr(c->usage_percpu, cpu);
+	{
+		struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+		struct bch_fs_usage *p;
 
-		memset(p->replicas, 0, sizeof(p->replicas));
-		memset(p->buckets, 0, sizeof(p->buckets));
+		for_each_possible_cpu(cpu) {
+			p = per_cpu_ptr(c->usage[0], cpu);
+			memset(p, 0, offsetof(typeof(*p), online_reserved));
+		}
+
+		preempt_disable();
+		memcpy(this_cpu_ptr(c->usage[0]),
+		       &src,
+		       offsetof(typeof(*p), online_reserved));
+		preempt_enable();
 	}
 
+}
+
+static void bch2_gc_done(struct bch_fs *c, bool initial)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	int cpu;
+
+#define copy_field(_f, _msg, ...)					\
+	if (dst._f != src._f) {						\
+		pr_info(_msg ": got %llu, should be %llu, fixing"	\
+			, ##__VA_ARGS__, dst._f, src._f);		\
+		dst._f = src._f;					\
+	}
+#define copy_bucket_field(_f)						\
+	if (dst->b[b].mark._f != src->b[b].mark._f) {			\
+		pr_info("dev %u bucket %zu has wrong " #_f		\
+			": got %u, should be %u, fixing",		\
+			i, b, dst->b[b].mark._f, src->b[b].mark._f);	\
+		dst->b[b]._mark._f = src->b[b].mark._f;			\
+	}
+#define copy_dev_field(_f, _msg, ...)					\
+	copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
+#define copy_fs_field(_f, _msg, ...)					\
+	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+	percpu_down_write(&c->usage_lock);
+
+	if (initial) {
+		bch2_gc_done_nocheck(c);
+		goto out;
+	}
+
+	for_each_member_device(ca, c, i) {
+		struct bucket_array *dst = __bucket_array(ca, 0);
+		struct bucket_array *src = __bucket_array(ca, 1);
+		size_t b;
+
+		if (initial) {
+			memcpy(dst, src,
+			       sizeof(struct bucket_array) +
+			       sizeof(struct bucket) * dst->nbuckets);
+		}
+
+		for (b = 0; b < src->nbuckets; b++) {
+			copy_bucket_field(gen);
+			copy_bucket_field(data_type);
+			copy_bucket_field(owned_by_allocator);
+			copy_bucket_field(stripe);
+			copy_bucket_field(dirty_sectors);
+			copy_bucket_field(cached_sectors);
+		}
+	};
+
+	for_each_member_device(ca, c, i) {
+		struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
+		struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
+		struct bch_dev_usage *p;
+		unsigned b;
+
+		for (b = 0; b < BCH_DATA_NR; b++)
+			copy_dev_field(buckets[b],
+				       "buckets[%s]", bch2_data_types[b]);
+		copy_dev_field(buckets_alloc, "buckets_alloc");
+		copy_dev_field(buckets_ec, "buckets_ec");
+
+		for (b = 0; b < BCH_DATA_NR; b++)
+			copy_dev_field(sectors[b],
+				       "sectors[%s]", bch2_data_types[b]);
+		copy_dev_field(sectors_fragmented,
+			       "sectors_fragmented");
+
+		for_each_possible_cpu(cpu) {
+			p = per_cpu_ptr(ca->usage[0], cpu);
+			memset(p, 0, sizeof(*p));
+		}
+
+		preempt_disable();
+		p = this_cpu_ptr(ca->usage[0]);
+		*p = dst;
+		preempt_enable();
+	}
+
+	{
+		struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
+		struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+		struct bch_fs_usage *p;
+		unsigned r, b;
+
+		for (r = 0; r < BCH_REPLICAS_MAX; r++) {
+			for (b = 0; b < BCH_DATA_NR; b++)
+				copy_fs_field(replicas[r].data[b],
+					      "replicas[%i].data[%s]",
+					      r, bch2_data_types[b]);
+			copy_fs_field(replicas[r].ec_data,
+				      "replicas[%i].ec_data", r);
+			copy_fs_field(replicas[r].persistent_reserved,
+				      "replicas[%i].persistent_reserved", r);
+		}
+
+		for (b = 0; b < BCH_DATA_NR; b++)
+			copy_fs_field(buckets[b],
+				      "buckets[%s]", bch2_data_types[b]);
+
+		for_each_possible_cpu(cpu) {
+			p = per_cpu_ptr(c->usage[0], cpu);
+			memset(p, 0, offsetof(typeof(*p), online_reserved));
+		}
+
+		preempt_disable();
+		p = this_cpu_ptr(c->usage[0]);
+		memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
+		preempt_enable();
+	}
+out:
 	percpu_up_write(&c->usage_lock);
 
-	/* Clear bucket marks: */
+#undef copy_field
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_bucket_field
+}
+
+static int bch2_gc_start(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	BUG_ON(c->usage[1]);
+
+	c->usage[1] = alloc_percpu(struct bch_fs_usage);
+	if (!c->usage[1])
+		return -ENOMEM;
+
 	for_each_member_device(ca, c, i) {
-		down_read(&ca->bucket_lock);
-		buckets = bucket_array(ca);
-
-		for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
-			bucket_cmpxchg(buckets->b + b, new, ({
-				new.owned_by_allocator	= 0;
-				new.data_type		= 0;
-				new.cached_sectors	= 0;
-				new.dirty_sectors	= 0;
-				new.stripe		= 0;
-			}));
-			ca->oldest_gens[b] = new.gen;
+		BUG_ON(ca->buckets[1]);
+		BUG_ON(ca->usage[1]);
+
+		ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
+				ca->mi.nbuckets * sizeof(struct bucket),
+				GFP_KERNEL|__GFP_ZERO);
+		if (!ca->buckets[1]) {
+			percpu_ref_put(&ca->ref);
+			return -ENOMEM;
+		}
+
+		ca->usage[1] = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage[1]) {
+			percpu_ref_put(&ca->ref);
+			return -ENOMEM;
 		}
-		up_read(&ca->bucket_lock);
 	}
+
+	percpu_down_write(&c->usage_lock);
+
+	for_each_member_device(ca, c, i) {
+		struct bucket_array *dst = __bucket_array(ca, 1);
+		struct bucket_array *src = __bucket_array(ca, 0);
+		size_t b;
+
+		dst->first_bucket	= src->first_bucket;
+		dst->nbuckets		= src->nbuckets;
+
+		for (b = 0; b < src->nbuckets; b++)
+			dst->b[b]._mark.gen = src->b[b].mark.gen;
+	};
+
+	percpu_up_write(&c->usage_lock);
+
+	return 0;
 }
 
 /**
- * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * Order matters here:
+ *  - Concurrent GC relies on the fact that we have a total ordering for
+ *    everything that GC walks - see  gc_will_visit_node(),
+ *    gc_will_visit_root()
+ *
+ *  - also, references move around in the course of index updates and
+ *    various other crap: everything needs to agree on the ordering
+ *    references are allowed to move around in - e.g., we're allowed to
+ *    start with a reference owned by an open_bucket (the allocator) and
+ *    move it to the btree, but not the reverse.
+ *
+ *    This is necessary to ensure that gc doesn't miss references that
+ *    move around - if references move backwards in the ordering GC
+ *    uses, GC could skip past them
  */
-void bch2_gc(struct bch_fs *c)
+int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
 {
 	struct bch_dev *ca;
 	u64 start_time = local_clock();
-	unsigned i;
+	unsigned i, iter = 0;
 	int ret;
 
-	/*
-	 * Walk _all_ references to buckets, and recompute them:
-	 *
-	 * Order matters here:
-	 *  - Concurrent GC relies on the fact that we have a total ordering for
-	 *    everything that GC walks - see  gc_will_visit_node(),
-	 *    gc_will_visit_root()
-	 *
-	 *  - also, references move around in the course of index updates and
-	 *    various other crap: everything needs to agree on the ordering
-	 *    references are allowed to move around in - e.g., we're allowed to
-	 *    start with a reference owned by an open_bucket (the allocator) and
-	 *    move it to the btree, but not the reverse.
-	 *
-	 *    This is necessary to ensure that gc doesn't miss references that
-	 *    move around - if references move backwards in the ordering GC
-	 *    uses, GC could skip past them
-	 */
 	trace_gc_start(c);
 
-	/*
-	 * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
-	 * gc_lock if sectors_available goes to 0:
-	 */
-	bch2_recalc_sectors_available(c);
-
 	down_write(&c->gc_lock);
-	if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+again:
+	ret = bch2_gc_start(c);
+	if (ret)
 		goto out;
 
-	bch2_gc_start(c);
-
 	bch2_mark_superblocks(c);
 
-	ret = bch2_gc_btrees(c, NULL, false);
-	if (ret) {
-		bch_err(c, "btree gc failed: %d", ret);
-		set_bit(BCH_FS_GC_FAILURE, &c->flags);
+	ret = bch2_gc_btrees(c, journal, initial);
+	if (ret)
 		goto out;
-	}
 
 	bch2_mark_pending_btree_node_frees(c);
 	bch2_mark_allocator_buckets(c);
 
-	/* Indicates that gc is no longer in progress: */
-	gc_pos_set(c, gc_phase(GC_PHASE_DONE));
 	c->gc_count++;
 out:
+	if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
+		/*
+		 * XXX: make sure gens we fixed got saved
+		 */
+		if (iter++ <= 2) {
+			bch_info(c, "Fixed gens, restarting mark and sweep:");
+			clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+			goto again;
+		}
+
+		bch_info(c, "Unable to fix bucket gens, looping");
+		ret = -EINVAL;
+	}
+
+	if (!ret)
+		bch2_gc_done(c, initial);
+
+	/* Indicates that gc is no longer in progress: */
+	__gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+	bch2_gc_free(c);
 	up_write(&c->gc_lock);
+
+	if (!ret && initial)
+		set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
 	trace_gc_end(c);
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 
@@ -724,6 +896,7 @@ out:
 	 * allocator thread - issue wakeup in case they blocked on gc_lock:
 	 */
 	closure_wake_up(&c->freelist_wait);
+	return ret;
 }
 
 /* Btree coalescing */
@@ -1039,9 +1212,6 @@ void bch2_coalesce(struct bch_fs *c)
 {
 	enum btree_id id;
 
-	if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-		return;
-
 	down_read(&c->gc_lock);
 	trace_gc_coalesce_start(c);
 
@@ -1053,7 +1223,6 @@ void bch2_coalesce(struct bch_fs *c)
 		if (ret) {
 			if (ret != -ESHUTDOWN)
 				bch_err(c, "btree coalescing failed: %d", ret);
-			set_bit(BCH_FS_GC_FAILURE, &c->flags);
 			return;
 		}
 	}
@@ -1068,6 +1237,7 @@ static int bch2_gc_thread(void *arg)
 	struct io_clock *clock = &c->io_clock[WRITE];
 	unsigned long last = atomic_long_read(&clock->now);
 	unsigned last_kick = atomic_read(&c->kick_gc);
+	int ret;
 
 	set_freezable();
 
@@ -1101,7 +1271,9 @@ static int bch2_gc_thread(void *arg)
 		last = atomic_long_read(&clock->now);
 		last_kick = atomic_read(&c->kick_gc);
 
-		bch2_gc(c);
+		ret = bch2_gc(c, NULL, false);
+		if (ret)
+			bch_err(c, "btree gc failed: %i", ret);
 
 		debug_check_no_locks_held();
 	}
@@ -1142,30 +1314,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
 {
-	unsigned iter = 0;
-	int ret = 0;
-
-	down_write(&c->gc_lock);
-again:
-	bch2_gc_start(c);
-
-	bch2_mark_superblocks(c);
-
-	ret = bch2_gc_btrees(c, journal, true);
-	if (ret)
-		goto err;
-
-	if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
-		if (iter++ > 2) {
-			bch_info(c, "Unable to fix bucket gens, looping");
-			ret = -EINVAL;
-			goto err;
-		}
-
-		bch_info(c, "Fixed gens, restarting initial mark and sweep:");
-		clear_bit(BCH_FS_FIXED_GENS, &c->flags);
-		goto again;
-	}
+	int ret = bch2_gc(c, journal, true);
 
 	/*
 	 * Skip past versions that might have possibly been used (as nonces),
@@ -1174,9 +1323,5 @@ again:
 	if (c->sb.encryption_type)
 		atomic64_add(1 << 16, &c->key_version);
 
-	gc_pos_set(c, gc_phase(GC_PHASE_DONE));
-	set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-err:
-	up_write(&c->gc_lock);
 	return ret;
 }
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 47a590015325..bb77564b9463 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -7,7 +7,7 @@
 enum bkey_type;
 
 void bch2_coalesce(struct bch_fs *);
-void bch2_gc(struct bch_fs *);
+int bch2_gc(struct bch_fs *, struct list_head *, bool);
 void bch2_gc_thread_stop(struct bch_fs *);
 int bch2_gc_thread_start(struct bch_fs *);
 int bch2_initial_gc(struct bch_fs *, struct list_head *);
@@ -105,14 +105,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
 	};
 }
 
-static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
 {
 	unsigned seq;
 	bool ret;
 
 	do {
 		seq = read_seqcount_begin(&c->gc_pos_lock);
-		ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+		ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
 	} while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
 	return ret;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index af31819c88c7..2631b0732d4b 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -160,7 +160,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
 {
 	struct bch_fs *c = as->c;
 	struct pending_btree_node_free *d;
-	unsigned replicas;
 
 	/*
 	 * btree_update lock is only needed here to avoid racing with
@@ -179,15 +178,6 @@ found:
 	d->index_update_done = true;
 
 	/*
-	 * Btree nodes are accounted as freed in bch_alloc_stats when they're
-	 * freed from the index:
-	 */
-	replicas = bch2_extent_nr_dirty_ptrs(k);
-	if (replicas)
-		stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
-			c->opts.btree_node_size * replicas;
-
-	/*
 	 * We're dropping @k from the btree, but it's still live until the
 	 * index update is persistent so we need to keep a reference around for
 	 * mark and sweep to find - that's primarily what the
@@ -208,15 +198,16 @@ found:
 	 * bch2_mark_key() compares the current gc pos to the pos we're
 	 * moving this reference from, hence one comparison here:
 	 */
-	if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
-		struct bch_fs_usage tmp = { 0 };
+	if (gc_pos_cmp(c->gc_pos, b
+		       ? gc_pos_btree_node(b)
+		       : gc_pos_btree_root(as->btree_id)) >= 0 &&
+	    gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
+		struct gc_pos pos = { 0 };
 
 		bch2_mark_key(c, BKEY_TYPE_BTREE,
 			      bkey_i_to_s_c(&d->key),
-			      false, 0, b
-			      ? gc_pos_btree_node(b)
-			      : gc_pos_btree_root(as->btree_id),
-			      &tmp, 0, 0);
+			      false, 0, pos,
+			      NULL, 0, BCH_BUCKET_MARK_GC);
 		/*
 		 * Don't apply tmp - pending deletes aren't tracked in
 		 * bch_alloc_stats:
@@ -287,19 +278,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
 					struct pending_btree_node_free *pending)
 {
-	struct bch_fs_usage stats = { 0 };
-
 	BUG_ON(!pending->index_update_done);
 
 	bch2_mark_key(c, BKEY_TYPE_BTREE,
 		      bkey_i_to_s_c(&pending->key),
 		      false, 0,
 		      gc_phase(GC_PHASE_PENDING_DELETE),
-		      &stats, 0, 0);
-	/*
-	 * Don't apply stats - pending deletes aren't tracked in
-	 * bch_alloc_stats:
-	 */
+		      NULL, 0, 0);
 }
 
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@@ -1939,6 +1924,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 
 	btree_interior_update_add_node_reference(as, b);
 
+	/*
+	 * XXX: the rest of the update path treats this like we're actually
+	 * inserting a new node and deleting the existing node, so the
+	 * reservation needs to include enough space for @b
+	 *
+	 * that is actually sketch as fuck though and I am surprised the code
+	 * seems to work like that, definitely need to go back and rework it
+	 * into something saner.
+	 *
+	 * (I think @b is just getting double counted until the btree update
+	 * finishes and "deletes" @b on disk)
+	 */
+	ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
+			c->opts.btree_node_size *
+			bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)),
+			BCH_DISK_RESERVATION_NOFAIL|
+			BCH_DISK_RESERVATION_GC_LOCK_HELD);
+	BUG_ON(ret);
+
 	parent = btree_node_parent(iter, b);
 	if (parent) {
 		if (new_hash) {
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 201798866242..2ebe8bad978e 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -85,8 +85,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 
 static void bch2_fs_stats_verify(struct bch_fs *c)
 {
-	struct bch_fs_usage stats =
-		__bch2_fs_usage_read(c);
+	struct bch_fs_usage stats =_bch2_fs_usage_read(c);
 	unsigned i, j;
 
 	for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
@@ -209,43 +208,24 @@ do {									\
 	_acc;								\
 })
 
-#define bch2_usage_read_cached(_c, _cached, _uncached)			\
-({									\
-	typeof(_cached) _ret;						\
-	unsigned _seq;							\
-									\
-	do {								\
-		_seq = read_seqcount_begin(&(_c)->gc_pos_lock);		\
-		_ret = (_c)->gc_pos.phase == GC_PHASE_DONE		\
-			? bch2_usage_read_raw(_uncached)			\
-			: (_cached);					\
-	} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq));	\
-									\
-	_ret;								\
-})
-
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
 {
-	return bch2_usage_read_raw(ca->usage_percpu);
+	return bch2_usage_read_raw(ca->usage[gc]);
 }
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
 {
-	return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
+	return bch2_usage_read_raw(ca->usage[0]);
 }
 
-struct bch_fs_usage
-__bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
 {
-	return bch2_usage_read_raw(c->usage_percpu);
+	return bch2_usage_read_raw(c->usage[gc]);
 }
 
-struct bch_fs_usage
-bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
 {
-	return bch2_usage_read_cached(c,
-				     c->usage_cached,
-				     c->usage_percpu);
+	return bch2_usage_read_raw(c->usage[0]);
 }
 
 struct fs_usage_sum {
@@ -327,13 +307,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
 		: m.data_type;
 }
 
-static bool bucket_became_unavailable(struct bch_fs *c,
-				      struct bucket_mark old,
+static bool bucket_became_unavailable(struct bucket_mark old,
 				      struct bucket_mark new)
 {
 	return is_available_bucket(old) &&
-	       !is_available_bucket(new) &&
-	       (!c || c->gc_pos.phase == GC_PHASE_DONE);
+	       !is_available_bucket(new);
 }
 
 void bch2_fs_usage_apply(struct bch_fs *c,
@@ -364,11 +342,13 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 	percpu_down_read(&c->usage_lock);
 	preempt_disable();
 	/* online_reserved not subject to gc: */
-	this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
+	this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved);
 	stats->online_reserved = 0;
 
-	if (!gc_will_visit(c, gc_pos))
-		bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
+	bch2_usage_add(this_cpu_ptr(c->usage[0]), stats);
+
+	if (gc_visited(c, gc_pos))
+		bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
 
 	bch2_fs_stats_verify(c);
 	preempt_enable();
@@ -378,8 +358,9 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
-				  struct bch_fs_usage *stats,
-				  struct bucket_mark old, struct bucket_mark new)
+				  struct bch_fs_usage *fs_usage,
+				  struct bucket_mark old, struct bucket_mark new,
+				  bool gc)
 {
 	struct bch_dev_usage *dev_usage;
 
@@ -391,14 +372,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 		bch2_data_types[old.data_type],
 		bch2_data_types[new.data_type]);
 
-	stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
-	stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
-
 	preempt_disable();
-	dev_usage = this_cpu_ptr(ca->usage_percpu);
+	dev_usage = this_cpu_ptr(ca->usage[gc]);
 
-	dev_usage->buckets[bucket_type(old)]--;
-	dev_usage->buckets[bucket_type(new)]++;
+	if (bucket_type(old) != bucket_type(new)) {
+		if (bucket_type(old)) {
+			fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
+			dev_usage->buckets[bucket_type(old)]--;
+		} else {
+			fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
+			dev_usage->buckets[bucket_type(new)]++;
+		}
+	}
 
 	dev_usage->buckets_alloc +=
 		(int) new.owned_by_allocator - (int) old.owned_by_allocator;
@@ -425,21 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 ({								\
 	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);	\
 								\
-	bch2_dev_usage_update(c, ca, stats, _old, new);		\
+	bch2_dev_usage_update(c, ca, stats, _old, new, gc);	\
 	_old;							\
 })
 
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
-			    size_t b, struct bucket_mark *old)
+static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+				     size_t b, struct bucket_mark *old,
+				     bool gc)
 {
-	struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
-	struct bucket *g;
+	struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+	struct bucket *g = __bucket(ca, b, gc);
 	struct bucket_mark new;
 
-	percpu_rwsem_assert_held(&c->usage_lock);
-
-	g = bucket(ca, b);
-
 	*old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
 		BUG_ON(!is_available_bucket(new));
 
@@ -450,38 +432,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.gen++;
 	}));
 
-	/*
-	 * This isn't actually correct yet, since fs usage is still
-	 * uncompressed sectors:
-	 */
 	stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
+}
+
+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+			    size_t b, struct bucket_mark *old)
+{
+	percpu_rwsem_assert_held(&c->usage_lock);
+
+	__bch2_invalidate_bucket(c, ca, b, old, false);
 
 	if (!old->owned_by_allocator && old->cached_sectors)
 		trace_invalidate(ca, bucket_to_sector(ca, b),
 				 old->cached_sectors);
 }
 
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-			    size_t b, bool owned_by_allocator,
-			    struct gc_pos pos, unsigned flags)
+static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+				     size_t b, bool owned_by_allocator,
+				     bool gc)
 {
-	struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
-	struct bucket *g;
+	struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+	struct bucket *g = __bucket(ca, b, gc);
 	struct bucket_mark old, new;
 
-	percpu_rwsem_assert_held(&c->usage_lock);
-	g = bucket(ca, b);
-
-	if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-	    gc_will_visit(c, pos))
-		return;
-
 	old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
 		new.owned_by_allocator	= owned_by_allocator;
 	}));
 
-	BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
-	       c->gc_pos.phase == GC_PHASE_DONE);
+	BUG_ON(!gc &&
+	       !owned_by_allocator && !old.owned_by_allocator);
+}
+
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+			    size_t b, bool owned_by_allocator,
+			    struct gc_pos pos, unsigned flags)
+{
+	percpu_rwsem_assert_held(&c->usage_lock);
+
+	if (!(flags & BCH_BUCKET_MARK_GC))
+		__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
+
+	if ((flags & BCH_BUCKET_MARK_GC) ||
+	    gc_visited(c, pos))
+		__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
 }
 
 #define checked_add(a, b)					\
@@ -491,37 +484,49 @@ do {								\
 	BUG_ON((a) != _res);					\
 } while (0)
 
+static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+					size_t b, enum bch_data_type type,
+					unsigned sectors, bool gc)
+{
+	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
+	struct bucket *g = __bucket(ca, b, gc);
+	struct bucket_mark old, new;
+
+	BUG_ON(type != BCH_DATA_SB &&
+	       type != BCH_DATA_JOURNAL);
+
+	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+		new.data_type	= type;
+		checked_add(new.dirty_sectors, sectors);
+	}));
+
+	fs_usage->replicas[0].data[type] += sectors;
+}
+
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 			       size_t b, enum bch_data_type type,
 			       unsigned sectors, struct gc_pos pos,
 			       unsigned flags)
 {
-	struct bch_fs_usage *stats;
-	struct bucket *g;
-	struct bucket_mark old, new;
-
 	BUG_ON(type != BCH_DATA_SB &&
 	       type != BCH_DATA_JOURNAL);
 
+	preempt_disable();
+
 	if (likely(c)) {
 		percpu_rwsem_assert_held(&c->usage_lock);
 
-		if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-		    gc_will_visit(c, pos))
-			return;
-
-		preempt_disable();
-		stats = this_cpu_ptr(c->usage_percpu);
-
-		g = bucket(ca, b);
-		old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
-			new.data_type = type;
-			checked_add(new.dirty_sectors, sectors);
-		}));
-
-		stats->replicas[0].data[type] += sectors;
-		preempt_enable();
+		if (!(flags & BCH_BUCKET_MARK_GC))
+			__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+						    false);
+		if ((flags & BCH_BUCKET_MARK_GC) ||
+		    gc_visited(c, pos))
+			__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+						    true);
 	} else {
+		struct bucket *g;
+		struct bucket_mark old, new;
+
 		rcu_read_lock();
 
 		g = bucket(ca, b);
@@ -533,8 +538,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 		rcu_read_unlock();
 	}
 
-	BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
-	       bucket_became_unavailable(c, old, new));
+	preempt_enable();
 }
 
 static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
@@ -579,23 +583,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
 			      struct extent_ptr_decoded p,
 			      s64 sectors, enum bch_data_type data_type,
 			      struct bch_fs_usage *fs_usage,
-			      u64 journal_seq, unsigned flags)
+			      u64 journal_seq, unsigned flags,
+			      bool gc)
 {
 	struct bucket_mark old, new;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-	struct bucket *g = PTR_BUCKET(ca, &p.ptr);
+	size_t b = PTR_BUCKET_NR(ca, &p.ptr);
+	struct bucket *g = __bucket(ca, b, gc);
 	u64 v;
 
-	if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
-		if (journal_seq)
-			bucket_cmpxchg(g, new, ({
-				new.journal_seq_valid	= 1;
-				new.journal_seq		= journal_seq;
-			}));
-
-		return;
-	}
-
 	v = atomic64_read(&g->_mark.v);
 	do {
 		new.v.counter = old.v.counter = v;
@@ -637,10 +633,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
 			      old.v.counter,
 			      new.v.counter)) != old.v.counter);
 
-	bch2_dev_usage_update(c, ca, fs_usage, old, new);
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
 
-	BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
-	       bucket_became_unavailable(c, old, new));
+	BUG_ON(!gc && bucket_became_unavailable(old, new));
 }
 
 static void bch2_mark_stripe_ptr(struct bch_fs *c,
@@ -688,9 +683,9 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c,
 
 static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
 			     s64 sectors, enum bch_data_type data_type,
-			     struct gc_pos pos,
 			     struct bch_fs_usage *stats,
-			     u64 journal_seq, unsigned flags)
+			     u64 journal_seq, unsigned flags,
+			     bool gc)
 {
 	BUG_ON(!sectors);
 
@@ -712,7 +707,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
 			s64 adjusted_disk_sectors = disk_sectors;
 
 			bch2_mark_pointer(c, e, p, disk_sectors, data_type,
-					  stats, journal_seq, flags);
+					  stats, journal_seq, flags, gc);
 
 			if (!p.ptr.cached)
 				for (i = 0; i < p.ec_nr; i++)
@@ -758,21 +753,20 @@ static void bucket_set_stripe(struct bch_fs *c,
 			      const struct bch_stripe *v,
 			      bool enabled,
 			      struct bch_fs_usage *fs_usage,
-			      u64 journal_seq)
+			      u64 journal_seq,
+			      bool gc)
 {
 	unsigned i;
 
 	for (i = 0; i < v->nr_blocks; i++) {
 		const struct bch_extent_ptr *ptr = v->ptrs + i;
 		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-		struct bucket *g;
+		size_t b = PTR_BUCKET_NR(ca, ptr);
+		struct bucket *g = __bucket(ca, b, gc);
 		struct bucket_mark new, old;
 
 		BUG_ON(ptr_stale(ca, ptr));
 
-		rcu_read_lock();
-		g = PTR_BUCKET(ca, ptr);
-
 		old = bucket_cmpxchg(g, new, ({
 			new.stripe			= enabled;
 			if (journal_seq) {
@@ -780,18 +774,18 @@ static void bucket_set_stripe(struct bch_fs *c,
 				new.journal_seq		= journal_seq;
 			}
 		}));
-		rcu_read_unlock();
 
 		BUG_ON(old.stripe == enabled);
 
-		bch2_dev_usage_update(c, ca, fs_usage, old, new);
+		bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
 	}
 }
 
 static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
-			     bool inserting, struct gc_pos pos,
+			     bool inserting,
 			     struct bch_fs_usage *fs_usage,
-			     u64 journal_seq, unsigned flags)
+			     u64 journal_seq, unsigned flags,
+			     bool gc)
 {
 	switch (k.k->type) {
 	case BCH_STRIPE: {
@@ -820,74 +814,64 @@ static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 		else
 			bch2_stripes_heap_del(c, m, idx);
 
-		bucket_set_stripe(c, s.v, inserting, fs_usage, 0);
+		bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
 		break;
 	}
 	}
 }
 
-void bch2_mark_key(struct bch_fs *c,
-		   enum bkey_type type, struct bkey_s_c k,
-		   bool inserting, s64 sectors,
-		   struct gc_pos pos,
-		   struct bch_fs_usage *stats,
-		   u64 journal_seq, unsigned flags)
+static void __bch2_mark_key(struct bch_fs *c,
+			    enum bkey_type type, struct bkey_s_c k,
+			    bool inserting, s64 sectors,
+			    struct bch_fs_usage *stats,
+			    u64 journal_seq, unsigned flags,
+			    bool gc)
 {
-	/*
-	 * synchronization w.r.t. GC:
-	 *
-	 * Normally, bucket sector counts/marks are updated on the fly, as
-	 * references are added/removed from the btree, the lists of buckets the
-	 * allocator owns, other metadata buckets, etc.
-	 *
-	 * When GC is in progress and going to mark this reference, we do _not_
-	 * mark this reference here, to avoid double counting - GC will count it
-	 * when it gets to it.
-	 *
-	 * To know whether we should mark a given reference (GC either isn't
-	 * running, or has already marked references at this position) we
-	 * construct a total order for everything GC walks. Then, we can simply
-	 * compare the position of the reference we're marking - @pos - with
-	 * GC's current position. If GC is going to mark this reference, GC's
-	 * current position will be less than @pos; if GC's current position is
-	 * greater than @pos GC has either already walked this position, or
-	 * isn't running.
-	 *
-	 * To avoid racing with GC's position changing, we have to deal with
-	 *  - GC's position being set to GC_POS_MIN when GC starts:
-	 *    usage_lock guards against this
-	 *  - GC's position overtaking @pos: we guard against this with
-	 *    whatever lock protects the data structure the reference lives in
-	 *    (e.g. the btree node lock, or the relevant allocator lock).
-	 */
-
-	percpu_down_read(&c->usage_lock);
-	if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-	    gc_will_visit(c, pos))
-		flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
-
-	if (!stats)
-		stats = this_cpu_ptr(c->usage_percpu);
-
 	switch (type) {
 	case BKEY_TYPE_BTREE:
 		bch2_mark_extent(c, k, inserting
 				 ?  c->opts.btree_node_size
 				 : -c->opts.btree_node_size,
 				 BCH_DATA_BTREE,
-				 pos, stats, journal_seq, flags);
+				 stats, journal_seq, flags, gc);
 		break;
 	case BKEY_TYPE_EXTENTS:
 		bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
-				 pos, stats, journal_seq, flags);
+				 stats, journal_seq, flags, gc);
 		break;
 	case BKEY_TYPE_EC:
 		bch2_mark_stripe(c, k, inserting,
-				 pos, stats, journal_seq, flags);
+				 stats, journal_seq, flags, gc);
 		break;
 	default:
 		break;
 	}
+}
+
+void bch2_mark_key(struct bch_fs *c,
+		   enum bkey_type type, struct bkey_s_c k,
+		   bool inserting, s64 sectors,
+		   struct gc_pos pos,
+		   struct bch_fs_usage *stats,
+		   u64 journal_seq, unsigned flags)
+{
+	percpu_down_read(&c->usage_lock);
+
+	if (!(flags & BCH_BUCKET_MARK_GC)) {
+		if (!stats)
+			stats = this_cpu_ptr(c->usage[0]);
+
+		__bch2_mark_key(c, type, k, inserting, sectors,
+				stats, journal_seq, flags, false);
+	}
+
+	if ((flags & BCH_BUCKET_MARK_GC) ||
+	    gc_visited(c, pos)) {
+		__bch2_mark_key(c, type, k, inserting, sectors,
+				this_cpu_ptr(c->usage[1]),
+				journal_seq, flags, true);
+	}
+
 	percpu_up_read(&c->usage_lock);
 }
 
@@ -963,28 +947,20 @@ void bch2_mark_update(struct btree_insert *trans,
 
 /* Disk reservations: */
 
-static u64 __recalc_sectors_available(struct bch_fs *c)
+static u64 bch2_recalc_sectors_available(struct bch_fs *c)
 {
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
+		per_cpu_ptr(c->usage[0], cpu)->available_cache = 0;
 
 	return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
 }
 
-/* Used by gc when it's starting: */
-void bch2_recalc_sectors_available(struct bch_fs *c)
-{
-	percpu_down_write(&c->usage_lock);
-	atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
-	percpu_up_write(&c->usage_lock);
-}
-
 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
 	percpu_down_read(&c->usage_lock);
-	this_cpu_sub(c->usage_percpu->online_reserved,
+	this_cpu_sub(c->usage[0]->online_reserved,
 		     res->sectors);
 
 	bch2_fs_stats_verify(c);
@@ -1005,7 +981,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 
 	percpu_down_read(&c->usage_lock);
 	preempt_disable();
-	stats = this_cpu_ptr(c->usage_percpu);
+	stats = this_cpu_ptr(c->usage[0]);
 
 	if (sectors <= stats->available_cache)
 		goto out;
@@ -1055,7 +1031,7 @@ recalculate:
 	}
 
 	percpu_down_write(&c->usage_lock);
-	sectors_available = __recalc_sectors_available(c);
+	sectors_available = bch2_recalc_sectors_available(c);
 
 	if (sectors <= sectors_available ||
 	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
@@ -1110,7 +1086,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 7);
 	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
 				      btree_reserve);
-	bool resize = ca->buckets != NULL,
+	bool resize = ca->buckets[0] != NULL,
 	     start_copygc = ca->copygc_thread != NULL;
 	int ret = -ENOMEM;
 	unsigned i;
@@ -1170,7 +1146,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 		       BITS_TO_LONGS(n) * sizeof(unsigned long));
 	}
 
-	rcu_assign_pointer(ca->buckets, buckets);
+	rcu_assign_pointer(ca->buckets[0], buckets);
 	buckets = old_buckets;
 
 	swap(ca->oldest_gens, oldest_gens);
@@ -1239,16 +1215,16 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
 	kvpfree(ca->buckets_dirty,
 		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
 	kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
-	kvpfree(rcu_dereference_protected(ca->buckets, 1),
+	kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
 		sizeof(struct bucket_array) +
 		ca->mi.nbuckets * sizeof(struct bucket));
 
-	free_percpu(ca->usage_percpu);
+	free_percpu(ca->usage[0]);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-	if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
+	if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
 		return -ENOMEM;
 
 	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index b48960fa5ce7..813e0c44e107 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -29,23 +29,34 @@
 	_old;							\
 })
 
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
+						  bool gc)
 {
-	return rcu_dereference_check(ca->buckets,
+	return rcu_dereference_check(ca->buckets[gc],
 				     !ca->fs ||
 				     percpu_rwsem_is_held(&ca->fs->usage_lock) ||
 				     lockdep_is_held(&ca->fs->gc_lock) ||
 				     lockdep_is_held(&ca->bucket_lock));
 }
 
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+{
+	return __bucket_array(ca, false);
+}
+
+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
 {
-	struct bucket_array *buckets = bucket_array(ca);
+	struct bucket_array *buckets = __bucket_array(ca, gc);
 
 	BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
 	return buckets->b + b;
 }
 
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+{
+	return __bucket(ca, b, false);
+}
+
 static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
 					 size_t b, int rw)
 {
@@ -129,7 +140,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
 
 /* Device usage: */
 
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
 
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
@@ -168,7 +179,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
 
 /* Filesystem usage: */
 
-struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
 struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
 void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
 			 struct disk_reservation *, struct gc_pos);
@@ -207,17 +218,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 			       struct gc_pos, unsigned);
 
 #define BCH_BUCKET_MARK_NOATOMIC		(1 << 0)
-#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE	(1 << 1)
-#define BCH_BUCKET_MARK_GC_WILL_VISIT		(1 << 2)
-#define BCH_BUCKET_MARK_GC_LOCK_HELD		(1 << 3)
+#define BCH_BUCKET_MARK_GC			(1 << 1)
 
 void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
 		   bool, s64, struct gc_pos,
 		   struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
 
-void bch2_recalc_sectors_available(struct bch_fs *);
-
 void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 9ec96dbab0e8..0187f465d23f 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -64,8 +64,6 @@ struct bch_dev_usage {
 
 struct bch_fs_usage {
 	/* all fields are in units of 512 byte sectors: */
-	u64			online_reserved;
-	u64			available_cache;
 
 	struct {
 		u64		data[BCH_DATA_NR];
@@ -74,6 +72,10 @@ struct bch_fs_usage {
 	}			replicas[BCH_REPLICAS_MAX];
 
 	u64			buckets[BCH_DATA_NR];
+
+	/* fields starting here aren't touched by gc: */
+	u64			online_reserved;
+	u64			available_cache;
 };
 
 /*
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 939caa3b8183..4045c0e68462 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -782,9 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 		bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
 				ca->mi.bucket_size,
 				gc_phase(GC_PHASE_SB),
-				new_fs
-				? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
-				: 0);
+				0);
 
 		if (c) {
 			spin_unlock(&c->journal.lock);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 931e50e8ad57..59f2aa7e047c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	bch2_io_clock_exit(&c->io_clock[READ]);
 	bch2_fs_compress_exit(c);
 	percpu_free_rwsem(&c->usage_lock);
-	free_percpu(c->usage_percpu);
+	free_percpu(c->usage[0]);
 	mempool_exit(&c->btree_iters_pool);
 	mempool_exit(&c->btree_bounce_pool);
 	bioset_exit(&c->btree_bio);
@@ -606,7 +606,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			max(offsetof(struct btree_read_bio, bio),
 			    offsetof(struct btree_write_bio, wbio.bio)),
 			BIOSET_NEED_BVECS) ||
-	    !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
+	    !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
 	    percpu_init_rwsem(&c->usage_lock) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
@@ -1028,8 +1028,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
 		return ret;
 
 	mutex_lock(&c->sb_lock);
-	bch2_mark_dev_superblock(ca->fs, ca,
-			BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+	bch2_mark_dev_superblock(ca->fs, ca, 0);
 	mutex_unlock(&c->sb_lock);
 
 	bch2_dev_sysfs_online(c, ca);
@@ -1314,7 +1313,7 @@ static void dev_usage_clear(struct bch_dev *ca)
 
 	for_each_possible_cpu(cpu) {
 		struct bch_dev_usage *p =
-			per_cpu_ptr(ca->usage_percpu, cpu);
+			per_cpu_ptr(ca->usage[0], cpu);
 		memset(p, 0, sizeof(*p));
 	}
 
@@ -1375,8 +1374,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	 * allocate the journal, reset all the marks, then remark after we
 	 * attach...
 	 */
-	bch2_mark_dev_superblock(ca->fs, ca,
-			BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+	bch2_mark_dev_superblock(ca->fs, ca, 0);
 
 	err = "journal alloc failed";
 	ret = bch2_dev_journal_alloc(ca);
@@ -1435,8 +1433,7 @@ have_slot:
 	ca->disk_sb.sb->dev_idx	= dev_idx;
 	bch2_dev_attach(c, ca, dev_idx);
 
-	bch2_mark_dev_superblock(c, ca,
-			BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+	bch2_mark_dev_superblock(c, ca, 0);
 
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 188e19572d91..8eacc0d2550b 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -478,7 +478,7 @@ STORE(__bch2_fs)
 		bch2_coalesce(c);
 
 	if (attr == &sysfs_trigger_gc)
-		bch2_gc(c);
+		bch2_gc(c, NULL, false);
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;
author	Kent Overstreet <kent.overstreet@gmail.com>	2018-07-23 05:32:01 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-10-22 17:08:12 -0400
commit	9ca53b55f7415783c6cc8b751c99f2af6cc0a932 (patch)
tree	cef41ef35075c7bfaa765faf6ab9c5d58f0a56b4 /fs/bcachefs
parent	e647369168e02a06ff5ee229cc14ad72b2f5ddfd (diff)