summary | refs | log | tree | commit | diff
path: root/fs/bcachefs
diff options
context:
space:
mode:
author Kent Overstreet <kent.overstreet@gmail.com> 2018-07-23 05:32:01 -0400
committer Kent Overstreet <kent.overstreet@linux.dev> 2023-10-22 17:08:12 -0400
commit 9ca53b55f7415783c6cc8b751c99f2af6cc0a932 (patch)
tree cef41ef35075c7bfaa765faf6ab9c5d58f0a56b4 /fs/bcachefs
parent e647369168e02a06ff5ee229cc14ad72b2f5ddfd (diff)
bcachefs: gc now operates on second set of bucket marks
This means we can now use gc to verify the allocation information - important for testing persistent alloc info.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r-- fs/bcachefs/alloc_background.c 13
-rw-r--r-- fs/bcachefs/bcachefs.h 14
-rw-r--r-- fs/bcachefs/btree_gc.c 399
-rw-r--r-- fs/bcachefs/btree_gc.h 6
-rw-r--r-- fs/bcachefs/btree_update_interior.c 50
-rw-r--r-- fs/bcachefs/buckets.c 340
-rw-r--r-- fs/bcachefs/buckets.h 29
-rw-r--r-- fs/bcachefs/buckets_types.h 6
-rw-r--r-- fs/bcachefs/journal.c 4
-rw-r--r-- fs/bcachefs/super.c 15
-rw-r--r-- fs/bcachefs/sysfs.c 2
11 files changed, 495 insertions, 383 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c17fba1eae96..3f0e2dd29fde 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -930,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
pr_debug("free_inc now empty");
do {
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
- up_read(&c->gc_lock);
- bch_err(ca, "gc failure");
- goto stop;
- }
-
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@@ -1293,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
bool invalidating_data = false;
int ret = 0;
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return -1;
-
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
@@ -1321,9 +1312,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
continue;
bch2_mark_alloc_bucket(c, ca, bu, true,
- gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
+ gc_pos_alloc(c, NULL), 0);
fifo_push(&ca->free_inc, bu);
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index cdea3a1d9176..eaa2055000b6 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -347,7 +347,6 @@ enum gc_phase {
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
- GC_PHASE_DONE
};
struct gc_pos {
@@ -392,15 +391,14 @@ struct bch_dev {
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
- struct bucket_array __rcu *buckets;
+ struct bucket_array __rcu *buckets[2];
unsigned long *buckets_dirty;
unsigned long *buckets_written;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct rw_semaphore bucket_lock;
- struct bch_dev_usage __percpu *usage_percpu;
- struct bch_dev_usage usage_cached;
+ struct bch_dev_usage __percpu *usage[2];
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@@ -478,7 +476,6 @@ enum {
/* errors: */
BCH_FS_ERROR,
- BCH_FS_GC_FAILURE,
/* misc: */
BCH_FS_BDEV_MOUNTED,
@@ -614,8 +611,8 @@ struct bch_fs {
atomic64_t sectors_available;
- struct bch_fs_usage __percpu *usage_percpu;
- struct bch_fs_usage usage_cached;
+ struct bch_fs_usage __percpu *usage[2];
+
struct percpu_rw_semaphore usage_lock;
struct closure_waitlist freelist_wait;
@@ -656,9 +653,6 @@ struct bch_fs {
*
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
*
- * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
- * currently running, and gc marks are currently valid
- *
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
*/
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index e900fd4ffd06..6eba65fcb52c 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -260,8 +260,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
{
struct gc_pos pos = { 0 };
unsigned flags =
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD|
+ BCH_BUCKET_MARK_GC|
(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
int ret = 0;
@@ -484,9 +483,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
BCH_DATA_SB, flags);
}
- if (c)
- spin_lock(&c->journal.lock);
-
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
@@ -495,7 +491,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
}
if (c) {
- spin_unlock(&c->journal.lock);
percpu_up_read(&c->usage_lock);
} else {
preempt_enable();
@@ -511,9 +506,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
- bch2_mark_dev_superblock(c, ca,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
+ bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
mutex_unlock(&c->sb_lock);
}
@@ -521,7 +514,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct gc_pos pos = { 0 };
- struct bch_fs_usage stats = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@@ -533,13 +525,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
true, 0,
- pos, &stats, 0,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- /*
- * Don't apply stats - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
+ pos, NULL, 0,
+ BCH_BUCKET_MARK_GC);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -560,8 +547,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
+ BCH_BUCKET_MARK_GC);
@@ -569,8 +555,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
+ BCH_BUCKET_MARK_GC);
}
spin_unlock(&c->freelist_lock);
@@ -584,8 +569,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
+ BCH_BUCKET_MARK_GC);
}
spin_unlock(&ob->lock);
}
@@ -593,122 +577,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
percpu_up_read(&c->usage_lock);
}
-static void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_free(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+ sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket));
+ ca->buckets[1] = NULL;
+
+ free_percpu(ca->usage[1]);
+ ca->usage[1] = NULL;
+ }
+
+ free_percpu(c->usage[1]);
+ c->usage[1] = NULL;
+}
+
+static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket_mark new;
unsigned i;
- size_t b;
int cpu;
- percpu_down_write(&c->usage_lock);
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *src = __bucket_array(ca, 1);
- /*
- * Indicates to buckets code that gc is now in progress - done under
- * usage_lock to avoid racing with bch2_mark_key():
- */
- __gc_pos_set(c, gc_phase(GC_PHASE_START));
+ memcpy(__bucket_array(ca, 0), src,
+ sizeof(struct bucket_array) +
+ sizeof(struct bucket) * src->nbuckets);
+ };
- /* Save a copy of the existing bucket stats while we recompute them: */
for_each_member_device(ca, c, i) {
- ca->usage_cached = __bch2_dev_usage_read(ca);
+ struct bch_dev_usage *p;
+
for_each_possible_cpu(cpu) {
- struct bch_dev_usage *p =
- per_cpu_ptr(ca->usage_percpu, cpu);
+ p = per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
+
+ preempt_disable();
+ *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
+ preempt_enable();
}
- c->usage_cached = __bch2_fs_usage_read(c);
- for_each_possible_cpu(cpu) {
- struct bch_fs_usage *p =
- per_cpu_ptr(c->usage_percpu, cpu);
+ {
+ struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+ struct bch_fs_usage *p;
- memset(p->replicas, 0, sizeof(p->replicas));
- memset(p->buckets, 0, sizeof(p->buckets));
+ for_each_possible_cpu(cpu) {
+ p = per_cpu_ptr(c->usage[0], cpu);
+ memset(p, 0, offsetof(typeof(*p), online_reserved));
+ }
+
+ preempt_disable();
+ memcpy(this_cpu_ptr(c->usage[0]),
+ &src,
+ offsetof(typeof(*p), online_reserved));
+ preempt_enable();
}
+}
+
+static void bch2_gc_done(struct bch_fs *c, bool initial)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int cpu;
+
+#define copy_field(_f, _msg, ...) \
+ if (dst._f != src._f) { \
+ pr_info(_msg ": got %llu, should be %llu, fixing" \
+ , ##__VA_ARGS__, dst._f, src._f); \
+ dst._f = src._f; \
+ }
+#define copy_bucket_field(_f) \
+ if (dst->b[b].mark._f != src->b[b].mark._f) { \
+ pr_info("dev %u bucket %zu has wrong " #_f \
+ ": got %u, should be %u, fixing", \
+ i, b, dst->b[b].mark._f, src->b[b].mark._f); \
+ dst->b[b]._mark._f = src->b[b].mark._f; \
+ }
+#define copy_dev_field(_f, _msg, ...) \
+ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
+#define copy_fs_field(_f, _msg, ...) \
+ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+ percpu_down_write(&c->usage_lock);
+
+ if (initial) {
+ bch2_gc_done_nocheck(c);
+ goto out;
+ }
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *dst = __bucket_array(ca, 0);
+ struct bucket_array *src = __bucket_array(ca, 1);
+ size_t b;
+
+ if (initial) {
+ memcpy(dst, src,
+ sizeof(struct bucket_array) +
+ sizeof(struct bucket) * dst->nbuckets);
+ }
+
+ for (b = 0; b < src->nbuckets; b++) {
+ copy_bucket_field(gen);
+ copy_bucket_field(data_type);
+ copy_bucket_field(owned_by_allocator);
+ copy_bucket_field(stripe);
+ copy_bucket_field(dirty_sectors);
+ copy_bucket_field(cached_sectors);
+ }
+ };
+
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
+ struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
+ struct bch_dev_usage *p;
+ unsigned b;
+
+ for (b = 0; b < BCH_DATA_NR; b++)
+ copy_dev_field(buckets[b],
+ "buckets[%s]", bch2_data_types[b]);
+ copy_dev_field(buckets_alloc, "buckets_alloc");
+ copy_dev_field(buckets_ec, "buckets_ec");
+
+ for (b = 0; b < BCH_DATA_NR; b++)
+ copy_dev_field(sectors[b],
+ "sectors[%s]", bch2_data_types[b]);
+ copy_dev_field(sectors_fragmented,
+ "sectors_fragmented");
+
+ for_each_possible_cpu(cpu) {
+ p = per_cpu_ptr(ca->usage[0], cpu);
+ memset(p, 0, sizeof(*p));
+ }
+
+ preempt_disable();
+ p = this_cpu_ptr(ca->usage[0]);
+ *p = dst;
+ preempt_enable();
+ }
+
+ {
+ struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
+ struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+ struct bch_fs_usage *p;
+ unsigned r, b;
+
+ for (r = 0; r < BCH_REPLICAS_MAX; r++) {
+ for (b = 0; b < BCH_DATA_NR; b++)
+ copy_fs_field(replicas[r].data[b],
+ "replicas[%i].data[%s]",
+ r, bch2_data_types[b]);
+ copy_fs_field(replicas[r].ec_data,
+ "replicas[%i].ec_data", r);
+ copy_fs_field(replicas[r].persistent_reserved,
+ "replicas[%i].persistent_reserved", r);
+ }
+
+ for (b = 0; b < BCH_DATA_NR; b++)
+ copy_fs_field(buckets[b],
+ "buckets[%s]", bch2_data_types[b]);
+
+ for_each_possible_cpu(cpu) {
+ p = per_cpu_ptr(c->usage[0], cpu);
+ memset(p, 0, offsetof(typeof(*p), online_reserved));
+ }
+
+ preempt_disable();
+ p = this_cpu_ptr(c->usage[0]);
+ memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
+ preempt_enable();
+ }
+out:
percpu_up_write(&c->usage_lock);
- /* Clear bucket marks: */
+#undef copy_field
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_bucket_field
+}
+
+static int bch2_gc_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ BUG_ON(c->usage[1]);
+
+ c->usage[1] = alloc_percpu(struct bch_fs_usage);
+ if (!c->usage[1])
+ return -ENOMEM;
+
for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- bucket_cmpxchg(buckets->b + b, new, ({
- new.owned_by_allocator = 0;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- new.stripe = 0;
- }));
- ca->oldest_gens[b] = new.gen;
+ BUG_ON(ca->buckets[1]);
+ BUG_ON(ca->usage[1]);
+
+ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!ca->buckets[1]) {
+ percpu_ref_put(&ca->ref);
+ return -ENOMEM;
+ }
+
+ ca->usage[1] = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage[1]) {
+ percpu_ref_put(&ca->ref);
+ return -ENOMEM;
}
- up_read(&ca->bucket_lock);
}
+
+ percpu_down_write(&c->usage_lock);
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *dst = __bucket_array(ca, 1);
+ struct bucket_array *src = __bucket_array(ca, 0);
+ size_t b;
+
+ dst->first_bucket = src->first_bucket;
+ dst->nbuckets = src->nbuckets;
+
+ for (b = 0; b < src->nbuckets; b++)
+ dst->b[b]._mark.gen = src->b[b].mark.gen;
+ };
+
+ percpu_up_write(&c->usage_lock);
+
+ return 0;
}
/**
- * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * Order matters here:
+ * - Concurrent GC relies on the fact that we have a total ordering for
+ * everything that GC walks - see gc_will_visit_node(),
+ * gc_will_visit_root()
+ *
+ * - also, references move around in the course of index updates and
+ * various other crap: everything needs to agree on the ordering
+ * references are allowed to move around in - e.g., we're allowed to
+ * start with a reference owned by an open_bucket (the allocator) and
+ * move it to the btree, but not the reverse.
+ *
+ * This is necessary to ensure that gc doesn't miss references that
+ * move around - if references move backwards in the ordering GC
+ * uses, GC could skip past them
*/
-void bch2_gc(struct bch_fs *c)
+int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
- unsigned i;
+ unsigned i, iter = 0;
int ret;
- /*
- * Walk _all_ references to buckets, and recompute them:
- *
- * Order matters here:
- * - Concurrent GC relies on the fact that we have a total ordering for
- * everything that GC walks - see gc_will_visit_node(),
- * gc_will_visit_root()
- *
- * - also, references move around in the course of index updates and
- * various other crap: everything needs to agree on the ordering
- * references are allowed to move around in - e.g., we're allowed to
- * start with a reference owned by an open_bucket (the allocator) and
- * move it to the btree, but not the reverse.
- *
- * This is necessary to ensure that gc doesn't miss references that
- * move around - if references move backwards in the ordering GC
- * uses, GC could skip past them
- */
trace_gc_start(c);
- /*
- * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
- * gc_lock if sectors_available goes to 0:
- */
- bch2_recalc_sectors_available(c);
-
down_write(&c->gc_lock);
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+again:
+ ret = bch2_gc_start(c);
+ if (ret)
goto out;
- bch2_gc_start(c);
-
bch2_mark_superblocks(c);
- ret = bch2_gc_btrees(c, NULL, false);
- if (ret) {
- bch_err(c, "btree gc failed: %d", ret);
- set_bit(BCH_FS_GC_FAILURE, &c->flags);
+ ret = bch2_gc_btrees(c, journal, initial);
+ if (ret)
goto out;
- }
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
- /* Indicates that gc is no longer in progress: */
- gc_pos_set(c, gc_phase(GC_PHASE_DONE));
c->gc_count++;
out:
+ if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
+ /*
+ * XXX: make sure gens we fixed got saved
+ */
+ if (iter++ <= 2) {
+ bch_info(c, "Fixed gens, restarting mark and sweep:");
+ clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+ goto again;
+ }
+
+ bch_info(c, "Unable to fix bucket gens, looping");
+ ret = -EINVAL;
+ }
+
+ if (!ret)
+ bch2_gc_done(c, initial);
+
+ /* Indicates that gc is no longer in progress: */
+ __gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+ bch2_gc_free(c);
up_write(&c->gc_lock);
+
+ if (!ret && initial)
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
trace_gc_end(c);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
@@ -724,6 +896,7 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
+ return ret;
}
/* Btree coalescing */
@@ -1039,9 +1212,6 @@ void bch2_coalesce(struct bch_fs *c)
{
enum btree_id id;
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return;
-
down_read(&c->gc_lock);
trace_gc_coalesce_start(c);
@@ -1053,7 +1223,6 @@ void bch2_coalesce(struct bch_fs *c)
if (ret) {
if (ret != -ESHUTDOWN)
bch_err(c, "btree coalescing failed: %d", ret);
- set_bit(BCH_FS_GC_FAILURE, &c->flags);
return;
}
}
@@ -1068,6 +1237,7 @@ static int bch2_gc_thread(void *arg)
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
+ int ret;
set_freezable();
@@ -1101,7 +1271,9 @@ static int bch2_gc_thread(void *arg)
last = atomic_long_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
- bch2_gc(c);
+ ret = bch2_gc(c, NULL, false);
+ if (ret)
+ bch_err(c, "btree gc failed: %i", ret);
debug_check_no_locks_held();
}
@@ -1142,30 +1314,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
- unsigned iter = 0;
- int ret = 0;
-
- down_write(&c->gc_lock);
-again:
- bch2_gc_start(c);
-
- bch2_mark_superblocks(c);
-
- ret = bch2_gc_btrees(c, journal, true);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
- if (iter++ > 2) {
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
- goto err;
- }
-
- bch_info(c, "Fixed gens, restarting initial mark and sweep:");
- clear_bit(BCH_FS_FIXED_GENS, &c->flags);
- goto again;
- }
+ int ret = bch2_gc(c, journal, true);
/*
* Skip past versions that might have possibly been used (as nonces),
@@ -1174,9 +1323,5 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
- gc_pos_set(c, gc_phase(GC_PHASE_DONE));
- set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-err:
- up_write(&c->gc_lock);
return ret;
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 47a590015325..bb77564b9463 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -7,7 +7,7 @@
enum bkey_type;
void bch2_coalesce(struct bch_fs *);
-void bch2_gc(struct bch_fs *);
+int bch2_gc(struct bch_fs *, struct list_head *, bool);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
int bch2_initial_gc(struct bch_fs *, struct list_head *);
@@ -105,14 +105,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
};
}
-static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
bool ret;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
- ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index af31819c88c7..2631b0732d4b 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -160,7 +160,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
{
struct bch_fs *c = as->c;
struct pending_btree_node_free *d;
- unsigned replicas;
/*
* btree_update lock is only needed here to avoid racing with
@@ -179,15 +178,6 @@ found:
d->index_update_done = true;
/*
- * Btree nodes are accounted as freed in bch_alloc_stats when they're
- * freed from the index:
- */
- replicas = bch2_extent_nr_dirty_ptrs(k);
- if (replicas)
- stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
- c->opts.btree_node_size * replicas;
-
- /*
* We're dropping @k from the btree, but it's still live until the
* index update is persistent so we need to keep a reference around for
* mark and sweep to find - that's primarily what the
@@ -208,15 +198,16 @@ found:
* bch2_mark_key() compares the current gc pos to the pos we're
* moving this reference from, hence one comparison here:
*/
- if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
- struct bch_fs_usage tmp = { 0 };
+ if (gc_pos_cmp(c->gc_pos, b
+ ? gc_pos_btree_node(b)
+ : gc_pos_btree_root(as->btree_id)) >= 0 &&
+ gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
+ struct gc_pos pos = { 0 };
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
- false, 0, b
- ? gc_pos_btree_node(b)
- : gc_pos_btree_root(as->btree_id),
- &tmp, 0, 0);
+ false, 0, pos,
+ NULL, 0, BCH_BUCKET_MARK_GC);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
@@ -287,19 +278,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
struct pending_btree_node_free *pending)
{
- struct bch_fs_usage stats = { 0 };
-
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&pending->key),
false, 0,
gc_phase(GC_PHASE_PENDING_DELETE),
- &stats, 0, 0);
- /*
- * Don't apply stats - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
+ NULL, 0, 0);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@@ -1939,6 +1924,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
btree_interior_update_add_node_reference(as, b);
+ /*
+ * XXX: the rest of the update path treats this like we're actually
+ * inserting a new node and deleting the existing node, so the
+ * reservation needs to include enough space for @b
+ *
+ * that is actually sketch as fuck though and I am surprised the code
+ * seems to work like that, definitely need to go back and rework it
+ * into something saner.
+ *
+ * (I think @b is just getting double counted until the btree update
+ * finishes and "deletes" @b on disk)
+ */
+ ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
+ c->opts.btree_node_size *
+ bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)),
+ BCH_DISK_RESERVATION_NOFAIL|
+ BCH_DISK_RESERVATION_GC_LOCK_HELD);
+ BUG_ON(ret);
+
parent = btree_node_parent(iter, b);
if (parent) {
if (new_hash) {
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 201798866242..2ebe8bad978e 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -85,8 +85,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
static void bch2_fs_stats_verify(struct bch_fs *c)
{
- struct bch_fs_usage stats =
- __bch2_fs_usage_read(c);
+ struct bch_fs_usage stats =_bch2_fs_usage_read(c);
unsigned i, j;
for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
@@ -209,43 +208,24 @@ do { \
_acc; \
})
-#define bch2_usage_read_cached(_c, _cached, _uncached) \
-({ \
- typeof(_cached) _ret; \
- unsigned _seq; \
- \
- do { \
- _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
- _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
- ? bch2_usage_read_raw(_uncached) \
- : (_cached); \
- } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
- \
- _ret; \
-})
-
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
{
- return bch2_usage_read_raw(ca->usage_percpu);
+ return bch2_usage_read_raw(ca->usage[gc]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
- return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
+ return bch2_usage_read_raw(ca->usage[0]);
}
-struct bch_fs_usage
-__bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
{
- return bch2_usage_read_raw(c->usage_percpu);
+ return bch2_usage_read_raw(c->usage[gc]);
}
-struct bch_fs_usage
-bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
{
- return bch2_usage_read_cached(c,
- c->usage_cached,
- c->usage_percpu);
+ return bch2_usage_read_raw(c->usage[0]);
}
struct fs_usage_sum {
@@ -327,13 +307,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
: m.data_type;
}
-static bool bucket_became_unavailable(struct bch_fs *c,
- struct bucket_mark old,
+static bool bucket_became_unavailable(struct bucket_mark old,
struct bucket_mark new)
{
return is_available_bucket(old) &&
- !is_available_bucket(new) &&
- (!c || c->gc_pos.phase == GC_PHASE_DONE);
+ !is_available_bucket(new);
}
void bch2_fs_usage_apply(struct bch_fs *c,
@@ -364,11 +342,13 @@ void bch2_fs_usage_apply(struct bch_fs *c,
percpu_down_read(&c->usage_lock);
preempt_disable();
/* online_reserved not subject to gc: */
- this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
+ this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved);
stats->online_reserved = 0;
- if (!gc_will_visit(c, gc_pos))
- bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
+ bch2_usage_add(this_cpu_ptr(c->usage[0]), stats);
+
+ if (gc_visited(c, gc_pos))
+ bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
bch2_fs_stats_verify(c);
preempt_enable();
@@ -378,8 +358,9 @@ void bch2_fs_usage_apply(struct bch_fs *c,
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bch_fs_usage *stats,
- struct bucket_mark old, struct bucket_mark new)
+ struct bch_fs_usage *fs_usage,
+ struct bucket_mark old, struct bucket_mark new,
+ bool gc)
{
struct bch_dev_usage *dev_usage;
@@ -391,14 +372,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_data_types[old.data_type],
bch2_data_types[new.data_type]);
- stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
- stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
-
preempt_disable();
- dev_usage = this_cpu_ptr(ca->usage_percpu);
+ dev_usage = this_cpu_ptr(ca->usage[gc]);
- dev_usage->buckets[bucket_type(old)]--;
- dev_usage->buckets[bucket_type(new)]++;
+ if (bucket_type(old) != bucket_type(new)) {
+ if (bucket_type(old)) {
+ fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
+ dev_usage->buckets[bucket_type(old)]--;
+ } else {
+ fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
+ dev_usage->buckets[bucket_type(new)]++;
+ }
+ }
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
@@ -425,21 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
- bch2_dev_usage_update(c, ca, stats, _old, new); \
+ bch2_dev_usage_update(c, ca, stats, _old, new, gc); \
_old; \
})
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *old)
+static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, struct bucket_mark *old,
+ bool gc)
{
- struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
- struct bucket *g;
+ struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+ struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new;
- percpu_rwsem_assert_held(&c->usage_lock);
-
- g = bucket(ca, b);
-
*old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
BUG_ON(!is_available_bucket(new));
@@ -450,38 +432,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.gen++;
}));
- /*
- * This isn't actually correct yet, since fs usage is still
- * uncompressed sectors:
- */
stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
+}
+
+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, struct bucket_mark *old)
+{
+ percpu_rwsem_assert_held(&c->usage_lock);
+
+ __bch2_invalidate_bucket(c, ca, b, old, false);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator,
- struct gc_pos pos, unsigned flags)
+static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, bool owned_by_allocator,
+ bool gc)
{
- struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
- struct bucket *g;
+ struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+ struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
- percpu_rwsem_assert_held(&c->usage_lock);
- g = bucket(ca, b);
-
- if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos))
- return;
-
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
- BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
- c->gc_pos.phase == GC_PHASE_DONE);
+ BUG_ON(!gc &&
+ !owned_by_allocator && !old.owned_by_allocator);
+}
+
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, bool owned_by_allocator,
+ struct gc_pos pos, unsigned flags)
+{
+ percpu_rwsem_assert_held(&c->usage_lock);
+
+ if (!(flags & BCH_BUCKET_MARK_GC))
+ __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
+
+ if ((flags & BCH_BUCKET_MARK_GC) ||
+ gc_visited(c, pos))
+ __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
}
#define checked_add(a, b) \
@@ -491,37 +484,49 @@ do { \
BUG_ON((a) != _res); \
} while (0)
+static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type type,
+ unsigned sectors, bool gc)
+{
+ struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
+ struct bucket *g = __bucket(ca, b, gc);
+ struct bucket_mark old, new;
+
+ BUG_ON(type != BCH_DATA_SB &&
+ type != BCH_DATA_JOURNAL);
+
+ old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ new.data_type = type;
+ checked_add(new.dirty_sectors, sectors);
+ }));
+
+ fs_usage->replicas[0].data[type] += sectors;
+}
+
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, struct gc_pos pos,
unsigned flags)
{
- struct bch_fs_usage *stats;
- struct bucket *g;
- struct bucket_mark old, new;
-
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
+ preempt_disable();
+
if (likely(c)) {
percpu_rwsem_assert_held(&c->usage_lock);
- if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos))
- return;
-
- preempt_disable();
- stats = this_cpu_ptr(c->usage_percpu);
-
- g = bucket(ca, b);
- old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
- new.data_type = type;
- checked_add(new.dirty_sectors, sectors);
- }));
-
- stats->replicas[0].data[type] += sectors;
- preempt_enable();
+ if (!(flags & BCH_BUCKET_MARK_GC))
+ __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+ false);
+ if ((flags & BCH_BUCKET_MARK_GC) ||
+ gc_visited(c, pos))
+ __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+ true);
} else {
+ struct bucket *g;
+ struct bucket_mark old, new;
+
rcu_read_lock();
g = bucket(ca, b);
@@ -533,8 +538,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
rcu_read_unlock();
}
- BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
- bucket_became_unavailable(c, old, new));
+ preempt_enable();
}
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
@@ -579,23 +583,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+ u64 journal_seq, unsigned flags,
+ bool gc)
{
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_BUCKET(ca, &p.ptr);
+ size_t b = PTR_BUCKET_NR(ca, &p.ptr);
+ struct bucket *g = __bucket(ca, b, gc);
u64 v;
- if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
- if (journal_seq)
- bucket_cmpxchg(g, new, ({
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }));
-
- return;
- }
-
v = atomic64_read(&g->_mark.v);
do {
new.v.counter = old.v.counter = v;
@@ -637,10 +633,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, fs_usage, old, new);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
- BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
- bucket_became_unavailable(c, old, new));
+ BUG_ON(!gc && bucket_became_unavailable(old, new));
}
static void bch2_mark_stripe_ptr(struct bch_fs *c,
@@ -688,9 +683,9 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c,
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, enum bch_data_type data_type,
- struct gc_pos pos,
struct bch_fs_usage *stats,
- u64 journal_seq, unsigned flags)
+ u64 journal_seq, unsigned flags,
+ bool gc)
{
BUG_ON(!sectors);
@@ -712,7 +707,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 adjusted_disk_sectors = disk_sectors;
bch2_mark_pointer(c, e, p, disk_sectors, data_type,
- stats, journal_seq, flags);
+ stats, journal_seq, flags, gc);
if (!p.ptr.cached)
for (i = 0; i < p.ec_nr; i++)
@@ -758,21 +753,20 @@ static void bucket_set_stripe(struct bch_fs *c,
const struct bch_stripe *v,
bool enabled,
struct bch_fs_usage *fs_usage,
- u64 journal_seq)
+ u64 journal_seq,
+ bool gc)
{
unsigned i;
for (i = 0; i < v->nr_blocks; i++) {
const struct bch_extent_ptr *ptr = v->ptrs + i;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g;
+ size_t b = PTR_BUCKET_NR(ca, ptr);
+ struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new, old;
BUG_ON(ptr_stale(ca, ptr));
- rcu_read_lock();
- g = PTR_BUCKET(ca, ptr);
-
old = bucket_cmpxchg(g, new, ({
new.stripe = enabled;
if (journal_seq) {
@@ -780,18 +774,18 @@ static void bucket_set_stripe(struct bch_fs *c,
new.journal_seq = journal_seq;
}
}));
- rcu_read_unlock();
BUG_ON(old.stripe == enabled);
- bch2_dev_usage_update(c, ca, fs_usage, old, new);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
}
}
static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
- bool inserting, struct gc_pos pos,
+ bool inserting,
struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+ u64 journal_seq, unsigned flags,
+ bool gc)
{
switch (k.k->type) {
case BCH_STRIPE: {
@@ -820,74 +814,64 @@ static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
else
bch2_stripes_heap_del(c, m, idx);
- bucket_set_stripe(c, s.v, inserting, fs_usage, 0);
+ bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
break;
}
}
}
-void bch2_mark_key(struct bch_fs *c,
- enum bkey_type type, struct bkey_s_c k,
- bool inserting, s64 sectors,
- struct gc_pos pos,
- struct bch_fs_usage *stats,
- u64 journal_seq, unsigned flags)
+static void __bch2_mark_key(struct bch_fs *c,
+ enum bkey_type type, struct bkey_s_c k,
+ bool inserting, s64 sectors,
+ struct bch_fs_usage *stats,
+ u64 journal_seq, unsigned flags,
+ bool gc)
{
- /*
- * synchronization w.r.t. GC:
- *
- * Normally, bucket sector counts/marks are updated on the fly, as
- * references are added/removed from the btree, the lists of buckets the
- * allocator owns, other metadata buckets, etc.
- *
- * When GC is in progress and going to mark this reference, we do _not_
- * mark this reference here, to avoid double counting - GC will count it
- * when it gets to it.
- *
- * To know whether we should mark a given reference (GC either isn't
- * running, or has already marked references at this position) we
- * construct a total order for everything GC walks. Then, we can simply
- * compare the position of the reference we're marking - @pos - with
- * GC's current position. If GC is going to mark this reference, GC's
- * current position will be less than @pos; if GC's current position is
- * greater than @pos GC has either already walked this position, or
- * isn't running.
- *
- * To avoid racing with GC's position changing, we have to deal with
- * - GC's position being set to GC_POS_MIN when GC starts:
- * usage_lock guards against this
- * - GC's position overtaking @pos: we guard against this with
- * whatever lock protects the data structure the reference lives in
- * (e.g. the btree node lock, or the relevant allocator lock).
- */
-
- percpu_down_read(&c->usage_lock);
- if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos))
- flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
-
- if (!stats)
- stats = this_cpu_ptr(c->usage_percpu);
-
switch (type) {
case BKEY_TYPE_BTREE:
bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
- pos, stats, journal_seq, flags);
+ stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EXTENTS:
bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
- pos, stats, journal_seq, flags);
+ stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EC:
bch2_mark_stripe(c, k, inserting,
- pos, stats, journal_seq, flags);
+ stats, journal_seq, flags, gc);
break;
default:
break;
}
+}
+
+void bch2_mark_key(struct bch_fs *c,
+ enum bkey_type type, struct bkey_s_c k,
+ bool inserting, s64 sectors,
+ struct gc_pos pos,
+ struct bch_fs_usage *stats,
+ u64 journal_seq, unsigned flags)
+{
+ percpu_down_read(&c->usage_lock);
+
+ if (!(flags & BCH_BUCKET_MARK_GC)) {
+ if (!stats)
+ stats = this_cpu_ptr(c->usage[0]);
+
+ __bch2_mark_key(c, type, k, inserting, sectors,
+ stats, journal_seq, flags, false);
+ }
+
+ if ((flags & BCH_BUCKET_MARK_GC) ||
+ gc_visited(c, pos)) {
+ __bch2_mark_key(c, type, k, inserting, sectors,
+ this_cpu_ptr(c->usage[1]),
+ journal_seq, flags, true);
+ }
+
percpu_up_read(&c->usage_lock);
}
@@ -963,28 +947,20 @@ void bch2_mark_update(struct btree_insert *trans,
/* Disk reservations: */
-static u64 __recalc_sectors_available(struct bch_fs *c)
+static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
int cpu;
for_each_possible_cpu(cpu)
- per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
+ per_cpu_ptr(c->usage[0], cpu)->available_cache = 0;
return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
}
-/* Used by gc when it's starting: */
-void bch2_recalc_sectors_available(struct bch_fs *c)
-{
- percpu_down_write(&c->usage_lock);
- atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
- percpu_up_write(&c->usage_lock);
-}
-
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read(&c->usage_lock);
- this_cpu_sub(c->usage_percpu->online_reserved,
+ this_cpu_sub(c->usage[0]->online_reserved,
res->sectors);
bch2_fs_stats_verify(c);
@@ -1005,7 +981,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
percpu_down_read(&c->usage_lock);
preempt_disable();
- stats = this_cpu_ptr(c->usage_percpu);
+ stats = this_cpu_ptr(c->usage[0]);
if (sectors <= stats->available_cache)
goto out;
@@ -1055,7 +1031,7 @@ recalculate:
}
percpu_down_write(&c->usage_lock);
- sectors_available = __recalc_sectors_available(c);
+ sectors_available = bch2_recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
@@ -1110,7 +1086,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve);
- bool resize = ca->buckets != NULL,
+ bool resize = ca->buckets[0] != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
unsigned i;
@@ -1170,7 +1146,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
BITS_TO_LONGS(n) * sizeof(unsigned long));
}
- rcu_assign_pointer(ca->buckets, buckets);
+ rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
swap(ca->oldest_gens, oldest_gens);
@@ -1239,16 +1215,16 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
- kvpfree(rcu_dereference_protected(ca->buckets, 1),
+ kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- free_percpu(ca->usage_percpu);
+ free_percpu(ca->usage[0]);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
+ if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
return -ENOMEM;
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index b48960fa5ce7..813e0c44e107 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -29,23 +29,34 @@
_old; \
})
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
+ bool gc)
{
- return rcu_dereference_check(ca->buckets,
+ return rcu_dereference_check(ca->buckets[gc],
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+{
+ return __bucket_array(ca, false);
+}
+
+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
{
- struct bucket_array *buckets = bucket_array(ca);
+ struct bucket_array *buckets = __bucket_array(ca, gc);
BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
return buckets->b + b;
}
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+{
+ return __bucket(ca, b, false);
+}
+
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
@@ -129,7 +140,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Device usage: */
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
@@ -168,7 +179,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
/* Filesystem usage: */
-struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
@@ -207,17 +218,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
-#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
-#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
-#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
+#define BCH_BUCKET_MARK_GC (1 << 1)
void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_recalc_sectors_available(struct bch_fs *);
-
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 9ec96dbab0e8..0187f465d23f 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -64,8 +64,6 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
- u64 online_reserved;
- u64 available_cache;
struct {
u64 data[BCH_DATA_NR];
@@ -74,6 +72,10 @@ struct bch_fs_usage {
} replicas[BCH_REPLICAS_MAX];
u64 buckets[BCH_DATA_NR];
+
+ /* fields starting here aren't touched by gc: */
+ u64 online_reserved;
+ u64 available_cache;
};
/*
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 939caa3b8183..4045c0e68462 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -782,9 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
- new_fs
- ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
- : 0);
+ 0);
if (c) {
spin_unlock(&c->journal.lock);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 931e50e8ad57..59f2aa7e047c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
percpu_free_rwsem(&c->usage_lock);
- free_percpu(c->usage_percpu);
+ free_percpu(c->usage[0]);
mempool_exit(&c->btree_iters_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@@ -606,7 +606,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
- !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
+ !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
percpu_init_rwsem(&c->usage_lock) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
@@ -1028,8 +1028,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
return ret;
mutex_lock(&c->sb_lock);
- bch2_mark_dev_superblock(ca->fs, ca,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+ bch2_mark_dev_superblock(ca->fs, ca, 0);
mutex_unlock(&c->sb_lock);
bch2_dev_sysfs_online(c, ca);
@@ -1314,7 +1313,7 @@ static void dev_usage_clear(struct bch_dev *ca)
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
- per_cpu_ptr(ca->usage_percpu, cpu);
+ per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
@@ -1375,8 +1374,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
- bch2_mark_dev_superblock(ca->fs, ca,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+ bch2_mark_dev_superblock(ca->fs, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
@@ -1435,8 +1433,7 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
- bch2_mark_dev_superblock(c, ca,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+ bch2_mark_dev_superblock(c, ca, 0);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 188e19572d91..8eacc0d2550b 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -478,7 +478,7 @@ STORE(__bch2_fs)
bch2_coalesce(c);
if (attr == &sysfs_trigger_gc)
- bch2_gc(c);
+ bch2_gc(c, NULL, false);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;