author     Kent Overstreet <kent.overstreet@gmail.com>   2018-07-23 05:32:01 -0400
committer  Kent Overstreet <kent.overstreet@linux.dev>   2023-10-22 17:08:12 -0400
commit     9ca53b55f7415783c6cc8b751c99f2af6cc0a932 (patch)
tree       cef41ef35075c7bfaa765faf6ab9c5d58f0a56b4 /fs/bcachefs
parent     e647369168e02a06ff5ee229cc14ad72b2f5ddfd (diff)
bcachefs: gc now operates on second set of bucket marks
This means we can now use gc to verify the allocation information -
important for testing persistent alloc info
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
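
At a glance: GC now allocates a second, shadow copy of the bucket marks and usage counters (`buckets[1]`, `usage[1]` in the diff below), recomputes everything into that copy while the live copy (`buckets[0]`, `usage[0]`) keeps serving the running filesystem, then compares the two and repairs the live copy in bch2_gc_done(). Below is a minimal sketch of that double-buffer-and-reconcile shape; the struct and function names are simplified stand-ins, not the actual bcachefs types:

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for a bucket mark: two sector counts per bucket. */
struct mark {
	unsigned dirty_sectors;
	unsigned cached_sectors;
};

#define NBUCKETS 4

/* marks[0] is the live set; marks[1] is the set GC recomputes from scratch. */
static struct mark marks[2][NBUCKETS];

/* Every marking path takes a 'gc' flag selecting which set it updates. */
static void mark_bucket(size_t b, unsigned dirty, unsigned cached, bool gc)
{
	marks[gc][b].dirty_sectors  += dirty;
	marks[gc][b].cached_sectors += cached;
}

/*
 * End of mark and sweep: compare the recomputed set against the live one,
 * report any mismatch, fix the live copy, then drop the shadow copy
 * (roughly what bch2_gc_done()/bch2_gc_free() do in the diff).
 */
static void gc_done(void)
{
	for (size_t b = 0; b < NBUCKETS; b++) {
		struct mark *dst = &marks[0][b], *src = &marks[1][b];

		if (dst->dirty_sectors != src->dirty_sectors ||
		    dst->cached_sectors != src->cached_sectors) {
			printf("bucket %zu: got %u/%u, should be %u/%u, fixing\n",
			       b, dst->dirty_sectors, dst->cached_sectors,
			       src->dirty_sectors, src->cached_sectors);
			*dst = *src;
		}
	}
	memset(marks[1], 0, sizeof(marks[1]));
}

int main(void)
{
	mark_bucket(0, 8, 0, false);   /* normal runtime accounting */
	mark_bucket(0, 8, 0, true);    /* GC walking the same reference */
	mark_bucket(1, 0, 4, false);   /* live update the GC pass didn't see */
	gc_done();                     /* reports and repairs bucket 1 */
	return 0;
}
```

In the real code both sets are percpu, updates go through cmpxchg loops under usage_lock, and allocating the shadow arrays can fail (bch2_gc_start() returns -ENOMEM); the sketch only shows the two-copies-plus-reconcile shape.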
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--  fs/bcachefs/alloc_background.c        13
-rw-r--r--  fs/bcachefs/bcachefs.h                14
-rw-r--r--  fs/bcachefs/btree_gc.c               399
-rw-r--r--  fs/bcachefs/btree_gc.h                 6
-rw-r--r--  fs/bcachefs/btree_update_interior.c   50
-rw-r--r--  fs/bcachefs/buckets.c                340
-rw-r--r--  fs/bcachefs/buckets.h                 29
-rw-r--r--  fs/bcachefs/buckets_types.h            6
-rw-r--r--  fs/bcachefs/journal.c                  4
-rw-r--r--  fs/bcachefs/super.c                   15
-rw-r--r--  fs/bcachefs/sysfs.c                    2
11 files changed, 495 insertions(+), 383 deletions(-)
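
One subtlety worth calling out before the diff: with the old single set of marks, updates racing with GC used gc_will_visit() (`gc_pos_cmp(c->gc_pos, pos) < 0`) to skip work GC would redo later. With two sets, the check is inverted into gc_visited() (`gc_pos_cmp(pos, c->gc_pos) <= 0`): the live copy is always updated, and the GC copy is additionally updated whenever GC has already walked past that position, so neither copy misses the reference. A rough sketch of that ordering check, with simplified types that are not the real struct gc_pos:

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified GC position: GC walks phases in a fixed order and, within a
 * phase, positions in increasing order. Not the real struct gc_pos.
 */
struct gc_pos {
	unsigned phase;
	unsigned long offset;
};

static struct gc_pos gc_cur;            /* how far GC has marked so far */
static long sets[2];                    /* [0] = live usage, [1] = GC's copy */

static int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
	if (l.phase != r.phase)
		return l.phase < r.phase ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

/* True if GC already walked past @pos, i.e. it will not see this update. */
static bool gc_visited(struct gc_pos pos)
{
	return gc_pos_cmp(pos, gc_cur) <= 0;
}

/*
 * Marking while GC runs concurrently: always update the live set; also
 * update the GC set iff GC has passed this position (otherwise GC counts
 * the reference itself when it gets there).
 */
static void mark_key(struct gc_pos pos, long sectors)
{
	sets[0] += sectors;
	if (gc_visited(pos))
		sets[1] += sectors;
}

int main(void)
{
	gc_cur = (struct gc_pos) { .phase = 1, .offset = 100 };

	mark_key((struct gc_pos) { 1, 50 },  8);   /* behind GC: both sets   */
	mark_key((struct gc_pos) { 1, 200 }, 16);  /* ahead of GC: live only */

	printf("live %ld, gc copy %ld\n", sets[0], sets[1]);  /* 24, 8 */
	return 0;
}
```

In bcachefs itself the check sits under usage_lock and a seqcount (gc_pos_lock), and the BCH_BUCKET_MARK_GC flag routes GC's own walks to the GC copy only.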
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c17fba1eae96..3f0e2dd29fde 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -930,12 +930,6 @@ static int bch2_allocator_thread(void *arg) pr_debug("free_inc now empty"); do { - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - bch_err(ca, "gc failure"); - goto stop; - } - /* * Find some buckets that we can invalidate, either * they're completely unused, or only contain clean data @@ -1293,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) bool invalidating_data = false; int ret = 0; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return -1; - if (test_alloc_startup(c)) { invalidating_data = true; goto not_enough; @@ -1321,9 +1312,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) continue; bch2_mark_alloc_bucket(c, ca, bu, true, - gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + gc_pos_alloc(c, NULL), 0); fifo_push(&ca->free_inc, bu); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index cdea3a1d9176..eaa2055000b6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -347,7 +347,6 @@ enum gc_phase { GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, - GC_PHASE_DONE }; struct gc_pos { @@ -392,15 +391,14 @@ struct bch_dev { * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ - struct bucket_array __rcu *buckets; + struct bucket_array __rcu *buckets[2]; unsigned long *buckets_dirty; unsigned long *buckets_written; /* most out of date gen in the btree */ u8 *oldest_gens; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage_percpu; - struct bch_dev_usage usage_cached; + struct bch_dev_usage __percpu *usage[2]; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -478,7 +476,6 @@ enum { /* errors: */ BCH_FS_ERROR, - BCH_FS_GC_FAILURE, /* misc: */ BCH_FS_BDEV_MOUNTED, @@ -614,8 +611,8 @@ struct bch_fs { atomic64_t sectors_available; - struct bch_fs_usage __percpu *usage_percpu; - struct bch_fs_usage usage_cached; + struct bch_fs_usage __percpu *usage[2]; + struct percpu_rw_semaphore usage_lock; struct closure_waitlist freelist_wait; @@ -656,9 +653,6 @@ struct bch_fs { * * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) * - * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not - * currently running, and gc marks are currently valid - * * Protected by gc_pos_lock. Only written to by GC thread, so GC thread * can read without a lock. */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e900fd4ffd06..6eba65fcb52c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -260,8 +260,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, { struct gc_pos pos = { 0 }; unsigned flags = - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD| + BCH_BUCKET_MARK_GC| (initial ? 
BCH_BUCKET_MARK_NOATOMIC : 0); int ret = 0; @@ -484,9 +483,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, BCH_DATA_SB, flags); } - if (c) - spin_lock(&c->journal.lock); - for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, @@ -495,7 +491,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, } if (c) { - spin_unlock(&c->journal.lock); percpu_up_read(&c->usage_lock); } else { preempt_enable(); @@ -511,9 +506,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); mutex_unlock(&c->sb_lock); } @@ -521,7 +514,6 @@ static void bch2_mark_superblocks(struct bch_fs *c) static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { struct gc_pos pos = { 0 }; - struct bch_fs_usage stats = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -533,13 +525,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key), true, 0, - pos, &stats, 0, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ + pos, NULL, 0, + BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -560,8 +547,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); @@ -569,8 +555,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&c->freelist_lock); @@ -584,8 +569,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) ca = bch_dev_bkey_exists(c, ob->ptr.dev); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&ob->lock); } @@ -593,122 +577,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) percpu_up_read(&c->usage_lock); } -static void bch2_gc_start(struct bch_fs *c) +static void bch2_gc_free(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + ca->buckets[1] = NULL; + + free_percpu(ca->usage[1]); + ca->usage[1] = NULL; + } + + free_percpu(c->usage[1]); + c->usage[1] = NULL; +} + +static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket_mark new; unsigned i; - size_t b; int cpu; - percpu_down_write(&c->usage_lock); + for_each_member_device(ca, c, i) { + struct bucket_array *src = __bucket_array(ca, 1); - /* - * Indicates to buckets code that gc is now in progress - done under - * usage_lock to avoid racing with bch2_mark_key(): - */ - __gc_pos_set(c, gc_phase(GC_PHASE_START)); + memcpy(__bucket_array(ca, 0), src, + sizeof(struct bucket_array) + + 
sizeof(struct bucket) * src->nbuckets); + }; - /* Save a copy of the existing bucket stats while we recompute them: */ for_each_member_device(ca, c, i) { - ca->usage_cached = __bch2_dev_usage_read(ca); + struct bch_dev_usage *p; + for_each_possible_cpu(cpu) { - struct bch_dev_usage *p = - per_cpu_ptr(ca->usage_percpu, cpu); + p = per_cpu_ptr(ca->usage[0], cpu); memset(p, 0, sizeof(*p)); } + + preempt_disable(); + *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1); + preempt_enable(); } - c->usage_cached = __bch2_fs_usage_read(c); - for_each_possible_cpu(cpu) { - struct bch_fs_usage *p = - per_cpu_ptr(c->usage_percpu, cpu); + { + struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); + struct bch_fs_usage *p; - memset(p->replicas, 0, sizeof(p->replicas)); - memset(p->buckets, 0, sizeof(p->buckets)); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(c->usage[0], cpu); + memset(p, 0, offsetof(typeof(*p), online_reserved)); + } + + preempt_disable(); + memcpy(this_cpu_ptr(c->usage[0]), + &src, + offsetof(typeof(*p), online_reserved)); + preempt_enable(); } +} + +static void bch2_gc_done(struct bch_fs *c, bool initial) +{ + struct bch_dev *ca; + unsigned i; + int cpu; + +#define copy_field(_f, _msg, ...) \ + if (dst._f != src._f) { \ + pr_info(_msg ": got %llu, should be %llu, fixing" \ + , ##__VA_ARGS__, dst._f, src._f); \ + dst._f = src._f; \ + } +#define copy_bucket_field(_f) \ + if (dst->b[b].mark._f != src->b[b].mark._f) { \ + pr_info("dev %u bucket %zu has wrong " #_f \ + ": got %u, should be %u, fixing", \ + i, b, dst->b[b].mark._f, src->b[b].mark._f); \ + dst->b[b]._mark._f = src->b[b].mark._f; \ + } +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) \ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + + percpu_down_write(&c->usage_lock); + + if (initial) { + bch2_gc_done_nocheck(c); + goto out; + } + + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 0); + struct bucket_array *src = __bucket_array(ca, 1); + size_t b; + + if (initial) { + memcpy(dst, src, + sizeof(struct bucket_array) + + sizeof(struct bucket) * dst->nbuckets); + } + + for (b = 0; b < src->nbuckets; b++) { + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(owned_by_allocator); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + } + }; + + for_each_member_device(ca, c, i) { + struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0); + struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1); + struct bch_dev_usage *p; + unsigned b; + + for (b = 0; b < BCH_DATA_NR; b++) + copy_dev_field(buckets[b], + "buckets[%s]", bch2_data_types[b]); + copy_dev_field(buckets_alloc, "buckets_alloc"); + copy_dev_field(buckets_ec, "buckets_ec"); + + for (b = 0; b < BCH_DATA_NR; b++) + copy_dev_field(sectors[b], + "sectors[%s]", bch2_data_types[b]); + copy_dev_field(sectors_fragmented, + "sectors_fragmented"); + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ca->usage[0], cpu); + memset(p, 0, sizeof(*p)); + } + + preempt_disable(); + p = this_cpu_ptr(ca->usage[0]); + *p = dst; + preempt_enable(); + } + + { + struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0); + struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); + struct bch_fs_usage *p; + unsigned r, b; + + for (r = 0; r < BCH_REPLICAS_MAX; r++) { + for (b = 0; b < BCH_DATA_NR; b++) + copy_fs_field(replicas[r].data[b], + "replicas[%i].data[%s]", + r, bch2_data_types[b]); + 
copy_fs_field(replicas[r].ec_data, + "replicas[%i].ec_data", r); + copy_fs_field(replicas[r].persistent_reserved, + "replicas[%i].persistent_reserved", r); + } + + for (b = 0; b < BCH_DATA_NR; b++) + copy_fs_field(buckets[b], + "buckets[%s]", bch2_data_types[b]); + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(c->usage[0], cpu); + memset(p, 0, offsetof(typeof(*p), online_reserved)); + } + + preempt_disable(); + p = this_cpu_ptr(c->usage[0]); + memcpy(p, &dst, offsetof(typeof(*p), online_reserved)); + preempt_enable(); + } +out: percpu_up_write(&c->usage_lock); - /* Clear bucket marks: */ +#undef copy_field +#undef copy_fs_field +#undef copy_dev_field +#undef copy_bucket_field +} + +static int bch2_gc_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + BUG_ON(c->usage[1]); + + c->usage[1] = alloc_percpu(struct bch_fs_usage); + if (!c->usage[1]) + return -ENOMEM; + for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - bucket_cmpxchg(buckets->b + b, new, ({ - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - new.stripe = 0; - })); - ca->oldest_gens[b] = new.gen; + BUG_ON(ca->buckets[1]); + BUG_ON(ca->usage[1]); + + ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; + } + + ca->usage[1] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; } - up_read(&ca->bucket_lock); } + + percpu_down_write(&c->usage_lock); + + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 1); + struct bucket_array *src = __bucket_array(ca, 0); + size_t b; + + dst->first_bucket = src->first_bucket; + dst->nbuckets = src->nbuckets; + + for (b = 0; b < src->nbuckets; b++) + dst->b[b]._mark.gen = src->b[b].mark.gen; + }; + + percpu_up_write(&c->usage_lock); + + return 0; } /** - * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes + * bch2_gc - walk _all_ references to buckets, and recompute them: + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. 
+ * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them */ -void bch2_gc(struct bch_fs *c) +int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); - unsigned i; + unsigned i, iter = 0; int ret; - /* - * Walk _all_ references to buckets, and recompute them: - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. - * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ trace_gc_start(c); - /* - * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on - * gc_lock if sectors_available goes to 0: - */ - bch2_recalc_sectors_available(c); - down_write(&c->gc_lock); - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) +again: + ret = bch2_gc_start(c); + if (ret) goto out; - bch2_gc_start(c); - bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, NULL, false); - if (ret) { - bch_err(c, "btree gc failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); + ret = bch2_gc_btrees(c, journal, initial); + if (ret) goto out; - } bch2_mark_pending_btree_node_frees(c); bch2_mark_allocator_buckets(c); - /* Indicates that gc is no longer in progress: */ - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); c->gc_count++; out: + if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) { + /* + * XXX: make sure gens we fixed got saved + */ + if (iter++ <= 2) { + bch_info(c, "Fixed gens, restarting mark and sweep:"); + clear_bit(BCH_FS_FIXED_GENS, &c->flags); + goto again; + } + + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + } + + if (!ret) + bch2_gc_done(c, initial); + + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_gc_free(c); up_write(&c->gc_lock); + + if (!ret && initial) + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + trace_gc_end(c); bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); @@ -724,6 +896,7 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + return ret; } /* Btree coalescing */ @@ -1039,9 +1212,6 @@ void bch2_coalesce(struct bch_fs *c) { enum btree_id id; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return; - down_read(&c->gc_lock); trace_gc_coalesce_start(c); @@ -1053,7 +1223,6 @@ void bch2_coalesce(struct bch_fs *c) if (ret) { if (ret != -ESHUTDOWN) bch_err(c, "btree coalescing failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); return; } } @@ -1068,6 +1237,7 @@ static int bch2_gc_thread(void *arg) struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last = atomic_long_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); + int ret; set_freezable(); @@ -1101,7 +1271,9 @@ static int bch2_gc_thread(void *arg) last = atomic_long_read(&clock->now); last_kick = atomic_read(&c->kick_gc); - bch2_gc(c); + ret = bch2_gc(c, NULL, false); + if (ret) + bch_err(c, "btree gc 
failed: %i", ret); debug_check_no_locks_held(); } @@ -1142,30 +1314,7 @@ int bch2_gc_thread_start(struct bch_fs *c) int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) { - unsigned iter = 0; - int ret = 0; - - down_write(&c->gc_lock); -again: - bch2_gc_start(c); - - bch2_mark_superblocks(c); - - ret = bch2_gc_btrees(c, journal, true); - if (ret) - goto err; - - if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto err; - } - - bch_info(c, "Fixed gens, restarting initial mark and sweep:"); - clear_bit(BCH_FS_FIXED_GENS, &c->flags); - goto again; - } + int ret = bch2_gc(c, journal, true); /* * Skip past versions that might have possibly been used (as nonces), @@ -1174,9 +1323,5 @@ again: if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -err: - up_write(&c->gc_lock); return ret; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 47a590015325..bb77564b9463 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -7,7 +7,7 @@ enum bkey_type; void bch2_coalesce(struct bch_fs *); -void bch2_gc(struct bch_fs *); +int bch2_gc(struct bch_fs *, struct list_head *, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); int bch2_initial_gc(struct bch_fs *, struct list_head *); @@ -105,14 +105,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o }; } -static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) +static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; bool ret; do { seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(c->gc_pos, pos) < 0; + ret = gc_pos_cmp(pos, c->gc_pos) <= 0; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); return ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index af31819c88c7..2631b0732d4b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -160,7 +160,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, { struct bch_fs *c = as->c; struct pending_btree_node_free *d; - unsigned replicas; /* * btree_update lock is only needed here to avoid racing with @@ -179,15 +178,6 @@ found: d->index_update_done = true; /* - * Btree nodes are accounted as freed in bch_alloc_stats when they're - * freed from the index: - */ - replicas = bch2_extent_nr_dirty_ptrs(k); - if (replicas) - stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= - c->opts.btree_node_size * replicas; - - /* * We're dropping @k from the btree, but it's still live until the * index update is persistent so we need to keep a reference around for * mark and sweep to find - that's primarily what the @@ -208,15 +198,16 @@ found: * bch2_mark_key() compares the current gc pos to the pos we're * moving this reference from, hence one comparison here: */ - if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { - struct bch_fs_usage tmp = { 0 }; + if (gc_pos_cmp(c->gc_pos, b + ? gc_pos_btree_node(b) + : gc_pos_btree_root(as->btree_id)) >= 0 && + gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { + struct gc_pos pos = { 0 }; bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key), - false, 0, b - ? 
gc_pos_btree_node(b) - : gc_pos_btree_root(as->btree_id), - &tmp, 0, 0); + false, 0, pos, + NULL, 0, BCH_BUCKET_MARK_GC); /* * Don't apply tmp - pending deletes aren't tracked in * bch_alloc_stats: @@ -287,19 +278,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, static void bch2_btree_node_free_ondisk(struct bch_fs *c, struct pending_btree_node_free *pending) { - struct bch_fs_usage stats = { 0 }; - BUG_ON(!pending->index_update_done); bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&pending->key), false, 0, gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0, 0); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ + NULL, 0, 0); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -1939,6 +1924,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, btree_interior_update_add_node_reference(as, b); + /* + * XXX: the rest of the update path treats this like we're actually + * inserting a new node and deleting the existing node, so the + * reservation needs to include enough space for @b + * + * that is actually sketch as fuck though and I am surprised the code + * seems to work like that, definitely need to go back and rework it + * into something saner. + * + * (I think @b is just getting double counted until the btree update + * finishes and "deletes" @b on disk) + */ + ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, + c->opts.btree_node_size * + bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)), + BCH_DISK_RESERVATION_NOFAIL| + BCH_DISK_RESERVATION_GC_LOCK_HELD); + BUG_ON(ret); + parent = btree_node_parent(iter, b); if (parent) { if (new_hash) { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 201798866242..2ebe8bad978e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -85,8 +85,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); static void bch2_fs_stats_verify(struct bch_fs *c) { - struct bch_fs_usage stats = - __bch2_fs_usage_read(c); + struct bch_fs_usage stats =_bch2_fs_usage_read(c); unsigned i, j; for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { @@ -209,43 +208,24 @@ do { \ _acc; \ }) -#define bch2_usage_read_cached(_c, _cached, _uncached) \ -({ \ - typeof(_cached) _ret; \ - unsigned _seq; \ - \ - do { \ - _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \ - _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \ - ? 
bch2_usage_read_raw(_uncached) \ - : (_cached); \ - } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \ - \ - _ret; \ -}) - -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc) { - return bch2_usage_read_raw(ca->usage_percpu); + return bch2_usage_read_raw(ca->usage[gc]); } struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu); + return bch2_usage_read_raw(ca->usage[0]); } -struct bch_fs_usage -__bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc) { - return bch2_usage_read_raw(c->usage_percpu); + return bch2_usage_read_raw(c->usage[gc]); } -struct bch_fs_usage -bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c) { - return bch2_usage_read_cached(c, - c->usage_cached, - c->usage_percpu); + return bch2_usage_read_raw(c->usage[0]); } struct fs_usage_sum { @@ -327,13 +307,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m) : m.data_type; } -static bool bucket_became_unavailable(struct bch_fs *c, - struct bucket_mark old, +static bool bucket_became_unavailable(struct bucket_mark old, struct bucket_mark new) { return is_available_bucket(old) && - !is_available_bucket(new) && - (!c || c->gc_pos.phase == GC_PHASE_DONE); + !is_available_bucket(new); } void bch2_fs_usage_apply(struct bch_fs *c, @@ -364,11 +342,13 @@ void bch2_fs_usage_apply(struct bch_fs *c, percpu_down_read(&c->usage_lock); preempt_disable(); /* online_reserved not subject to gc: */ - this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved); + this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved); stats->online_reserved = 0; - if (!gc_will_visit(c, gc_pos)) - bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats); + bch2_usage_add(this_cpu_ptr(c->usage[0]), stats); + + if (gc_visited(c, gc_pos)) + bch2_usage_add(this_cpu_ptr(c->usage[1]), stats); bch2_fs_stats_verify(c); preempt_enable(); @@ -378,8 +358,9 @@ void bch2_fs_usage_apply(struct bch_fs *c, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bch_fs_usage *stats, - struct bucket_mark old, struct bucket_mark new) + struct bch_fs_usage *fs_usage, + struct bucket_mark old, struct bucket_mark new, + bool gc) { struct bch_dev_usage *dev_usage; @@ -391,14 +372,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type], bch2_data_types[new.data_type]); - stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; - stats->buckets[bucket_type(new)] += ca->mi.bucket_size; - preempt_disable(); - dev_usage = this_cpu_ptr(ca->usage_percpu); + dev_usage = this_cpu_ptr(ca->usage[gc]); - dev_usage->buckets[bucket_type(old)]--; - dev_usage->buckets[bucket_type(new)]++; + if (bucket_type(old) != bucket_type(new)) { + if (bucket_type(old)) { + fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size; + dev_usage->buckets[bucket_type(old)]--; + } else { + fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size; + dev_usage->buckets[bucket_type(new)]++; + } + } dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; @@ -425,21 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, stats, _old, new); \ + bch2_dev_usage_update(c, ca, 
stats, _old, new, gc); \ _old; \ }) -void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old) +static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old, + bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); - struct bucket *g; + struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark new; - percpu_rwsem_assert_held(&c->usage_lock); - - g = bucket(ca, b); - *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ BUG_ON(!is_available_bucket(new)); @@ -450,38 +432,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - /* - * This isn't actually correct yet, since fs usage is still - * uncompressed sectors: - */ stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; +} + +void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old) +{ + percpu_rwsem_assert_held(&c->usage_lock); + + __bch2_invalidate_bucket(c, ca, b, old, false); if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); } -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - struct gc_pos pos, unsigned flags) +static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); - struct bucket *g; + struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - percpu_rwsem_assert_held(&c->usage_lock); - g = bucket(ca, b); - - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ new.owned_by_allocator = owned_by_allocator; })); - BUG_ON(!owned_by_allocator && !old.owned_by_allocator && - c->gc_pos.phase == GC_PHASE_DONE); + BUG_ON(!gc && + !owned_by_allocator && !old.owned_by_allocator); +} + +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + struct gc_pos pos, unsigned flags) +{ + percpu_rwsem_assert_held(&c->usage_lock); + + if (!(flags & BCH_BUCKET_MARK_GC)) + __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false); + + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) + __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true); } #define checked_add(a, b) \ @@ -491,37 +484,49 @@ do { \ BUG_ON((a) != _res); \ } while (0) +static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type type, + unsigned sectors, bool gc) +{ + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + + BUG_ON(type != BCH_DATA_SB && + type != BCH_DATA_JOURNAL); + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); + + fs_usage->replicas[0].data[type] += sectors; +} + void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors, struct gc_pos pos, unsigned flags) { - struct bch_fs_usage *stats; - struct bucket *g; - struct bucket_mark old, new; - BUG_ON(type != BCH_DATA_SB && type != BCH_DATA_JOURNAL); + preempt_disable(); + if (likely(c)) { percpu_rwsem_assert_held(&c->usage_lock); - 
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; - - preempt_disable(); - stats = this_cpu_ptr(c->usage_percpu); - - g = bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ - new.data_type = type; - checked_add(new.dirty_sectors, sectors); - })); - - stats->replicas[0].data[type] += sectors; - preempt_enable(); + if (!(flags & BCH_BUCKET_MARK_GC)) + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, + false); + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, + true); } else { + struct bucket *g; + struct bucket_mark old, new; + rcu_read_lock(); g = bucket(ca, b); @@ -533,8 +538,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, rcu_read_unlock(); } - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); + preempt_enable(); } static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) @@ -579,23 +583,15 @@ static void bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr); + size_t b = PTR_BUCKET_NR(ca, &p.ptr); + struct bucket *g = __bucket(ca, b, gc); u64 v; - if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { - if (journal_seq) - bucket_cmpxchg(g, new, ({ - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - })); - - return; - } - v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; @@ -637,10 +633,9 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); + BUG_ON(!gc && bucket_became_unavailable(old, new)); } static void bch2_mark_stripe_ptr(struct bch_fs *c, @@ -688,9 +683,9 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c, static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, - struct gc_pos pos, struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { BUG_ON(!sectors); @@ -712,7 +707,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, e, p, disk_sectors, data_type, - stats, journal_seq, flags); + stats, journal_seq, flags, gc); if (!p.ptr.cached) for (i = 0; i < p.ec_nr; i++) @@ -758,21 +753,20 @@ static void bucket_set_stripe(struct bch_fs *c, const struct bch_stripe *v, bool enabled, struct bch_fs_usage *fs_usage, - u64 journal_seq) + u64 journal_seq, + bool gc) { unsigned i; for (i = 0; i < v->nr_blocks; i++) { const struct bch_extent_ptr *ptr = v->ptrs + i; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g; + size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark new, old; BUG_ON(ptr_stale(ca, ptr)); - rcu_read_lock(); - g = PTR_BUCKET(ca, ptr); - old = bucket_cmpxchg(g, new, ({ new.stripe = enabled; if (journal_seq) { @@ -780,18 +774,18 @@ static void bucket_set_stripe(struct bch_fs *c, new.journal_seq = journal_seq; } })); - rcu_read_unlock(); 
BUG_ON(old.stripe == enabled); - bch2_dev_usage_update(c, ca, fs_usage, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); } } static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, - bool inserting, struct gc_pos pos, + bool inserting, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { switch (k.k->type) { case BCH_STRIPE: { @@ -820,74 +814,64 @@ static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, else bch2_stripes_heap_del(c, m, idx); - bucket_set_stripe(c, s.v, inserting, fs_usage, 0); + bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); break; } } } -void bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, - bool inserting, s64 sectors, - struct gc_pos pos, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +static void __bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags, + bool gc) { - /* - * synchronization w.r.t. GC: - * - * Normally, bucket sector counts/marks are updated on the fly, as - * references are added/removed from the btree, the lists of buckets the - * allocator owns, other metadata buckets, etc. - * - * When GC is in progress and going to mark this reference, we do _not_ - * mark this reference here, to avoid double counting - GC will count it - * when it gets to it. - * - * To know whether we should mark a given reference (GC either isn't - * running, or has already marked references at this position) we - * construct a total order for everything GC walks. Then, we can simply - * compare the position of the reference we're marking - @pos - with - * GC's current position. If GC is going to mark this reference, GC's - * current position will be less than @pos; if GC's current position is - * greater than @pos GC has either already walked this position, or - * isn't running. - * - * To avoid racing with GC's position changing, we have to deal with - * - GC's position being set to GC_POS_MIN when GC starts: - * usage_lock guards against this - * - GC's position overtaking @pos: we guard against this with - * whatever lock protects the data structure the reference lives in - * (e.g. the btree node lock, or the relevant allocator lock). - */ - - percpu_down_read(&c->usage_lock); - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - flags |= BCH_BUCKET_MARK_GC_WILL_VISIT; - - if (!stats) - stats = this_cpu_ptr(c->usage_percpu); - switch (type) { case BKEY_TYPE_BTREE: bch2_mark_extent(c, k, inserting ? 
c->opts.btree_node_size : -c->opts.btree_node_size, BCH_DATA_BTREE, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); break; case BKEY_TYPE_EXTENTS: bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); break; case BKEY_TYPE_EC: bch2_mark_stripe(c, k, inserting, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); break; default: break; } +} + +void bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ + percpu_down_read(&c->usage_lock); + + if (!(flags & BCH_BUCKET_MARK_GC)) { + if (!stats) + stats = this_cpu_ptr(c->usage[0]); + + __bch2_mark_key(c, type, k, inserting, sectors, + stats, journal_seq, flags, false); + } + + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) { + __bch2_mark_key(c, type, k, inserting, sectors, + this_cpu_ptr(c->usage[1]), + journal_seq, flags, true); + } + percpu_up_read(&c->usage_lock); } @@ -963,28 +947,20 @@ void bch2_mark_update(struct btree_insert *trans, /* Disk reservations: */ -static u64 __recalc_sectors_available(struct bch_fs *c) +static u64 bch2_recalc_sectors_available(struct bch_fs *c) { int cpu; for_each_possible_cpu(cpu) - per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; + per_cpu_ptr(c->usage[0], cpu)->available_cache = 0; return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c))); } -/* Used by gc when it's starting: */ -void bch2_recalc_sectors_available(struct bch_fs *c) -{ - percpu_down_write(&c->usage_lock); - atomic64_set(&c->sectors_available, __recalc_sectors_available(c)); - percpu_up_write(&c->usage_lock); -} - void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read(&c->usage_lock); - this_cpu_sub(c->usage_percpu->online_reserved, + this_cpu_sub(c->usage[0]->online_reserved, res->sectors); bch2_fs_stats_verify(c); @@ -1005,7 +981,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, percpu_down_read(&c->usage_lock); preempt_disable(); - stats = this_cpu_ptr(c->usage_percpu); + stats = this_cpu_ptr(c->usage[0]); if (sectors <= stats->available_cache) goto out; @@ -1055,7 +1031,7 @@ recalculate: } percpu_down_write(&c->usage_lock); - sectors_available = __recalc_sectors_available(c); + sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { @@ -1110,7 +1086,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve); - bool resize = ca->buckets != NULL, + bool resize = ca->buckets[0] != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; unsigned i; @@ -1170,7 +1146,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets, buckets); + rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; swap(ca->oldest_gens, oldest_gens); @@ -1239,16 +1215,16 @@ void bch2_dev_buckets_free(struct bch_dev *ca) kvpfree(ca->buckets_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); - kvpfree(rcu_dereference_protected(ca->buckets, 1), + kvpfree(rcu_dereference_protected(ca->buckets[0], 1), 
sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage_percpu); + free_percpu(ca->usage[0]); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage))) + if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) return -ENOMEM; return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index b48960fa5ce7..813e0c44e107 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -29,23 +29,34 @@ _old; \ }) -static inline struct bucket_array *bucket_array(struct bch_dev *ca) +static inline struct bucket_array *__bucket_array(struct bch_dev *ca, + bool gc) { - return rcu_dereference_check(ca->buckets, + return rcu_dereference_check(ca->buckets[gc], !ca->fs || percpu_rwsem_is_held(&ca->fs->usage_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +static inline struct bucket_array *bucket_array(struct bch_dev *ca) +{ + return __bucket_array(ca, false); +} + +static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) { - struct bucket_array *buckets = bucket_array(ca); + struct bucket_array *buckets = __bucket_array(ca, gc); BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); return buckets->b + b; } +static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +{ + return __bucket(ca, b, false); +} + static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -129,7 +140,7 @@ static inline bool bucket_unused(struct bucket_mark mark) /* Device usage: */ -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool); struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, @@ -168,7 +179,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, struct gc_pos); @@ -207,17 +218,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, struct gc_pos, unsigned); #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) -#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) -#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) +#define BCH_BUCKET_MARK_GC (1 << 1) void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); -void bch2_recalc_sectors_available(struct bch_fs *); - void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 9ec96dbab0e8..0187f465d23f 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -64,8 +64,6 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - u64 online_reserved; - u64 available_cache; struct { u64 data[BCH_DATA_NR]; @@ -74,6 +72,10 @@ struct 
bch_fs_usage { } replicas[BCH_REPLICAS_MAX]; u64 buckets[BCH_DATA_NR]; + + /* fields starting here aren't touched by gc: */ + u64 online_reserved; + u64 available_cache; }; /* diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 939caa3b8183..4045c0e68462 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -782,9 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), - new_fs - ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE - : 0); + 0); if (c) { spin_unlock(&c->journal.lock); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 931e50e8ad57..59f2aa7e047c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); percpu_free_rwsem(&c->usage_lock); - free_percpu(c->usage_percpu); + free_percpu(c->usage[0]); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); @@ -606,7 +606,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || + !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || percpu_init_rwsem(&c->usage_lock) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || @@ -1028,8 +1028,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) return ret; mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(ca->fs, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_dev_superblock(ca->fs, ca, 0); mutex_unlock(&c->sb_lock); bch2_dev_sysfs_online(c, ca); @@ -1314,7 +1313,7 @@ static void dev_usage_clear(struct bch_dev *ca) for_each_possible_cpu(cpu) { struct bch_dev_usage *p = - per_cpu_ptr(ca->usage_percpu, cpu); + per_cpu_ptr(ca->usage[0], cpu); memset(p, 0, sizeof(*p)); } @@ -1375,8 +1374,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) * allocate the journal, reset all the marks, then remark after we * attach... */ - bch2_mark_dev_superblock(ca->fs, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_dev_superblock(ca->fs, ca, 0); err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); @@ -1435,8 +1433,7 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); - bch2_mark_dev_superblock(c, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_dev_superblock(c, ca, 0); bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 188e19572d91..8eacc0d2550b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -478,7 +478,7 @@ STORE(__bch2_fs) bch2_coalesce(c); if (attr == &sysfs_trigger_gc) - bch2_gc(c); + bch2_gc(c, NULL, false); if (attr == &sysfs_prune_cache) { struct shrink_control sc; |
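
Finally, the copy_field()/copy_bucket_field() macros in bch2_gc_done() above are the piece that actually "verifies the allocation information": each one compares a single counter in the live accounting against what GC recomputed, logs the discrepancy, and overwrites the live value. A compilable toy version of that macro pattern follows; the struct, field names, and check_dev() here are illustrative rather than the bcachefs ones, and `##__VA_ARGS__` is the same GNU extension the kernel relies on:

```c
#include <stdio.h>

/* Simplified usage counters; the real structs are percpu and much larger. */
struct dev_usage {
	unsigned long long buckets_alloc;
	unsigned long long sectors_dirty;
};

/*
 * The pattern from bch2_gc_done(): a local macro that compares one field of
 * the live accounting (dst) against what GC recomputed (src), logs the
 * discrepancy, and fixes the live copy in place.
 */
#define copy_field(_f, _msg, ...)					\
do {									\
	if (dst._f != src._f) {						\
		printf(_msg ": got %llu, should be %llu, fixing\n",	\
		       ##__VA_ARGS__, dst._f, src._f);			\
		dst._f = src._f;					\
	}								\
} while (0)

#define copy_dev_field(_f, _msg, ...)					\
	copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)

static struct dev_usage check_dev(unsigned dev, struct dev_usage dst,
				  struct dev_usage src)
{
	copy_dev_field(buckets_alloc, "buckets_alloc");
	copy_dev_field(sectors_dirty, "sectors_dirty");
	return dst;	/* now equal to src */
}

int main(void)
{
	struct dev_usage live = { .buckets_alloc = 10, .sectors_dirty = 4096 };
	struct dev_usage gc   = { .buckets_alloc = 10, .sectors_dirty = 4608 };

	live = check_dev(0, live, gc);	/* logs and repairs sectors_dirty */
	return 0;
}
```

On the initial pass the diff skips the comparison entirely and copies the GC numbers wholesale via bch2_gc_done_nocheck().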