diff options
author | Kent Overstreet <kent.overstreet@linux.dev> | 2024-01-23 00:01:07 -0500 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2024-03-13 21:22:24 -0400 |
commit | 91dcad18d38849f1702e0d50f5598bb3614c8ff8 (patch) | |
tree | 8684fd85fd37944babb2e386c6392afbc2ef34a2 | |
parent | 835cd3e147a9a8f8acfe7f3a405c582f04e90a33 (diff) |
bcachefs: Pin btree cache in ram for random access in fsck
Various phases of fsck involve checking references from one btree to
another: this means doing a sequential scan of one btree, and then
mostly random access into the second.
This is particularly painful for checking extents <-> backpointers; we
can prefetch btree node access on the sequential scan, but not on the
random access portion, and this is particularly painful on spinning
rust, where we'd like to keep the pipeline fairly full of btree node
reads so that the elevator can reduce seeking.
This patch implements prefetching and pinning of the portion of the
btree that we'll be doing random access to. We already calculate how
much of the random access btree will fit in memory so it's a fairly
straightforward change.
This will put more pressure on system memory usage, so we introduce a
new option, fsck_memory_usage_percent, which is the percentage of total
system ram that fsck is allowed to pin.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r-- | fs/bcachefs/backpointers.c | 137 | ||||
-rw-r--r-- | fs/bcachefs/bbpos_types.h | 2 | ||||
-rw-r--r-- | fs/bcachefs/btree_cache.c | 13 | ||||
-rw-r--r-- | fs/bcachefs/btree_types.h | 6 | ||||
-rw-r--r-- | fs/bcachefs/opts.h | 5 |
5 files changed, 72 insertions, 91 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index f9ccefc74c49..f2b33fe40bd8 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -554,60 +554,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) }; } -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; - u64 mem_bytes; - si_meminfo(&i); - mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); + + u64 mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, + u64 btree_leaf_mask, + u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { - struct btree_iter iter; - struct bkey_s_c k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - enum btree_id btree; + struct bch_fs *c = trans->c; + s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + btree_interior_mask |= btree_leaf_mask; + + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_start = start; + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; + + for (enum btree_id btree = start.btree; + btree < BTREE_ID_NR && !ret; + btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; + struct btree_iter iter; + struct btree *b; if (!((1U << btree) & btree_leaf_mask) && !((1U << btree) & btree_interior_mask)) continue; - bch2_trans_node_iter_init(trans, &iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, 0); - /* - * for_each_btree_key_contineu() doesn't check the return value - * from bch2_btree_iter_advance(), which is needed when - * iterating over interior nodes where we'll see keys at - * SPOS_MAX: - */ - do { - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); - ret = bkey_err(k); - if (!k.k || ret) - break; - - --btree_nodes; - if (!btree_nodes) { - *end = BBPOS(btree, k.k->p); + __for_each_btree_node(trans, iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + mem_may_pin -= btree_buf_bytes(b); + if (mem_may_pin <= 0) { + c->btree_cache.pinned_nodes_end = *end = + BBPOS(btree, b->key.k.p); bch2_trans_iter_exit(trans, &iter); return 0; } - } while (bch2_btree_iter_advance(&iter)); + } bch2_trans_iter_exit(trans, &iter); } - *end = BBPOS_MAX; return ret; } @@ -665,62 +666,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } -static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, - struct bpos bucket) -{ - return bch2_dev_exists2(c, bucket.inode) - ? bucket_pos_to_bp(c, bucket, 0) - : bucket; -} - -static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct btree_iter alloc_iter; - struct btree_iter bp_iter; - struct bkey_s_c alloc_k, bp_k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - bool alloc_end = false, bp_end = false; - int ret = 0; - - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - start, 0, 1, 0); - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); - while (1) { - alloc_k = !alloc_end - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) - : bkey_s_c_null; - bp_k = !bp_end - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) - : bkey_s_c_null; - - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); - if ((!alloc_k.k && !bp_k.k) || ret) { - *end = SPOS_MAX; - break; - } - - --btree_nodes; - if (!btree_nodes) { - *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; - break; - } - - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { - if (!bch2_btree_iter_advance(&alloc_iter)) - alloc_end = true; - } else { - if (!bch2_btree_iter_advance(&bp_iter)) - bp_end = true; - } - } - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); @@ -731,10 +676,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bkey_init(&s.last_flushed.k->k); while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); + struct bbpos end; + ret = bch2_get_btree_in_memory_pos(trans, + BIT_ULL(BTREE_ID_backpointers), + BIT_ULL(BTREE_ID_backpointers), + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); if (ret) break; + s.bucket_end = end.pos; + if ( bpos_eq(s.bucket_start, POS_MIN) && !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -762,6 +713,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } @@ -867,6 +821,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h index 5198e94cf3b8..f63893344f80 100644 --- a/fs/bcachefs/bbpos_types.h +++ b/fs/bcachefs/bbpos_types.h @@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) } #define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) #endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e8665258360e..562561a9a510 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" @@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); + + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = b->c.level + ? bc->pinned_nodes_interior_mask + : bc->pinned_nodes_leaf_mask; + + if ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d4f4e52ee6ed..9404d96c38f3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,6 +5,7 @@ #include <linux/list.h> #include <linux/rhashtable.h> +#include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" @@ -173,6 +174,11 @@ struct btree_cache { */ struct task_struct *alloc_lock; struct closure_waitlist alloc_wait; + + struct bbpos pinned_nodes_start; + struct bbpos pinned_nodes_end; + u64 pinned_nodes_leaf_mask; + u64 pinned_nodes_interior_mask; }; struct btree_node_iter { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9a83aca31b4b..136083c11f3a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -337,6 +337,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ + x(fsck_memory_usage_percent, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(20, 70), \ + BCH2_NO_SB_OPT, 50, \ + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ |