summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/Makefile3
-rw-r--r--fs/bcachefs/alloc_background.c47
-rw-r--r--fs/bcachefs/alloc_foreground.c4
-rw-r--r--fs/bcachefs/alloc_types.h3
-rw-r--r--fs/bcachefs/backpointers.c186
-rw-r--r--fs/bcachefs/backpointers.h32
-rw-r--r--fs/bcachefs/bcachefs.h8
-rw-r--r--fs/bcachefs/bcachefs_format.h1
-rw-r--r--fs/bcachefs/bset.c14
-rw-r--r--fs/bcachefs/bset.h2
-rw-r--r--fs/bcachefs/btree_cache.c5
-rw-r--r--fs/bcachefs/btree_gc.c512
-rw-r--r--fs/bcachefs/btree_io.c18
-rw-r--r--fs/bcachefs/btree_iter.c52
-rw-r--r--fs/bcachefs/btree_journal_iter.c48
-rw-r--r--fs/bcachefs/btree_journal_iter.h8
-rw-r--r--fs/bcachefs/btree_node_scan.c495
-rw-r--r--fs/bcachefs/btree_node_scan.h11
-rw-r--r--fs/bcachefs/btree_node_scan_types.h30
-rw-r--r--fs/bcachefs/btree_trans_commit.c5
-rw-r--r--fs/bcachefs/btree_update.c6
-rw-r--r--fs/bcachefs/btree_update_interior.c208
-rw-r--r--fs/bcachefs/btree_update_interior.h28
-rw-r--r--fs/bcachefs/btree_write_buffer.c14
-rw-r--r--fs/bcachefs/buckets.c12
-rw-r--r--fs/bcachefs/buckets.h1
-rw-r--r--fs/bcachefs/chardev.c2
-rw-r--r--fs/bcachefs/data_update.c12
-rw-r--r--fs/bcachefs/errcode.h3
-rw-r--r--fs/bcachefs/error.c6
-rw-r--r--fs/bcachefs/error.h6
-rw-r--r--fs/bcachefs/extents.c61
-rw-r--r--fs/bcachefs/extents.h25
-rw-r--r--fs/bcachefs/eytzinger.c234
-rw-r--r--fs/bcachefs/eytzinger.h63
-rw-r--r--fs/bcachefs/fs-io-direct.c4
-rw-r--r--fs/bcachefs/fs.c1
-rw-r--r--fs/bcachefs/fsck.c264
-rw-r--r--fs/bcachefs/inode.c2
-rw-r--r--fs/bcachefs/io_misc.c2
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c3
-rw-r--r--fs/bcachefs/logged_ops.c7
-rw-r--r--fs/bcachefs/mean_and_variance_test.c28
-rw-r--r--fs/bcachefs/opts.c4
-rw-r--r--fs/bcachefs/opts.h11
-rw-r--r--fs/bcachefs/recovery.c404
-rw-r--r--fs/bcachefs/recovery.h32
-rw-r--r--fs/bcachefs/recovery_passes.c249
-rw-r--r--fs/bcachefs/recovery_passes.h17
-rw-r--r--fs/bcachefs/recovery_passes_types.h (renamed from fs/bcachefs/recovery_types.h)11
-rw-r--r--fs/bcachefs/reflink.c3
-rw-r--r--fs/bcachefs/replicas.c19
-rw-r--r--fs/bcachefs/sb-downgrade.c2
-rw-r--r--fs/bcachefs/sb-errors_types.h7
-rw-r--r--fs/bcachefs/snapshot.c208
-rw-r--r--fs/bcachefs/snapshot.h89
-rw-r--r--fs/bcachefs/subvolume.c72
-rw-r--r--fs/bcachefs/subvolume.h3
-rw-r--r--fs/bcachefs/subvolume_types.h2
-rw-r--r--fs/bcachefs/super-io.c11
-rw-r--r--fs/bcachefs/super.c16
-rw-r--r--fs/bcachefs/util.c143
-rw-r--r--fs/bcachefs/util.h14
-rw-r--r--fs/namei.c7
-rw-r--r--fs/nfsd/nfs4state.c36
-rw-r--r--fs/nfsd/vfs.c3
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/smb/client/dir.c7
-rw-r--r--fs/smb/client/fscache.c16
-rw-r--r--fs/smb/client/inode.c2
-rw-r--r--fs/smb/client/trace.h4
-rw-r--r--fs/vboxsf/file.c1
-rw-r--r--fs/vboxsf/super.c9
-rw-r--r--fs/vboxsf/utils.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c40
-rw-r--r--fs/xfs/libxfs/xfs_sb.h5
-rw-r--r--fs/xfs/scrub/common.c4
-rw-r--r--fs/xfs/xfs_aops.c7
-rw-r--r--fs/xfs/xfs_icache.c8
-rw-r--r--fs/xfs/xfs_trans.h9
80 files changed, 2772 insertions, 1184 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index b02796c8a595..66ca0bbee639 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -17,6 +17,7 @@ bcachefs-y := \
btree_journal_iter.o \
btree_key_cache.o \
btree_locking.o \
+ btree_node_scan.o \
btree_trans_commit.o \
btree_update.o \
btree_update_interior.o \
@@ -37,6 +38,7 @@ bcachefs-y := \
error.o \
extents.o \
extent_update.o \
+ eytzinger.o \
fs.o \
fs-common.o \
fs-ioctl.o \
@@ -67,6 +69,7 @@ bcachefs-y := \
quota.o \
rebalance.o \
recovery.o \
+ recovery_passes.o \
reflink.o \
replicas.o \
sb-clean.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 893e38f9db80..4ff56fa4d539 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
- a->v.gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- goto write;
- }
-
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
- "%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (a->v.dirty_sectors) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "attempting to discard bucket with dirty data\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = -EIO;
- }
goto out;
}
if (a->v.data_type != BCH_DATA_need_discard) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "bucket incorrectly set in need_discard btree\n"
- "%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
+ if (data_type_is_empty(a->v.data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
}
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
+ goto out;
+ }
+
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
goto out;
}
@@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo
if (ret)
goto err;
+ BUG_ON(a->v.dirty_sectors);
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
a->v.data_type = alloc_data_type(a->v, a->v.data_type);
@@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached);
+ BUG_ON(a->v.dirty_sectors);
if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 214b15c84d1f..a1fc30adf912 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
switch (watermark) {
- case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
case BCH_WATERMARK_btree:
case BCH_WATERMARK_btree_copygc:
return OPEN_BUCKETS_COUNT / 4;
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index b91b7a461056..c2226e947c41 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -22,7 +22,8 @@ struct bucket_alloc_state {
x(copygc) \
x(btree) \
x(btree_copygc) \
- x(reclaim)
+ x(reclaim) \
+ x(interior_updates)
enum bch_watermark {
#define x(name) BCH_WATERMARK_##name,
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 8cb35ea572cb..114328acde72 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -8,6 +8,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "checksum.h"
#include "error.h"
#include <linux/mm.h>
@@ -29,8 +30,7 @@ static bool extent_matches_bp(struct bch_fs *c,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
- &bucket2, &bp2);
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2);
if (bpos_eq(bucket, bucket2) &&
!memcmp(&bp, &bp2, sizeof(bp)))
return true;
@@ -44,6 +44,11 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ /* these will be caught by fsck */
+ if (!bch2_dev_exists2(c, bp.k->p.inode))
+ return 0;
+
struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
int ret = 0;
@@ -378,7 +383,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, bp_iter, 0);
goto out;
}
@@ -414,6 +419,84 @@ struct extents_to_bp_state {
struct bkey_buf last_flushed;
};
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_s_c extent, unsigned dev)
+{
+ struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_s_c extent,
+ enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ void *data_buf = NULL;
+ struct bio *bio = NULL;
+ size_t bytes;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(extent.k))
+ return false;
+
+ bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+ if (p.ptr.dev == dev)
+ goto found;
+ BUG();
+found:
+ if (!p.crc.csum_type)
+ return false;
+
+ bytes = p.crc.compressed_size << 9;
+
+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
+ if (!bch2_dev_get_ioref(ca, READ))
+ return false;
+
+ data_buf = kvmalloc(bytes, GFP_KERNEL);
+ if (!data_buf) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = p.ptr.offset;
+ bch2_bio_map(bio, data_buf, bytes);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
+ bch2_bkey_val_to_text(&buf, c, extent);
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
+ bch2_bkey_val_to_text(&buf, c, extent2);
+
+ struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+ if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+ c, dup_backpointer_to_bad_csum_extent,
+ "%s", buf.buf))
+ ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+ if (bio)
+ bio_put(bio);
+ kvfree(data_buf);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
struct bpos bucket,
@@ -421,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter bp_iter = {};
+ struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
struct bkey_buf tmp;
@@ -429,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ prt_str(&buf, "extent for nonexistent device:bucket ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ bch_err(c, "%s", buf.buf);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end))
return 0;
- if (!bch2_dev_bucket_exists(c, bucket))
- goto missing;
-
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, bucket, bp.bucket_offset),
0);
@@ -461,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans,
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
- goto missing;
+
+ goto check_existing_bp;
}
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf);
return ret;
+check_existing_bp:
+ /* Do we have a backpointer for a different extent? */
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ goto missing;
+
+ struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
+
+ struct bkey_s_c other_extent =
+ bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
+ ret = bkey_err(other_extent);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ ret = 0;
+ if (ret)
+ goto err;
+
+ if (!other_extent.k)
+ goto missing;
+
+ if (bch2_extents_match(orig_k, other_extent)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+
+ if (other_extent.k->size <= orig_k.k->size) {
+ ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret)
+ goto err;
+ goto out;
+ } else {
+ ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
+ if (ret)
+ goto err;
+ goto missing;
+ }
+ }
+
+ ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto missing;
+ }
+
+ ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
missing:
+ printbuf_reset(&buf);
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nbp pos ");
- bch2_bpos_to_text(&buf, bp_iter.pos);
+ prt_printf(&buf, "\n got: ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+
+ struct bkey_i_backpointer n_bp_k;
+ bkey_backpointer_init(&n_bp_k.k_i);
+ n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ n_bp_k.v = bp;
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
@@ -502,8 +665,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree, level,
- k, p, &bucket_pos, &bp);
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp);
ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 327365a9feac..da012ca7daee 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -90,20 +90,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
}
-static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p)
+static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry)
{
- return level ? BCH_DATA_btree :
- p.has_ec ? BCH_DATA_stripe :
- BCH_DATA_user;
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ const struct bch_extent_ptr *ptr = &entry->ptr;
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
}
static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
struct bpos *bucket_pos, struct bch_backpointer *bp)
{
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 799aa32b6b4d..a31a5f706929 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -209,7 +209,7 @@
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
-#include "recovery_types.h"
+#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "time_stats.h"
@@ -456,6 +456,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
+#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
@@ -614,6 +615,7 @@ struct bch_dev {
*/
#define BCH_FS_FLAGS() \
+ x(new_fs) \
x(started) \
x(may_go_rw) \
x(rw) \
@@ -796,6 +798,7 @@ struct bch_fs {
u64 features;
u64 compat;
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ u64 btrees_lost_data;
} sb;
@@ -810,7 +813,6 @@ struct bch_fs {
/* snapshot.c: */
struct snapshot_table __rcu *snapshots;
- size_t snapshot_table_size;
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
@@ -1104,6 +1106,8 @@ struct bch_fs {
struct journal_keys journal_keys;
struct list_head journal_iters;
+ struct find_btree_nodes found_btree_nodes;
+
u64 last_bucket_seq_cleanup;
u64 counters_on_mount[BCH_COUNTER_NR];
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index bff8750ac0d7..63102992d955 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -818,6 +818,7 @@ struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
__le64 errors_silent[8];
+ __le64 btrees_lost_data;
};
struct bch_sb_field_downgrade_entry {
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3fd1085b6c61..3bb477840eab 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b,
printbuf_exit(&buf);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
struct bset_tree *t;
struct bkey_packed *k;
- struct btree_nr_keys nr = { 0 };
+ struct btree_nr_keys nr = {};
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
if (!bkey_deleted(k))
btree_keys_account_key_add(&nr, t - b->set, k);
+ return nr;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 79c77baaa383..120a79fd456b 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
/* Accounting: */
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
+
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 562561a9a510..84474324dba9 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -808,7 +808,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
prt_printf(&buf, "\nmax ");
bch2_bpos_to_text(&buf, b->data->max_key);
- bch2_fs_inconsistent(c, "%s", buf.buf);
+ bch2_fs_topology_error(c, "%s", buf.buf);
+
printbuf_exit(&buf);
}
@@ -1134,6 +1135,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
b = btree_cache_find(bc, k);
if (!b)
return;
+
+ BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
/* not allowed to wait on io with btree locks held: */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index bdaed29f084a..6280da1244b5 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -7,11 +7,13 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
@@ -24,7 +26,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
@@ -40,6 +42,7 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+#define DID_FILL_FROM_SCAN 12
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
@@ -70,90 +73,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
__gc_pos_set(c, new_pos);
}
-/*
- * Missing: if an interior btree node is empty, we need to do something -
- * perhaps just kill it
- */
-static int bch2_gc_check_topology(struct bch_fs *c,
- struct btree *b,
- struct bkey_buf *prev,
- struct bkey_buf cur,
- bool is_last)
-{
- struct bpos node_start = b->data->min_key;
- struct bpos node_end = b->data->max_key;
- struct bpos expected_start = bkey_deleted(&prev->k->k)
- ? node_start
- : bpos_successor(prev->k->k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- int ret = 0;
-
- if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
-
- if (!bpos_eq(expected_start, bp->v.min_key)) {
- bch2_topology_error(c);
-
- if (bkey_deleted(&prev->k->k)) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, node_start);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
- }
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
- }
-
- if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
- bch2_bpos_to_text(&buf2, node_end);
-
- if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
-
- bch2_bkey_buf_copy(prev, c, cur.k);
-err:
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
switch (b->key.k.type) {
@@ -212,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_min);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
return -BCH_ERR_ENOMEM_gc_repair_key;
@@ -237,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
if (ret)
return ret;
@@ -268,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
- struct btree *prev, struct btree *cur)
+static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur,
+ struct bpos *pulled_from_scan)
{
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (!prev) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, b->data->min_key);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ if (bpos_eq(expected_start, cur->data->min_key))
+ return 0;
+
+ prt_printf(&buf, " at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (prev) {
+ prt_printf(&buf, "\n prev: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
-
- if (prev &&
- bpos_gt(expected_start, cur->data->min_key) &&
- BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
- /* cur overwrites prev: */
-
- if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
- cur->data->min_key), c,
- btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_PREV_NODE;
- goto out;
- }
+ prt_str(&buf, "\n next: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
- if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
- bpos_predecessor(cur->data->min_key)), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- } else {
- /* prev overwrites cur: */
-
- if (mustfix_fsck_err_on(bpos_ge(expected_start,
- cur->data->max_key), c,
- btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_THIS_NODE;
- goto out;
- }
+ if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, cur->data->min_key)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ expected_start,
+ bpos_predecessor(cur->data->min_key));
+ if (ret)
+ goto err;
- if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_min(c, cur, expected_start);
+ *pulled_from_scan = cur->data->min_key;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ } else { /* overlap */
+ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
+ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node%s", buf.buf))
+ ret = DROP_PREV_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ }
+ } else {
+ if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node%s", buf.buf))
+ ret = DROP_THIS_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ }
}
-out:
+err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
- struct btree *child)
+ struct btree *child, struct bpos *pulled_from_scan)
{
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
- bch2_bpos_to_text(&buf2, b->key.k.p);
+ if (bpos_eq(child->key.k.p, b->key.k.p))
+ return 0;
- if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = set_node_max(c, child, b->key.k.p);
- if (ret)
- goto err;
+ prt_printf(&buf, "at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_str(&buf, "\n child: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
+
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf)) {
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, b->key.k.p)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ bpos_successor(child->key.k.p), b->key.k.p);
+ if (ret)
+ goto err;
+
+ *pulled_from_scan = b->key.k.p;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ ret = set_node_max(c, child, b->key.k.p);
+ }
}
err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
+ struct bpos *pulled_from_scan)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
- bool have_child, dropped_children = false;
+ bool have_child, new_pass = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
return 0;
-again:
- prev = NULL;
- have_child = dropped_children = false;
+
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
+again:
+ cur = prev = NULL;
+ have_child = new_pass = false;
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
@@ -415,9 +367,10 @@ again:
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
cur = NULL;
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?:
+ bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
if (ret)
break;
continue;
@@ -427,7 +380,23 @@ again:
if (ret)
break;
- ret = btree_repair_node_boundaries(c, b, prev, cur);
+ if (bch2_btree_node_is_stale(c, cur)) {
+ bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
@@ -445,6 +414,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
+ bch_info(c, "dropped prev node");
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
@@ -452,8 +422,6 @@ again:
break;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
goto again;
} else if (ret)
break;
@@ -465,7 +433,11 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(c, b, prev);
+ ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
}
if (!IS_ERR_OR_NULL(prev))
@@ -479,6 +451,10 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
+
+ if (new_pass)
+ goto again;
+
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
@@ -495,7 +471,7 @@ again:
if (ret)
goto err;
- ret = bch2_btree_repair_topology_recurse(trans, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
six_unlock_read(&cur->c.lock);
cur = NULL;
@@ -503,7 +479,7 @@ again:
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- dropped_children = true;
+ new_pass = true;
}
if (ret)
@@ -530,12 +506,14 @@ fsck_err:
six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- if (!ret && dropped_children)
+ if (!ret && new_pass)
goto again;
+ BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
+
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
return ret;
}
@@ -543,32 +521,63 @@ fsck_err:
int bch2_check_topology(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree *b;
- unsigned i;
+ struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
+ bool reconstructed_root = false;
- if (!r->alive)
- continue;
+ if (r->error) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+reconstruct_root:
+ bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
- b = r->b;
- if (btree_node_fake(b))
- continue;
+ r->alive = false;
+ r->error = 0;
+
+ if (!bch2_btree_has_scanned_nodes(c, i)) {
+ mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake(c, i, 0);
+ } else {
+ bch2_btree_root_alloc_fake(c, i, 1);
+ ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ break;
+ }
+
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ reconstructed_root = true;
+ }
+
+ struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b);
+ ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
- bch_err(c, "empty btree root - repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ r->b = NULL;
+
+ if (!reconstructed_root)
+ goto reconstruct_root;
+
+ bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake(c, i, 0);
+ r->alive = false;
+ ret = 0;
}
}
-
+fsck_err:
bch2_trans_put(trans);
-
return ret;
}
@@ -591,7 +600,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c);
if (fsck_err_on(!g->gen_valid,
c, ptr_to_missing_alloc_key,
@@ -657,7 +666,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
continue;
if (fsck_err_on(bucket_data_type(g->data_type) &&
- bucket_data_type(g->data_type) != data_type, c,
+ bucket_data_type(g->data_type) !=
+ bucket_data_type(data_type), c,
ptr_bucket_data_type_mismatch,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
@@ -698,18 +708,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
if (do_update) {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct bkey_i *new;
-
if (is_root) {
bch_err(c, "cannot update btree roots yet");
ret = -EINVAL;
goto err;
}
- new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
ret = -BCH_ERR_ENOMEM_gc_repair_key;
bch_err_msg(c, ret, "allocating new key");
@@ -724,7 +729,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
* btree node isn't there anymore, the read path will
* sort it out:
*/
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_GC_BUCKET(ca, ptr);
@@ -732,19 +737,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
ptr->gen = g->gen;
}
} else {
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
-
- (ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
- (!ptr->cached &&
- gen_cmp(ptr->gen, g->gen) < 0) ||
- gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type);
- }));
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+ if ((p.ptr.cached &&
+ (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+ (!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->gen) < 0) ||
+ gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type)) {
+ bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+ goto restart_drop_ptrs;
+ }
+ }
again:
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_extent_entry_for_each(ptrs, entry) {
@@ -774,12 +786,6 @@ found:
}
}
- ret = bch2_journal_key_insert_take(c, btree_id, level, new);
- if (ret) {
- kfree(new);
- goto err;
- }
-
if (level)
bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
@@ -793,6 +799,12 @@ found:
bch_info(c, "new key %s", buf.buf);
}
+ ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+ if (ret) {
+ kfree(new);
+ goto err;
+ }
+
*k = bkey_i_to_s_c(new);
}
err:
@@ -819,10 +831,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
BUG_ON(bch2_journal_seq_verify &&
k->k->version.lo > atomic64_read(&c->journal.seq));
- ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
- if (ret)
- goto err;
-
if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
@@ -831,8 +839,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
atomic64_set(&c->key_version, k->k->version.lo);
}
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
+ if (ret)
+ goto err;
+
ret = commit_do(trans, NULL, NULL, 0,
- bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
+ bch2_key_trigger(trans, btree_id, level, old,
+ unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
bch_err_fn(c, ret);
@@ -841,42 +854,30 @@ err:
static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
- struct bch_fs *c = trans->c;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
- struct bkey_buf prev, cur;
int ret = 0;
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
bch2_btree_node_iter_init_from_start(&iter, b);
- bch2_bkey_buf_init(&prev);
- bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
&k, initial);
if (ret)
- break;
+ return ret;
bch2_btree_node_iter_advance(&iter, b);
-
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
-
- ret = bch2_gc_check_topology(c, b, &prev, cur,
- bch2_btree_node_iter_end(&iter));
- if (ret)
- break;
- }
}
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret;
+ return 0;
}
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
@@ -925,14 +926,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_buf cur, prev;
+ struct bkey_buf cur;
struct printbuf buf = PRINTBUF;
int ret = 0;
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- bch2_bkey_buf_init(&prev);
bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -943,20 +946,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
if (ret)
goto fsck_err;
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
- k = bkey_i_to_s_c(cur.k);
-
- bch2_btree_and_journal_iter_advance(&iter);
-
- ret = bch2_gc_check_topology(c, b,
- &prev, cur,
- !bch2_btree_and_journal_iter_peek(&iter).k);
- if (ret)
- goto fsck_err;
- } else {
- bch2_btree_and_journal_iter_advance(&iter);
- }
+ bch2_btree_and_journal_iter_advance(&iter);
}
if (b->c.level > target_depth) {
@@ -1015,7 +1005,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
}
fsck_err:
bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
printbuf_exit(&buf);
return ret;
@@ -1033,9 +1022,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
b = bch2_btree_id_root(c, btree_id)->b;
- if (btree_node_fake(b))
- return 0;
-
six_lock_read(&b->c.lock, NULL, NULL);
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34df8ccc5fec..d7de82ac3893 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -654,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
*/
bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
+ b->nr = bch2_btree_node_count_keys(b);
struct bkey_s_c k;
struct bkey unpacked;
@@ -1263,10 +1264,12 @@ out:
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
- else
+ } else {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
+ }
goto out;
}
@@ -1327,6 +1330,7 @@ start:
if (!can_retry) {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
break;
}
}
@@ -1526,9 +1530,10 @@ fsck_err:
ret = -1;
}
- if (ret)
+ if (ret) {
set_btree_node_read_error(b);
- else if (*saw_error)
+ bch2_btree_lost_data(c, b->c.btree_id);
+ } else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
for (i = 0; i < ra->nr; i++) {
@@ -1657,13 +1662,14 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err(c, "%s", buf.buf);
+ bch_err_ratelimited(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf);
@@ -1860,7 +1866,7 @@ static void btree_node_write_work(struct work_struct *work)
} else {
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 51bcdc6c6d1c..2a211a4bebd1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (ret)
goto err;
} else {
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+ struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
+ if (!k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ goto err;
+ }
+
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
if ((flags & BTREE_ITER_PREFETCH) &&
c->opts.btree_node_prefetch) {
@@ -962,7 +976,6 @@ err:
return ret;
}
-
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -2790,6 +2803,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
+ if (trans->used_mempool) {
+ if (trans->mem_bytes >= new_bytes)
+ goto out_change_top;
+
+ /* No more space from mempool item, need malloc new one */
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = false;
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ goto out_new_mem;
+ }
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
@@ -2798,6 +2836,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = true;
kfree(trans->mem);
}
@@ -2811,7 +2851,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (ret)
return ERR_PTR(ret);
}
-
+out_new_mem:
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
@@ -2819,7 +2859,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
-
+out_change_top:
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -3093,7 +3133,7 @@ void bch2_trans_put(struct btree_trans *trans)
if (paths_allocated != trans->_paths_allocated)
kvfree_rcu_mightsleep(paths_allocated);
- if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+ if (trans->used_mempool)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
kfree(trans->mem);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 50e04356d72c..5cbcbfe85235 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -261,6 +261,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
return bch2_journal_key_insert(c, id, level, &whiteout);
}
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ return (idx < keys->size &&
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ bkey_deleted(&keys->data[idx].k->k));
+}
+
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
@@ -363,7 +379,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
- struct bkey_s_c btree_k, journal_k, ret;
+ struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
if (iter->prefetch && iter->journal.level)
btree_and_journal_iter_prefetch(iter);
@@ -375,9 +391,10 @@ again:
bpos_lt(btree_k.k->p, iter->pos))
bch2_journal_iter_advance_btree(iter);
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
+ if (iter->trans->journal_replay_not_finished)
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
ret = journal_k.k &&
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
@@ -435,7 +452,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
- list_add(&iter->journal.list, &trans->c->journal_iters);
+ if (trans->journal_replay_not_finished &&
+ !test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
@@ -548,3 +567,22 @@ int bch2_journal_keys_sort(struct bch_fs *c)
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}
+
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+ unsigned level_min, unsigned level_max,
+ struct bpos start, struct bpos end)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t dst = 0;
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i)
+ if (!(i->btree_id == btree &&
+ i->level >= level_min &&
+ i->level <= level_max &&
+ bpos_ge(i->k->k.p, start) &&
+ bpos_le(i->k->k.p, end)))
+ keys->data[dst++] = *i;
+ keys->nr = keys->gap = dst;
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index c9d19da3ea04..af25046ebcaa 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -40,8 +40,8 @@ int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
+bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
@@ -66,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);
+void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
+ unsigned, unsigned,
+ struct bpos, struct bpos);
+
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
new file mode 100644
index 000000000000..3f33be7e5e5c
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.c
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/sort.h>
+
+struct find_btree_nodes_worker {
+ struct closure *cl;
+ struct find_btree_nodes *f;
+ struct bch_dev *ca;
+};
+
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+ prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
+ bch2_bpos_to_text(out, n->min_key);
+ prt_str(out, "-");
+ bch2_bpos_to_text(out, n->max_key);
+
+ if (n->range_updated)
+ prt_str(out, " range updated");
+ if (n->overwritten)
+ prt_str(out, " overwritten");
+
+ for (unsigned i = 0; i < n->nr_ptrs; i++) {
+ prt_char(out, ' ');
+ bch2_extent_ptr_to_text(out, c, n->ptrs + i);
+ }
+}
+
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+ printbuf_indent_add(out, 2);
+ darray_for_each(nodes, i) {
+ found_btree_node_to_text(out, c, i);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+ struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+ set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+ bp->k.p = f->max_key;
+ bp->v.seq = cpu_to_le64(f->cookie);
+ bp->v.sectors_written = 0;
+ bp->v.flags = 0;
+ bp->v.min_key = f->min_key;
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+ memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+ const struct found_btree_node *f)
+{
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+
+ found_btree_node_to_key(&k.k, f);
+
+ struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+ bool ret = !IS_ERR_OR_NULL(b);
+ if (ret)
+ six_unlock_read(&b->c.lock);
+
+ /*
+ * We might update this node's range; if that happens, we need the node
+ * to be re-read so the read path can trim keys that are no longer in
+ * this node
+ */
+ if (b != btree_node_root(trans->c, b))
+ bch2_btree_node_evict(trans, &k.k);
+ return ret;
+}
+
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ cmp_int(l->cookie, r->cookie);
+}
+
+/*
+ * Given two found btree nodes, if their sequence numbers are equal, take the
+ * one that's readable:
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+ const struct found_btree_node *r)
+{
+ return cmp_int(l->seq, r->seq);
+}
+
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->min_key, r->min_key) ?:
+ -found_btree_node_cmp_time(l, r);
+}
+
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+ struct bio *bio, struct btree_node *bn, u64 offset)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, bn, PAGE_SIZE);
+
+ submit_bio_wait(bio);
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status)))
+ return;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return;
+
+ rcu_read_lock();
+ struct found_btree_node n = {
+ .btree_id = BTREE_NODE_ID(bn),
+ .level = BTREE_NODE_LEVEL(bn),
+ .seq = BTREE_NODE_SEQ(bn),
+ .cookie = le64_to_cpu(bn->keys.seq),
+ .min_key = bn->min_key,
+ .max_key = bn->max_key,
+ .nr_ptrs = 1,
+ .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .ptrs[0].offset = offset,
+ .ptrs[0].dev = ca->dev_idx,
+ .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
+ };
+ rcu_read_unlock();
+
+ if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ mutex_lock(&f->lock);
+ if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+ bch_err(c, "try_read_btree_node() can't handle endian conversion");
+ f->ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (darray_push(&f->nodes, n))
+ f->ret = -ENOMEM;
+unlock:
+ mutex_unlock(&f->lock);
+ }
+}
+
+static int read_btree_nodes_worker(void *p)
+{
+ struct find_btree_nodes_worker *w = p;
+ struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+ struct bch_dev *ca = w->ca;
+ void *buf = (void *) __get_free_page(GFP_KERNEL);
+ struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+ unsigned long last_print = jiffies;
+
+ if (!buf || !bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
+
+ for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+ for (unsigned bucket_offset = 0;
+ bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+ bucket_offset += btree_sectors(c)) {
+ if (time_after(jiffies, last_print + HZ * 30)) {
+ u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+ u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+ bch_info(ca, "%s: %2u%% done", __func__,
+ (unsigned) div64_u64(cur_sector * 100, end_sector));
+ last_print = jiffies;
+ }
+
+ try_read_btree_node(w->f, ca, bio, buf,
+ bucket * ca->mi.bucket_size + bucket_offset);
+ }
+err:
+ bio_put(bio);
+ free_page((unsigned long) buf);
+ percpu_ref_get(&ca->io_ref);
+ closure_put(w->cl);
+ kfree(w);
+ return 0;
+}
+
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ for_each_online_member(c, ca) {
+ struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+ struct task_struct *t;
+
+ if (!w) {
+ percpu_ref_put(&ca->io_ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ percpu_ref_get(&ca->io_ref);
+ closure_get(&cl);
+ w->cl = &cl;
+ w->f = f;
+ w->ca = ca;
+
+ t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ ret = IS_ERR_OR_NULL(t);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ closure_put(&cl);
+ f->ret = ret;
+ bch_err(c, "error starting kthread: %i", ret);
+ break;
+ }
+ }
+err:
+ closure_sync(&cl);
+ return f->ret ?: ret;
+}
+
+static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+{
+ while (n + 1 < end &&
+ found_btree_node_cmp_pos(n, n + 1) > 0) {
+ swap(n[0], n[1]);
+ n++;
+ }
+}
+
+static int handle_overwrites(struct bch_fs *c,
+ struct found_btree_node *start,
+ struct found_btree_node *end)
+{
+ struct found_btree_node *n;
+again:
+ for (n = start + 1;
+ n < end &&
+ n->btree_id == start->btree_id &&
+ n->level == start->level &&
+ bpos_lt(n->min_key, start->max_key);
+ n++) {
+ int cmp = found_btree_node_cmp_time(start, n);
+
+ if (cmp > 0) {
+ if (bpos_cmp(start->max_key, n->max_key) >= 0)
+ n->overwritten = true;
+ else {
+ n->range_updated = true;
+ n->min_key = bpos_successor(start->max_key);
+ n->range_updated = true;
+ bubble_up(n, end);
+ goto again;
+ }
+ } else if (cmp < 0) {
+ BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+
+ start->max_key = bpos_predecessor(n->min_key);
+ start->range_updated = true;
+ } else {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
+ found_btree_node_to_text(&buf, c, start);
+ prt_str(&buf, "\n ");
+ found_btree_node_to_text(&buf, c, n);
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+ struct printbuf buf = PRINTBUF;
+ size_t dst;
+ int ret = 0;
+
+ if (f->nodes.nr)
+ return 0;
+
+ mutex_init(&f->lock);
+
+ ret = read_btree_nodes(f);
+ if (ret)
+ return ret;
+
+ if (!f->nodes.nr) {
+ bch_err(c, "%s: no btree nodes found", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+ if (prev &&
+ prev->cookie == i->cookie) {
+ if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+ bch_err(c, "%s: found too many replicas for btree node", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+ prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+ } else {
+ f->nodes.data[dst++] = *i;
+ }
+ }
+ f->nodes.nr = dst;
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ if (i->overwritten)
+ continue;
+
+ ret = handle_overwrites(c, i, &darray_top(f->nodes));
+ if (ret)
+ goto err;
+
+ BUG_ON(i->overwritten);
+ f->nodes.data[dst++] = *i;
+ }
+ f->nodes.nr = dst;
+
+ if (c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->max_key, r->min_key);
+}
+
+#define for_each_found_btree_node_in_range(_f, _search, _idx) \
+ for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
+ sizeof((_f)->nodes.data[0]), \
+ found_btree_node_range_start_cmp, &search); \
+ _idx < (_f)->nodes.nr && \
+ (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
+ (_f)->nodes.data[_idx].level == _search.level && \
+ bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
+ _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ struct found_btree_node search = {
+ .btree_id = b->c.btree_id,
+ .level = b->c.level,
+ .min_key = b->data->min_key,
+ .max_key = b->key.k.p,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx)
+ if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
+ return true;
+ return false;
+}
+
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = 0,
+ .min_key = POS_MIN,
+ .max_key = SPOS_MAX,
+ };
+
+ for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
+ return true;
+ return false;
+}
+
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos node_min, struct bpos node_max)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+ bch2_bpos_to_text(&buf, node_min);
+ prt_str(&buf, " - ");
+ bch2_bpos_to_text(&buf, node_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = level,
+ .min_key = node_min,
+ .max_key = node_max,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx) {
+ struct found_btree_node n = f->nodes.data[idx];
+
+ n.range_updated |= bpos_lt(n.min_key, node_min);
+ n.min_key = bpos_max(n.min_key, node_min);
+
+ n.range_updated |= bpos_gt(n.max_key, node_max);
+ n.max_key = bpos_min(n.max_key, node_max);
+
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, &n);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+
+ BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+
+ ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+ darray_exit(&f->nodes);
+}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
new file mode 100644
index 000000000000..08687b209787
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
new file mode 100644
index 000000000000..abb7b27d556a
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+struct found_btree_node {
+ bool range_updated:1;
+ bool overwritten:1;
+ u8 btree_id;
+ u8 level;
+ u32 seq;
+ u64 cookie;
+
+ struct bpos min_key;
+ struct bpos max_key;
+
+ unsigned nr_ptrs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+};
+
+typedef DARRAY(struct found_btree_node) found_btree_nodes;
+
+struct find_btree_nodes {
+ int ret;
+ struct mutex lock;
+ found_btree_nodes nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 30d69a6d133e..aa9da4970740 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
@@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
switch (ret) {
case -BCH_ERR_btree_insert_btree_node_full:
@@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* flag
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index a4b40c1656a5..8e47e260eba5 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans,
struct bkey_i *update;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
update = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(update);
if (ret)
@@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans,
struct bch_fs *c = trans->c;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
if (ret < 0)
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index b2f5f2e50f7e..32397b99752f 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
@@ -18,12 +19,20 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"
#include <linux/random.h>
+const char * const bch2_btree_update_modes[] = {
+#define x(t) #t,
+ BCH_WATERMARKS()
+#undef x
+ NULL
+};
+
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
@@ -44,56 +53,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
return path_idx;
}
-/* Debug code: */
-
/*
* Verify that child nodes correctly span parent node's range:
*/
-static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bpos next_node = b->data->min_key;
- struct btree_node_iter iter;
+ struct bch_fs *c = trans->c;
+ struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+ : b->data->min_key;
+ struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_btree_ptr_v2 bp;
- struct bkey unpacked;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
+ struct bkey_buf prev;
+ int ret = 0;
- BUG_ON(!b->c.level);
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
+ if (!b->c.level)
+ return 0;
- bch2_btree_node_iter_init_from_start(&iter, b);
+ bch2_bkey_buf_init(&prev);
+ bkey_init(&prev.k->k);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- while (1) {
- k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
- break;
- bp = bkey_s_c_to_btree_ptr_v2(k);
+ goto out;
- if (!bpos_eq(next_node, bp.v->min_key)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, next_node);
- bch2_bpos_to_text(&buf2, bp.v->min_key);
- panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
- }
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- bch2_btree_node_iter_advance(&iter, b);
+ struct bpos expected_min = bkey_deleted(&prev.k->k)
+ ? node_min
+ : bpos_successor(prev.k->k.p);
- if (bch2_btree_node_iter_end(&iter)) {
- if (!bpos_eq(k.k->p, b->key.k.p)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, b->key.k.p);
- bch2_bpos_to_text(&buf2, k.k->p);
- panic("expected end %s got %s\n", buf1.buf, buf2.buf);
- }
- break;
+ if (!bpos_eq(expected_min, bp.v->min_key)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "end of prev node doesn't match start of next node\n"),
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n prev ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_str(&buf, "\n next ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
+ goto topology_repair;
}
- next_node = bpos_successor(k.k->p);
+ bch2_bkey_buf_reassemble(&prev, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
+ if (bkey_deleted(&prev.k->k)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "empty interior node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ goto topology_repair;
+ } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "last child node doesn't end at end of parent node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n last key ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+
+ need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
+ goto topology_repair;
}
-#endif
+out:
+fsck_err:
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev, c);
+ printbuf_exit(&buf);
+ return ret;
+topology_repair:
+ if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
+ bch2_inconsistent_error(c);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ } else {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ }
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -254,7 +310,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
? BTREE_NODE_RESERVE
: 0;
int ret;
@@ -638,7 +694,7 @@ static void btree_update_nodes_written(struct btree_update *as)
* which may require allocations as well.
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
@@ -797,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->mode = BTREE_UPDATE_node;
as->b = b;
set_btree_node_write_blocked(b);
@@ -824,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as,
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
- child->mode = BTREE_INTERIOR_UPDATING_AS;
+ child->mode = BTREE_UPDATE_update;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
@@ -835,7 +891,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -849,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1027,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
- BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode == BTREE_UPDATE_none);
if (as->took_gc_lock)
up_read(&as->c->gc_lock);
@@ -1072,7 +1128,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim)
+ watermark < BCH_WATERMARK_reclaim)
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
ret = drop_locks_do(trans,
@@ -1123,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c;
as->start_time = start_time;
as->ip_started = _RET_IP_;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
+ as->mode = BTREE_UPDATE_none;
+ as->watermark = watermark;
as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
@@ -1168,7 +1225,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
*/
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
@@ -1380,9 +1437,16 @@ static void __btree_split_node(struct btree_update *as,
if (bkey_deleted(k))
continue;
+ uk = bkey_unpack_key(b, k);
+
+ if (b->c.level &&
+ u64s < n1_u64s &&
+ u64s + k->u64s >= n1_u64s &&
+ bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
+ n1_u64s += k->u64s;
+
i = u64s >= n1_u64s;
u64s += k->u64s;
- uk = bkey_unpack_key(b, k);
if (!i)
n1_pos = uk.p;
bch2_bkey_format_add_key(&format[i], &uk);
@@ -1441,8 +1505,7 @@ static void __btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n[i]);
- if (b->c.level)
- btree_node_interior_verify(as->c, n[i]);
+ BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
}
}
@@ -1473,7 +1536,7 @@ static void btree_split_insert_keys(struct btree_update *as,
__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- btree_node_interior_verify(as->c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
}
}
@@ -1488,9 +1551,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_verify_btree_nr_keys(b);
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
@@ -1710,7 +1778,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
goto split;
}
- btree_node_interior_verify(c, b);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ return ret;
+ }
bch2_btree_insert_keys_interior(as, trans, path, b, keys);
@@ -1728,7 +1800,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_unlock_write(trans, path, b);
- btree_node_interior_verify(c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
return 0;
split:
/*
@@ -1818,9 +1890,12 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path,
{
struct bch_fs *c = trans->c;
struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+
+ if (btree_node_fake(b))
+ return bch2_btree_split_leaf(trans, path, flags);
+
struct btree_update *as =
- bch2_btree_update_start(trans, trans->paths + path,
- b->c.level, true, flags);
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -2391,7 +2466,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b);
}
-static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
{
struct bch_fs *c = trans->c;
struct closure cl;
@@ -2410,7 +2485,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
- b->c.level = 0;
+ b->c.level = level;
b->c.btree_id = id;
bkey_btree_ptr_init(&b->key);
@@ -2437,9 +2512,21 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
return 0;
}
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
+{
+ bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
+}
+
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
- bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+ prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ (void *) as->ip_started,
+ bch2_btree_id_str(as->btree_id),
+ bch2_watermarks[as->watermark],
+ bch2_btree_update_modes[as->mode],
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2448,12 +2535,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- (void *) as->ip_started,
- as->mode,
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
+ bch2_btree_update_to_text(out, as);
mutex_unlock(&c->btree_interior_update_lock);
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index f651dd48aaa0..88dcf5a22a3b 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -10,6 +10,20 @@
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
+
+#define BTREE_UPDATE_MODES() \
+ x(none) \
+ x(node) \
+ x(root) \
+ x(update)
+
+enum btree_update_mode {
+#define x(n) BTREE_UPDATE_##n,
+ BTREE_UPDATE_MODES()
+#undef x
+};
+
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@@ -37,14 +51,8 @@ struct btree_update {
struct list_head list;
struct list_head unwritten_list;
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
+ enum btree_update_mode mode;
+ enum bch_watermark watermark;
unsigned nodes_written:1;
unsigned took_gc_lock:1;
@@ -54,7 +62,7 @@ struct btree_update {
struct disk_reservation disk_res;
/*
- * BTREE_INTERIOR_UPDATING_NODE:
+ * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
@@ -163,7 +171,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
struct bkey_i *, unsigned, bool);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 5cbad8445782..baf63e2fddb6 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -11,6 +11,7 @@
#include "journal_reclaim.h"
#include <linux/prefetch.h>
+#include <linux/sort.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
@@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
#endif
}
+static int wb_key_seq_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
/* Compare excluding idx, the low 24 bits: */
static inline bool wb_key_eq(const void *_l, const void *_r)
{
@@ -357,6 +366,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+ sort(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
+
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
continue;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 96edf2c34d43..941401a210f5 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -525,6 +525,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
bch2_data_type_str(data_type))) {
+ BUG();
ret = -EIO;
goto err;
}
@@ -628,6 +629,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ BUG();
ret = -EIO;
goto err;
}
@@ -815,14 +817,14 @@ static int __mark_pointer(struct btree_trans *trans,
static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
- s64 *sectors,
- unsigned flags)
+ const union bch_extent_entry *entry,
+ s64 *sectors, unsigned flags)
{
bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
struct bpos bucket;
struct bch_backpointer bp;
- bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp);
*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
@@ -851,7 +853,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_GC) {
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
percpu_down_read(&c->mark_lock);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
@@ -979,7 +981,7 @@ static int __trigger_extent(struct btree_trans *trans,
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
if (ret < 0)
return ret;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 6387e039f789..00aaf4bb5139 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
fallthrough;
case BCH_WATERMARK_btree_copygc:
case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
break;
}
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 38defa19d52d..cbfa6459bdbc 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,7 +7,7 @@
#include "chardev.h"
#include "journal.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 4150feca42a2..34731ee0217f 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -14,6 +14,7 @@
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"
@@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned ptrs_locked = 0;
int ret = 0;
+ /*
+ * fs is corrupt we have a key for a snapshot node that doesn't exist,
+ * and we have to check for this because we go rw before repairing the
+ * snapshots table - just skip it, we can move it later.
+ */
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ return -BCH_ERR_data_update_done;
+
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
@@ -571,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans,
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
- (!atomic_read(&ctxt->read_sectors) &&
- !atomic_read(&ctxt->write_sectors)));
+ list_empty(&ctxt->ios));
if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks,
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index af25d8ec60f2..01a79fa3eacb 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -252,7 +252,8 @@
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
- x(0, need_inode_lock)
+ x(0, need_inode_lock) \
+ x(0, invalid_snapshot_node)
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 043431206799..82a6656c941c 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,7 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
-#include "recovery.h"
+#include "journal.h"
+#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
@@ -16,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+ journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index ae1d6674c512..36caedf72d89 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -32,6 +32,12 @@ bool bch2_inconsistent_error(struct bch_fs *);
int bch2_topology_error(struct bch_fs *);
+#define bch2_fs_topology_error(c, ...) \
+({ \
+ bch_err(c, "btree topology error: " __VA_ARGS__); \
+ bch2_topology_error(c); \
+})
+
#define bch2_fs_inconsistent(c, ...) \
({ \
bch_err(c, __VA_ARGS__); \
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 61395b113df9..0e3ca99fbd2d 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
- btree_ptr_v2_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
+ c, err, btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+ bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
+ c, err, btree_ptr_v2_min_key_bad,
+ "min_key > key");
+
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
@@ -973,6 +978,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
return bkey_deleted(k.k);
}
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
+{
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
+
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (ca && ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -988,31 +1018,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr: {
- const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
- struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
- ? bch_dev_bkey_exists(c, ptr->dev)
- : NULL;
-
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- if (ca && ptr_stale(ca, ptr))
- prt_printf(out, " stale");
- }
+ case BCH_EXTENT_ENTRY_ptr:
+ bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
break;
- }
+
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128: {
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index fd2669cdd76f..528e817eacbd 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -596,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
return ret;
}
-static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
-
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
@@ -700,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
new file mode 100644
index 000000000000..4ce5e957a6e9
--- /dev/null
+++ b/fs/bcachefs/eytzinger.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "eytzinger.h"
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES (swap_r_func_t)2
+#define SWAP_WRAPPER (swap_r_func_t)3
+
+struct wrapper {
+ cmp_func_t cmp;
+ swap_func_t swap;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+ if (swap_func == SWAP_WRAPPER) {
+ ((const struct wrapper *)priv)->swap(a, b, (int)size);
+ return;
+ }
+
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+ if (cmp == _CMP_WRAPPER)
+ return ((const struct wrapper *)priv)->cmp(a, b);
+ return cmp(a, b, priv);
+}
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func, const void *priv,
+ size_t l, size_t r)
+{
+ return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+ swap_r_func_t swap_func, const void *priv,
+ size_t l, size_t r)
+{
+ do_swap(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ int i, c, r;
+
+ /* called from 'sort' without swap function, let's pick the default */
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+ swap_func = NULL;
+
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < n &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < i &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap = swap_func,
+ };
+
+ return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index b04750dbf870..ee0e2df33322 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -5,23 +5,33 @@
#include <linux/bitops.h>
#include <linux/log2.h>
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
/*
* Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
*
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
+ *
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child;
}
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
/*
* sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
- n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr);
if (n & 1) {
/* @i was greater than @search, return previous node: */
-
- if (i == eytzinger0_first(nr))
- return -1;
-
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
+static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+ return eytzinger0_next(idx, size);
+}
+
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
@@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
int _res; \
\
while (_i < _nr && \
- (_res = _cmp(_search, _base + _i * _size, _size))) \
+ (_res = _cmp(_search, _base + _i * _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
})
-void eytzinger0_sort(void *, size_t, size_t,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+ cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 33cb6da3a5ad..f49e6c0f0f68 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -536,7 +536,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
if (likely(!dio->iter.count) || dio->op.error)
break;
- bio_reset(bio, NULL, REQ_OP_WRITE);
+ bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
}
out:
return bch2_dio_write_done(dio);
@@ -618,7 +618,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 0ccee05f6887..b5ea9fa1259d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1997,6 +1997,7 @@ out:
return dget(sb->s_root);
err_put_super:
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
return ERR_PTR(bch2_err_class(ret));
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 47d4eefaba7b..8e2010212cc3 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,7 +12,7 @@
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
- int ret;
-
- ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+ int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
@@ -158,9 +156,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
err:
bch_err_fn(c, ret);
@@ -169,7 +168,8 @@ err:
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound)
+ struct bch_inode_unpacked *lostfound,
+ u64 reattaching_inum)
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
@@ -184,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
return ret;
subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
- u32 subvol_snapshot;
- ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
- &subvol_snapshot, &root_inum.inum);
- bch_err_msg(c, ret, "looking up root subvol");
+ struct bch_subvolume subvol;
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
+ false, 0, &subvol);
+ bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
+ le32_to_cpu(st.master_subvol), snapshot);
if (ret)
return ret;
+ if (!subvol.inode) {
+ struct btree_iter iter;
+ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.inode = cpu_to_le64(reattaching_inum);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ root_inum.inum = le64_to_cpu(subvol.inode);
+
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
u32 root_inode_snapshot = snapshot;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
- bch_err_msg(c, ret, "looking up root inode");
+ bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+ root_inum.inum, le32_to_cpu(st.master_subvol));
if (ret)
return ret;
@@ -292,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
}
- ret = lookup_lostfound(trans, dirent_snapshot, &lostfound);
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
if (ret)
return ret;
@@ -363,6 +380,112 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret;
}
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+ bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ /*
+ * If inum isn't set, that means we're being called from check_dirents,
+ * not check_inodes - the root of this subvolume doesn't exist or we
+ * would have found it there:
+ */
+ if (!inum) {
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked new_inode;
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+ new_inode.bi_subvol = subvolid;
+
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, &new_inode);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ if (ret)
+ return ret;
+
+ inum = new_inode.bi_inum;
+ }
+
+ bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+ struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ int ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->k.p.offset = subvolid;
+ new_subvol->v.snapshot = cpu_to_le32(snapshotid);
+ new_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+ if (ret)
+ return ret;
+
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, snapshotid),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+ if (ret)
+ return ret;
+
+ u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+ s->v.subvol = cpu_to_le32(subvolid);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+
+ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(st);
+ bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+ if (ret)
+ return ret;
+
+ if (!st->v.master_subvol)
+ st->v.master_subvol = cpu_to_le32(subvolid);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked new_inode;
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
+ new_inode.bi_size = size;
+ new_inode.bi_inum = inum;
+
+ return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+}
+
+static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
+{
+ struct btree_iter iter = {};
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
+}
+
struct snapshots_seen_entry {
u32 id;
u32 equiv;
@@ -1064,6 +1187,11 @@ static int check_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
+ if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+ goto do_update;
+ }
+
if (fsck_err_on(ret,
c, inode_bi_subvol_missing,
"inode %llu:%u bi_subvol points to missing subvolume %u",
@@ -1081,7 +1209,7 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
}
-
+do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
@@ -1130,8 +1258,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
i->count = count2;
if (i->count != count2) {
- bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
return -BCH_ERR_internal_fsck_err;
}
@@ -1371,10 +1499,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
goto err;
}
- ret = extent_ends_at(c, extent_ends, seen, k);
- if (ret)
- goto err;
-
extent_ends->last_pos = k.k->p;
err:
return ret;
@@ -1438,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
@@ -1504,6 +1639,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
i->seen_this_pos = true;
}
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = extent_ends_at(c, extent_ends, s, k);
+ if (ret)
+ goto err;
+ }
out:
err:
fsck_err:
@@ -1584,8 +1725,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
return count2;
if (i->count != count2) {
- bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
- i->count, count2);
+ bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
@@ -1782,6 +1923,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
u32 parent_snapshot;
+ u32 new_parent_subvol = 0;
u64 parent_inum;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -1790,6 +1932,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
+ if (ret ||
+ (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+ int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+ if (ret2 && !bch2_err_matches(ret, ENOENT))
+ return ret2;
+ }
+
+ if (ret &&
+ !new_parent_subvol &&
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ /*
+ * Couldn't find a subvol for dirent's snapshot - but we lost
+ * subvols, so we need to reconstruct:
+ */
+ ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+ if (ret)
+ return ret;
+
+ parent_snapshot = d.k->p.snapshot;
+ }
+
if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
"dirent parent_subvol points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
@@ -1798,10 +1961,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
parent_snapshot,
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- u32 new_parent_subvol;
- ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
- if (ret)
- goto err;
+ if (!new_parent_subvol) {
+ bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
ret = PTR_ERR_OR_ZERO(new_dirent);
@@ -1847,9 +2010,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
+ goto err;
+
+ if (ret) {
+ bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = 0;
+ goto err;
+ }
- if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
+ if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
c, inode_bi_parent_wrong,
"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
target_inum,
@@ -1857,13 +2027,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
subvol_root.bi_parent_subvol = parent_subvol;
ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
if (ret)
- return ret;
+ goto err;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
- return ret;
+ goto err;
out:
err:
fsck_err:
@@ -1880,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
struct bpos equiv;
@@ -1919,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ dir->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
@@ -1953,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_dirent)
goto out;
- d = bkey_s_c_to_dirent(k);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
ret = check_dirent_to_subvol(trans, iter, d);
@@ -2098,17 +2278,21 @@ static int check_root_trans(struct btree_trans *trans)
if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
"root subvol missing")) {
- struct bkey_i_subvolume root_subvol;
+ struct bkey_i_subvolume *root_subvol =
+ bch2_trans_kmalloc(trans, sizeof(*root_subvol));
+ ret = PTR_ERR_OR_ZERO(root_subvol);
+ if (ret)
+ goto err;
snapshot = U32_MAX;
inum = BCACHEFS_ROOT_INO;
- bkey_subvolume_init(&root_subvol.k_i);
- root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol.v.flags = 0;
- root_subvol.v.snapshot = cpu_to_le32(snapshot);
- root_subvol.v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
+ bkey_subvolume_init(&root_subvol->k_i);
+ root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol->v.flags = 0;
+ root_subvol->v.snapshot = cpu_to_le32(snapshot);
+ root_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 2b5e06770ab3..ca4a066e9a54 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -552,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
prt_newline(out);
- prt_newline(out);
prt_printf(out, "bi_version=%llu", inode->bi_version);
+ prt_newline(out);
#define x(_name, _bits) \
prt_printf(out, #_name "=%llu", (u64) inode->_name); \
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 1baf78594cca..82f9170dab3f 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
ret = 0;
err:
bch2_logged_op_finish(trans, op_k);
+ bch_err_fn(c, ret);
return ret;
}
@@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish:
break;
}
err:
+ bch_err_fn(c, ret);
bch2_logged_op_finish(trans, op_k);
bch2_trans_iter_exit(trans, &iter);
return ret;
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index b5303874fc35..37a024e034d4 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -95,8 +95,7 @@ out:
return ret ?: bch2_blacklist_table_initialize(c);
}
-static int journal_seq_blacklist_table_cmp(const void *_l,
- const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 9fac838d123e..b82f8209041f 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
- int ret;
if (!fn)
return 0;
@@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
- fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+ fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+
+ return trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index db63b3f3b338..4c298e74723d 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -136,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
- s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_3(struct kunit *test)
+static void mean_and_variance_test_2(struct kunit *test)
{
s64 d[] = { 100, 100, 100, 100, 100 };
s64 mean[] = { 22, 32, 40, 46, 50 };
@@ -161,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_4(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 10, 11, 12, 13, 14 };
- s64 stddev[] = { 9, 13, 15, 17, 19 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
static void mean_and_variance_fast_divpow2(struct kunit *test)
{
s64 i;
@@ -230,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = {
KUNIT_CASE(mean_and_variance_weighted_advanced_test),
KUNIT_CASE(mean_and_variance_test_1),
KUNIT_CASE(mean_and_variance_test_2),
- KUNIT_CASE(mean_and_variance_test_3),
- KUNIT_CASE(mean_and_variance_test_4),
{}
};
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 08ea0cfc4aef..e1800c4119b5 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -7,6 +7,7 @@
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -205,6 +206,9 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = ARRAY_SIZE(_choices), \
.choices = _choices
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = U64_MAX, \
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 136083c11f3a..1ac4135cca1c 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -362,12 +362,17 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't replay the journal") \
- x(keep_journal, u8, \
+ NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_pass_last, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR_NOLIMIT(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Exit recovery after specified pass") \
+ x(retain_recovery_info, u8, \
0, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys after startup")\
+ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 03f9d6afe467..b76c16152579 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,35 +1,31 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
#include "alloc_background.h"
-#include "btree_gc.h"
+#include "bkey_buf.h"
#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
-#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
-#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
-#include "lru.h"
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
-#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
@@ -37,6 +33,20 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
+{
+ u64 b = BIT_ULL(btree);
+
+ if (!(c->sb.btrees_lost_data & b)) {
+ bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+
+ mutex_lock(&c->sb_lock);
+ bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+}
+
static bool btree_id_is_alloc(enum btree_id id)
{
switch (id) {
@@ -52,7 +62,7 @@ static bool btree_id_is_alloc(enum btree_id id)
}
/* for -o reconstruct_alloc: */
-static void do_reconstruct_alloc(struct bch_fs *c)
+static void bch2_reconstruct_alloc(struct bch_fs *c)
{
bch2_journal_log_msg(c, "dropping alloc info");
bch_info(c, "dropping and reconstructing all alloc info");
@@ -87,15 +97,17 @@ static void do_reconstruct_alloc(struct bch_fs *c)
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- struct journal_keys *keys = &c->journal_keys;
- size_t src, dst;
- move_gap(keys, keys->nr);
-
- for (src = 0, dst = 0; src < keys->nr; src++)
- if (!btree_id_is_alloc(keys->data[src].btree_id))
- keys->data[dst++] = keys->data[src];
- keys->nr = keys->gap = dst;
+ bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
/*
@@ -186,7 +198,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq);
}
-static int bch2_journal_replay(struct bch_fs *c)
+int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
DARRAY(struct journal_key *) keys_sorted = { 0 };
@@ -194,6 +206,7 @@ static int bch2_journal_replay(struct bch_fs *c)
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
struct btree_trans *trans = bch2_trans_get(c);
+ bool immediate_flush = false;
int ret = 0;
if (keys->nr) {
@@ -215,6 +228,13 @@ static int bch2_journal_replay(struct bch_fs *c)
darray_for_each(*keys, k) {
cond_resched();
+ /*
+ * k->allocated means the key wasn't read in from the journal,
+ * rather it was from early repair code
+ */
+ if (k->allocated)
+ immediate_flush = true;
+
/* Skip fastpath if we're low on space in the journal */
ret = c->journal.watermark ? -1 :
commit_do(trans, NULL, NULL,
@@ -266,7 +286,8 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
- if (!c->opts.keep_journal)
+ if (!c->opts.retain_recovery_info &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -274,6 +295,12 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
+ /* if we did any repair, flush it immediately */
+ if (immediate_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_journal_meta(&c->journal);
+ }
+
if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
@@ -423,10 +450,9 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c)
{
- unsigned i;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->alive)
@@ -435,186 +461,46 @@ static int read_btree_roots(struct bch_fs *c)
if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue;
- if (r->error) {
- __fsck_err(c,
- btree_id_is_alloc(i)
- ? FSCK_CAN_IGNORE : 0,
- btree_root_bkey_invalid,
- "invalid btree root %s",
- bch2_btree_id_str(i));
- if (i == BTREE_ID_alloc)
+ if (mustfix_fsck_err_on((ret = r->error),
+ c, btree_root_bkey_invalid,
+ "invalid btree root %s",
+ bch2_btree_id_str(i)) ||
+ mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+ c, btree_root_read_error,
+ "error reading btree root %s l=%u: %s",
+ bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
+ if (btree_id_is_alloc(i)) {
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- }
+ r->error = 0;
+ } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+ bch_info(c, "will run btree node scan");
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ }
- ret = bch2_btree_root_read(c, i, &r->key, r->level);
- if (ret) {
- fsck_err(c,
- btree_root_read_error,
- "error reading btree root %s",
- bch2_btree_id_str(i));
- if (btree_id_is_alloc(i))
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
ret = 0;
+ bch2_btree_lost_data(c, i);
}
}
- for (i = 0; i < BTREE_ID_NR; i++) {
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (!r->b) {
+ if (!r->b && !r->error) {
r->alive = false;
r->level = 0;
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
}
}
fsck_err:
return ret;
}
-static int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-noinline_for_stack
-static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-static int bch2_check_allocations(struct bch_fs *c)
-{
- return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
- * After we go RW, the journal keys buffer can't be modified (except for
- * setting journal_key->overwritten: it will be accessed by multiple
- * threads
- */
- move_gap(keys, keys->nr);
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (keys->nr || c->opts.fsck || !c->sb.clean)
- return bch2_fs_read_write_early(c);
- return 0;
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
static bool check_version_upgrade(struct bch_fs *c)
{
unsigned latest_version = bcachefs_metadata_version_current;
@@ -687,96 +573,6 @@ static bool check_version_upgrade(struct bch_fs *c)
return false;
}
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
- return false;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
-static int bch2_run_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
- unsigned pass = c->curr_recovery_pass;
-
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
- (ret && c->curr_recovery_pass < pass))
- continue;
- if (ret)
- break;
-
- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
- }
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
- }
-
- return ret;
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
-
- if (!(p->when & PASS_ONLINE))
- continue;
-
- ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
- }
- if (ret)
- break;
- }
-
- return ret;
-}
-
int bch2_fs_recovery(struct bch_fs *c)
{
struct bch_sb_field_clean *clean = NULL;
@@ -809,24 +605,14 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.fsck && c->opts.norecovery) {
- bch_err(c, "cannot select both norecovery and fsck");
- ret = -EINVAL;
- goto err;
- }
+ if (c->opts.norecovery)
+ c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
- if (!ext) {
- ret = -BCH_ERR_ENOSPC_sb;
- mutex_unlock(&c->sb_lock);
- goto err;
- }
-
if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
ext->recovery_passes_required[0] |=
cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
@@ -885,7 +671,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+ if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
struct genradix_iter iter;
struct journal_replay **i;
@@ -965,7 +751,7 @@ use_clean:
c->journal_replay_seq_end = blacklist_seq - 1;
if (c->opts.reconstruct_alloc)
- do_reconstruct_alloc(c);
+ bch2_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys);
@@ -1017,6 +803,12 @@ use_clean:
clear_bit(BCH_FS_fsck_running, &c->flags);
+ /* fsync if we fixed errors */
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
test_bit(BCH_FS_errors_fixed, &c->flags) &&
@@ -1051,6 +843,7 @@ use_clean:
}
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
@@ -1064,15 +857,18 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_error, &c->flags)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (ext &&
- (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
- memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ ext->btrees_lost_data) {
+ ext->btrees_lost_data = 0;
+ write_sb = true;
}
if (c->opts.fsck &&
@@ -1113,9 +909,10 @@ use_clean:
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.keep_journal &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ if (!c->opts.retain_recovery_info) {
bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ }
kfree(clean);
if (!ret &&
@@ -1141,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c)
int ret;
bch_notice(c, "initializing new filesystem");
+ set_bit(BCH_FS_new_fs, &c->flags);
mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
@@ -1155,11 +953,11 @@ int bch2_fs_initialize(struct bch_fs *c)
}
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
set_bit(BCH_FS_may_go_rw, &c->flags);
for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
for_each_member_device(c, ca)
bch2_dev_usage_init(ca);
@@ -1230,7 +1028,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
+ c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 4e9d24719b2e..4bf818de1f2f 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,37 +2,9 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-extern const char * const bch2_recovery_passes[];
+void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return 0;
-
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
-
- c->recovery_passes_explicit |= BIT_ULL(pass);
-
- if (c->curr_recovery_pass >= pass) {
- c->curr_recovery_pass = pass;
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
- return -BCH_ERR_restart_recovery;
- } else {
- return 0;
- }
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *);
-u64 bch2_fsck_recovery_passes(void);
+int bch2_journal_replay(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
new file mode 100644
index 000000000000..cb501460d615
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_gc.h"
+#include "btree_node_scan.h"
+#include "ec.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+ return bch2_gc(c, true, false);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten: it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys, keys->nr);
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || c->opts.fsck || !c->sb.clean)
+ return bch2_fs_read_write_early(c);
+ return 0;
+}
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return 0;
+
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->recovery_passes_explicit |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!test_bit_le64(s, ext->recovery_passes_required)) {
+ __set_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (test_bit_le64(s, ext->recovery_passes_required)) {
+ __clear_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ int ret;
+
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
+
+ return 0;
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
+ continue;
+ }
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last)
+ break;
+
+ if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+ unsigned pass = c->curr_recovery_pass;
+
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+ (ret && c->curr_recovery_pass < pass))
+ continue;
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ }
+
+ c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
+
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
+
+ c->curr_recovery_pass++;
+ }
+
+ return ret;
+}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
new file mode 100644
index 000000000000..99b464e127b8
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.h
@@ -0,0 +1,17 @@
+#ifndef _BCACHEFS_RECOVERY_PASSES_H
+#define _BCACHEFS_RECOVERY_PASSES_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *);
+int bch2_run_recovery_passes(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_passes_types.h
index 4959e95e7c74..773aea9a0080 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_TYPES_H
-#define _BCACHEFS_RECOVERY_TYPES_H
+#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
+#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
#define PASS_SILENT BIT(0)
#define PASS_FSCK BIT(1)
@@ -13,6 +13,7 @@
* must never change:
*/
#define BCH_RECOVERY_PASSES() \
+ x(scan_for_btree_nodes, 37, 0) \
x(check_topology, 4, 0) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
@@ -31,13 +32,13 @@
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
x(check_inodes, 24, PASS_FSCK) \
x(check_extents, 25, PASS_FSCK) \
x(check_indirect_extents, 26, PASS_FSCK) \
@@ -47,6 +48,7 @@
x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \
@@ -56,6 +58,7 @@ enum bch_recovery_pass {
#define x(n, id, when) BCH_RECOVERY_PASS_##n,
BCH_RECOVERY_PASSES()
#undef x
+ BCH_RECOVERY_PASS_NR
};
/* But we also need stable identifiers that can be used in the superblock */
@@ -65,4 +68,4 @@ enum bch_recovery_pass_stable {
#undef x
};
-#endif /* _BCACHEFS_RECOVERY_TYPES_H */
+#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c47c66c2b394..ff7864731a07 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -185,8 +185,7 @@ not_found:
} else {
bkey_error_init(update);
update->k.p = p.k->p;
- update->k.p.offset = next_idx;
- update->k.size = next_idx - *idx;
+ update->k.size = p.k->size;
set_bkey_val_u64s(&update->k, 0);
}
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cc2672c12031..678b9c20e251 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -6,12 +6,15 @@
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
+
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
+ size_t size = (size_t) priv;
return memcmp(l, r, size);
}
@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+ eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+ bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
verify_replicas_entry(search);
-#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
+#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search);
#undef entry_cmp
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL);
+ sort_r(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ bch2_memcmp, NULL,
+ (void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index e4396cb0bacb..d6f81179c3a2 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -7,7 +7,7 @@
#include "bcachefs.h"
#include "darray.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 5178bf579f7c..d7d609131030 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -265,7 +265,12 @@
x(subvol_children_bad, 257) \
x(subvol_loop, 258) \
x(subvol_unreachable, 259) \
- x(btree_node_bkey_bad_u64s, 260)
+ x(btree_node_bkey_bad_u64s, 260) \
+ x(btree_node_topology_empty_interior_node, 261) \
+ x(btree_ptr_v2_min_key_bad, 262) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263) \
+ x(snapshot_node_missing, 264) \
+ x(dup_backpointer_to_bad_csum_extent, 265)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 39debe814bf3..0e806f04f3d7 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -8,6 +8,7 @@
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include <linux/random.h>
@@ -93,8 +94,10 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,
static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
- while (id && id < ancestor)
- id = __snapshot_t(t, id)->parent;
+ while (id && id < ancestor) {
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ id = s ? s->parent : 0;
+ }
return id == ancestor;
}
@@ -110,6 +113,8 @@ static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancest
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return 0;
if (s->skip[2] <= ancestor)
return s->skip[2];
@@ -127,7 +132,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
rcu_read_lock();
struct snapshot_table *t = rcu_dereference(c->snapshots);
- if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) {
+ if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
goto out;
}
@@ -151,36 +156,39 @@ out:
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
- size_t new_size;
struct snapshot_table *new, *old;
- new_size = max(16UL, roundup_pow_of_two(idx + 1));
+ size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
+ size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
- new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+ new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
+ new->nr = new_size;
+
old = rcu_dereference_protected(c->snapshots, true);
if (old)
- memcpy(new->s,
- rcu_dereference_protected(c->snapshots, true)->s,
- sizeof(new->s[0]) * c->snapshot_table_size);
+ memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
rcu_assign_pointer(c->snapshots, new);
- c->snapshot_table_size = new_size;
- kvfree_rcu_mightsleep(old);
+ kvfree_rcu(old, rcu);
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ return &rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
+ struct snapshot_table *table =
+ rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock));
lockdep_assert_held(&c->snapshot_table_lock);
- if (likely(idx < c->snapshot_table_size))
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ if (likely(table && idx < table->nr))
+ return &table->s[idx];
return __snapshot_t_mut(c, id);
}
@@ -567,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
u32 subvol_id;
ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ bch_err_fn(c, ret);
+
+ if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+ ret = 0;
+ goto err;
+ }
+
if (ret)
goto err;
@@ -724,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans,
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth;
struct printbuf buf = PRINTBUF;
- bool should_have_subvol;
u32 i, id;
int ret = 0;
@@ -770,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans,
}
}
- should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s);
if (should_have_subvol) {
@@ -872,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c)
return ret;
}
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+
+ if (bch2_snapshot_equiv(c, id))
+ return 0;
+
+ u32 tree_id;
+ int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+
+ struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+ ret = PTR_ERR_OR_ZERO(snapshot);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&snapshot->k_i);
+ snapshot->k.p = POS(0, id);
+ snapshot->v.tree = cpu_to_le32(tree_id);
+ snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+
+ return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
+ bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+ enum btree_id btree;
+ struct bpos cur_pos;
+ snapshot_id_list cur_ids;
+ DARRAY(snapshot_id_list) trees;
+};
+
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+ darray_for_each(r->trees, i)
+ darray_exit(i);
+ darray_exit(&r->trees);
+ darray_exit(&r->cur_ids);
+}
+
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ return r->btree == BTREE_ID_inodes
+ ? r->cur_pos.offset == pos.offset
+ : r->cur_pos.inode == pos.inode;
+}
+
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+ darray_for_each(*l, i)
+ if (snapshot_list_has_id(r, *i))
+ return true;
+ return false;
+}
+
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+ bool first = true;
+ darray_for_each(*s, i) {
+ if (!first)
+ prt_char(out, ' ');
+ first = false;
+ prt_printf(out, "%u", *i);
+ }
+}
+
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+ if (r->cur_ids.nr) {
+ darray_for_each(r->trees, i)
+ if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+ int ret = snapshot_list_merge(c, i, &r->cur_ids);
+ if (ret)
+ return ret;
+ goto out;
+ }
+ darray_push(&r->trees, r->cur_ids);
+ darray_init(&r->cur_ids);
+ }
+out:
+ r->cur_ids.nr = 0;
+ return 0;
+}
+
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ if (!same_snapshot(r, pos))
+ snapshot_tree_reconstruct_next(c, r);
+ r->cur_pos = pos;
+ return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct snapshot_tree_reconstruct r = {};
+ int ret = 0;
+
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ if (btree_type_has_snapshots(btree)) {
+ r.btree = btree;
+
+ ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
+ get_snapshot_trees(c, &r, k.k->p);
+ }));
+ if (ret)
+ goto err;
+
+ snapshot_tree_reconstruct_next(c, &r);
+ }
+ }
+
+ darray_for_each(r.trees, t) {
+ printbuf_reset(&buf);
+ snapshot_id_list_to_text(&buf, t);
+
+ darray_for_each(*t, id) {
+ if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+ c, snapshot_node_missing,
+ "snapshot node %u from tree %s missing", *id, buf.buf)) {
+ if (t->nr > 1) {
+ bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot_exists(trans, *id));
+ if (ret)
+ goto err;
+ }
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_put(trans);
+ snapshot_tree_reconstruct_exit(&r);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1682,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c)
POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
bch_err_fn(c, ret);
+
+ /*
+ * It's important that we check if we need to reconstruct snapshots
+ * before going RW, so we mark that pass as required in the superblock -
+ * otherwise, we could end up deleting keys with missing snapshot nodes
+ * instead
+ */
+ BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+ test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (bch2_err_matches(ret, EIO) ||
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+ ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
return ret;
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 7c66ffc06385..b7d2fed37c4f 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
{
- return &t->s[U32_MAX - id];
+ u32 idx = U32_MAX - id;
+
+ return likely(t && idx < t->nr)
+ ? &t->s[idx]
+ : NULL;
}
static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
@@ -44,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
rcu_read_lock();
- id = snapshot_t(c, id)->tree;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ id = s ? s->tree : 0;
rcu_read_unlock();
return id;
@@ -52,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->parent : 0;
}
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -66,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 parent = snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ if (!s)
+ return 0;
- if (parent &&
- snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+ u32 parent = s->parent;
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) &&
+ parent &&
+ s->depth != snapshot_t(c, parent)->depth + 1)
panic("id %u depth=%u parent %u depth=%u\n",
id, snapshot_t(c, id)->depth,
parent, snapshot_t(c, parent)->depth);
return parent;
-#else
- return snapshot_t(c, id)->parent;
-#endif
}
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -116,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->equiv;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->equiv : 0;
}
static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
@@ -133,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
return id == bch2_snapshot_equiv(c, id);
}
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- bool ret;
-
rcu_read_lock();
- s = snapshot_t(c, id);
- ret = s->children[0];
+ const struct snapshot_t *s = snapshot_t(c, id);
+ int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
rcu_read_unlock();
return ret;
}
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- u32 parent = __bch2_snapshot_parent(c, id);
-
- if (!parent)
- return 0;
-
- s = snapshot_t(c, __bch2_snapshot_parent(c, id));
- if (id == s->children[0])
- return s->children[1];
- if (id == s->children[1])
- return s->children[0];
- return 0;
+ int ret = bch2_snapshot_is_internal_node(c, id);
+ if (ret < 0)
+ return ret;
+ return !ret;
}
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
@@ -218,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- int ret;
-
BUG_ON(snapshot_list_has_id(s, id));
- ret = darray_push(s, id);
+ int ret = darray_push(s, id);
if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret;
}
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret = snapshot_list_has_id(s, id)
+ ? 0
+ : darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+ darray_for_each(*src, i) {
+ int ret = snapshot_list_add_nodup(c, dst, *i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s);
int bch2_snapshot_get_subvol(struct btree_trans *, u32,
@@ -238,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
@@ -249,7 +260,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
struct bpos pos)
{
if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
return 0;
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index ce7aed121942..88a79c823276 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -595,6 +595,78 @@ err:
return ret;
}
+int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* set bi_subvol on root inode */
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
int bch2_fs_subvolumes_init(struct bch_fs *c)
{
INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index 903c05162c06..d2015d549bd2 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -37,6 +37,9 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
+int bch2_initialize_subvolumes(struct bch_fs *);
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
+
int bch2_fs_subvolumes_init(struct bch_fs *);
#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index ae644adfc391..9b10c8947828 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,6 +20,8 @@ struct snapshot_t {
};
struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
#else
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index ad28e370b640..e0aa3655b63b 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -8,7 +8,7 @@
#include "journal.h"
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
@@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c)
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext)
+ if (ext) {
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
+ c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ }
for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
@@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
kfree(errors_silent);
}
+
+ prt_printf(out, "Btrees with missing data:");
+ prt_tab(out);
+ prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+ prt_newline(out);
}
static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 1ad6e5cd9476..ed63018f21be 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -15,6 +15,7 @@
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
@@ -365,7 +366,7 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- !c->opts.norecovery) {
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -510,7 +511,8 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
- if (c->opts.norecovery)
+ if (c->opts.recovery_pass_last &&
+ c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
return -BCH_ERR_erofs_norecovery;
if (c->opts.nochanges)
@@ -535,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
@@ -559,6 +562,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
@@ -1015,8 +1019,16 @@ int bch2_fs_start(struct bch_fs *c)
for_each_online_member(c, ca)
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
mutex_unlock(&c->sb_lock);
+ if (!ext) {
+ bch_err(c, "insufficient space in superblock for sb_field_ext");
+ ret = -BCH_ERR_ENOSPC_sb;
+ goto err;
+ }
+
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 216fadf16928..92c6ad75e702 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -707,149 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
-static int alignment_ok(const void *base, size_t align)
-{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
- char t;
-
- do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- size_t l, size_t r)
-{
- return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
- void (*swap_func)(void *, void *, size_t),
- size_t l, size_t r)
-{
- swap_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t))
-{
- int i, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < n &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-
- /* sort */
- for (i = n - 1; i > 0; --i) {
- do_swap(base, n, size, swap_func, 0, i);
-
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < i &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}
-
#if 0
void eytzinger1_test(void)
{
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 175aee3074c7..b7e7c29278fc 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -631,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
-
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
@@ -797,4 +793,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr)
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
}
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+ return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+}
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/namei.c b/fs/namei.c
index ceb9ddf8dfdd..c5b2a25be7d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4050,6 +4050,8 @@ retry:
case 0: case S_IFREG:
error = vfs_create(idmap, path.dentry->d_inode,
dentry, mode, true);
+ if (!error)
+ security_path_post_mknod(idmap, dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(idmap, path.dentry->d_inode,
@@ -4060,11 +4062,6 @@ retry:
dentry, mode, 0);
break;
}
-
- if (error)
- goto out2;
-
- security_path_post_mknod(idmap, dentry);
out2:
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1a93c7fcf76c..2391ab3c3231 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3831,15 +3831,20 @@ nfsd4_create_session(struct svc_rqst *rqstp,
else
cs_slot = &unconf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
- if (status) {
- if (status == nfserr_replay_cache) {
- status = nfsd4_replay_create_session(cr_ses, cs_slot);
- goto out_free_conn;
- }
+ switch (status) {
+ case nfs_ok:
+ cs_slot->sl_seqid++;
+ cr_ses->seqid = cs_slot->sl_seqid;
+ break;
+ case nfserr_replay_cache:
+ status = nfsd4_replay_create_session(cr_ses, cs_slot);
+ fallthrough;
+ case nfserr_jukebox:
+ /* The server MUST NOT cache NFS4ERR_DELAY */
+ goto out_free_conn;
+ default:
goto out_cache_error;
}
- cs_slot->sl_seqid++;
- cr_ses->seqid = cs_slot->sl_seqid;
/* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */
if (conf) {
@@ -3859,10 +3864,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
status = mark_client_expired_locked(old);
- if (status) {
- old = NULL;
- goto out_cache_error;
- }
+ if (status)
+ goto out_expired_error;
trace_nfsd_clid_replaced(&old->cl_clientid);
}
move_to_confirmed(unconf);
@@ -3894,6 +3897,17 @@ nfsd4_create_session(struct svc_rqst *rqstp,
expire_client(old);
return status;
+out_expired_error:
+ old = NULL;
+ /*
+ * Revert the slot seq_nr change so the server will process
+ * the client's resend instead of returning a cached response.
+ */
+ if (status == nfserr_jukebox) {
+ cs_slot->sl_seqid--;
+ cr_ses->seqid = cs_slot->sl_seqid;
+ goto out_free_conn;
+ }
out_cache_error:
nfsd4_cache_create_session(cr_ses, cs_slot, status);
out_free_conn:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6a9464262fae..2e41eb4c3cec 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1852,7 +1852,7 @@ retry:
trap = lock_rename(tdentry, fdentry);
if (IS_ERR(trap)) {
err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
- goto out;
+ goto out_want_write;
}
err = fh_fill_pre_attrs(ffhp);
if (err != nfs_ok)
@@ -1922,6 +1922,7 @@ retry:
}
out_unlock:
unlock_rename(tdentry, fdentry);
+out_want_write:
fh_drop_write(ffhp);
/*
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8ba..7b4db9c56e6a 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
obj-y += proc.o
-CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_task_mmu.o += -Wno-override-init
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 89333d9bce36..d11dc3aa458b 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -612,11 +612,18 @@ int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode,
goto mknod_out;
}
+ trace_smb3_mknod_enter(xid, tcon->ses->Suid, tcon->tid, full_path);
+
rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
full_path, mode,
device_number);
mknod_out:
+ if (rc)
+ trace_smb3_mknod_err(xid, tcon->ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mknod_done(xid, tcon->ses->Suid, tcon->tid);
+
free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index c4a3cb736881..340efce8f052 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -12,6 +12,16 @@
#include "cifs_fs_sb.h"
#include "cifsproto.h"
+/*
+ * Key for fscache inode. [!] Contents must match comparisons in cifs_find_inode().
+ */
+struct cifs_fscache_inode_key {
+
+ __le64 uniqueid; /* server inode number */
+ __le64 createtime; /* creation time on server */
+ u8 type; /* S_IFMT file type */
+} __packed;
+
static void cifs_fscache_fill_volume_coherency(
struct cifs_tcon *tcon,
struct cifs_fscache_volume_coherency_data *cd)
@@ -97,15 +107,19 @@ void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
void cifs_fscache_get_inode_cookie(struct inode *inode)
{
struct cifs_fscache_inode_coherency_data cd;
+ struct cifs_fscache_inode_key key;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ key.uniqueid = cpu_to_le64(cifsi->uniqueid);
+ key.createtime = cpu_to_le64(cifsi->createtime);
+ key.type = (inode->i_mode & S_IFMT) >> 12;
cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd);
cifsi->netfs.cache =
fscache_acquire_cookie(tcon->fscache, 0,
- &cifsi->uniqueid, sizeof(cifsi->uniqueid),
+ &key, sizeof(key),
&cd, sizeof(cd),
i_size_read(&cifsi->netfs.inode));
if (cifsi->netfs.cache)
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index d28ab0af6049..91b07ef9e25c 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1351,6 +1351,8 @@ cifs_find_inode(struct inode *inode, void *opaque)
{
struct cifs_fattr *fattr = opaque;
+ /* [!] The compared values must be the same in struct cifs_fscache_inode_key. */
+
/* don't match inode with different uniqueid */
if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
return 0;
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index f9c1fd32d0b8..5e83cb9da902 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -375,6 +375,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
TP_PROTO(unsigned int xid,
@@ -415,7 +416,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
-
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
TP_PROTO(unsigned int xid,
@@ -461,6 +462,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
/*
* For logging SMB3 Status code and Command for responses which return errors
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 2307f8037efc..118dedef8ebe 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -218,6 +218,7 @@ const struct file_operations vboxsf_reg_fops = {
.release = vboxsf_file_release,
.fsync = noop_fsync,
.splice_read = filemap_splice_read,
+ .setlease = simple_nosetlease,
};
const struct inode_operations vboxsf_reg_iops = {
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index cabe8ac4fefc..ffb1d565da39 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -151,11 +151,11 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
if (!sbi->nls) {
vbg_err("vboxsf: Count not load '%s' nls\n", nls_name);
err = -EINVAL;
- goto fail_free;
+ goto fail_destroy_idr;
}
}
- sbi->bdi_id = ida_simple_get(&vboxsf_bdi_ida, 0, 0, GFP_KERNEL);
+ sbi->bdi_id = ida_alloc(&vboxsf_bdi_ida, GFP_KERNEL);
if (sbi->bdi_id < 0) {
err = sbi->bdi_id;
goto fail_free;
@@ -221,9 +221,10 @@ fail_unmap:
vboxsf_unmap_folder(sbi->root);
fail_free:
if (sbi->bdi_id >= 0)
- ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
+ ida_free(&vboxsf_bdi_ida, sbi->bdi_id);
if (sbi->nls)
unload_nls(sbi->nls);
+fail_destroy_idr:
idr_destroy(&sbi->ino_idr);
kfree(sbi);
return err;
@@ -268,7 +269,7 @@ static void vboxsf_put_super(struct super_block *sb)
vboxsf_unmap_folder(sbi->root);
if (sbi->bdi_id >= 0)
- ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
+ ida_free(&vboxsf_bdi_ida, sbi->bdi_id);
if (sbi->nls)
unload_nls(sbi->nls);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 72ac9320e6a3..9515bbf0b54c 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -440,7 +440,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
{
const char *in;
char *out;
- size_t out_len;
size_t out_bound_len;
size_t in_bound_len;
@@ -448,7 +447,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
in_bound_len = utf8_len;
out = name;
- out_len = 0;
/* Reserve space for terminating 0 */
out_bound_len = name_bound_len - 1;
@@ -469,7 +467,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
out += nb;
out_bound_len -= nb;
- out_len += nb;
}
*out = 0;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d991eec05436..73a4b895de67 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -530,7 +530,8 @@ xfs_validate_sb_common(
}
if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
- XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
+ XFS_FSB_TO_B(mp, sbp->sb_width), 0,
+ xfs_buf_daddr(bp) == XFS_SB_DADDR, false))
return -EFSCORRUPTED;
/*
@@ -1323,8 +1324,10 @@ xfs_sb_get_secondary(
}
/*
- * sunit, swidth, sectorsize(optional with 0) should be all in bytes,
- * so users won't be confused by values in error messages.
+ * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users
+ * won't be confused by values in error messages. This function returns false
+ * if the stripe geometry is invalid and the caller is unable to repair the
+ * stripe configuration later in the mount process.
*/
bool
xfs_validate_stripe_geometry(
@@ -1332,20 +1335,21 @@ xfs_validate_stripe_geometry(
__s64 sunit,
__s64 swidth,
int sectorsize,
+ bool may_repair,
bool silent)
{
if (swidth > INT_MAX) {
if (!silent)
xfs_notice(mp,
"stripe width (%lld) is too large", swidth);
- return false;
+ goto check_override;
}
if (sunit > swidth) {
if (!silent)
xfs_notice(mp,
"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
- return false;
+ goto check_override;
}
if (sectorsize && (int)sunit % sectorsize) {
@@ -1353,21 +1357,21 @@ xfs_validate_stripe_geometry(
xfs_notice(mp,
"stripe unit (%lld) must be a multiple of the sector size (%d)",
sunit, sectorsize);
- return false;
+ goto check_override;
}
if (sunit && !swidth) {
if (!silent)
xfs_notice(mp,
"invalid stripe unit (%lld) and stripe width of 0", sunit);
- return false;
+ goto check_override;
}
if (!sunit && swidth) {
if (!silent)
xfs_notice(mp,
"invalid stripe width (%lld) and stripe unit of 0", swidth);
- return false;
+ goto check_override;
}
if (sunit && (int)swidth % (int)sunit) {
@@ -1375,9 +1379,27 @@ xfs_validate_stripe_geometry(
xfs_notice(mp,
"stripe width (%lld) must be a multiple of the stripe unit (%lld)",
swidth, sunit);
- return false;
+ goto check_override;
}
return true;
+
+check_override:
+ if (!may_repair)
+ return false;
+ /*
+ * During mount, mp->m_dalign will not be set unless the sunit mount
+ * option was set. If it was set, ignore the bad stripe alignment values
+ * and allow the validation and overwrite later in the mount process to
+ * attempt to overwrite the bad stripe alignment values with the values
+ * supplied by mount options.
+ */
+ if (!mp->m_dalign)
+ return false;
+ if (!silent)
+ xfs_notice(mp,
+"Will try to correct with specified mount options sunit (%d) and swidth (%d)",
+ BBTOB(mp->m_dalign), BBTOB(mp->m_swidth));
+ return true;
}
/*
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 2e8e8d63d4eb..37b1ed1bc209 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -35,8 +35,9 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **bpp);
-extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
- __s64 sunit, __s64 swidth, int sectorsize, bool silent);
+bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
+ __s64 sunit, __s64 swidth, int sectorsize, bool may_repair,
+ bool silent);
uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index abff79a77c72..47a20cf5205f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1044,9 +1044,7 @@ xchk_irele(
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- if (current->journal_info != NULL) {
- ASSERT(current->journal_info == sc->tp);
-
+ if (sc->tp) {
/*
* If we are in a transaction, we /cannot/ drop the inode
* ourselves, because the VFS will trigger writeback, which
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1698507d1ac7..3f428620ebf2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -503,13 +503,6 @@ xfs_vm_writepages(
{
struct xfs_writepage_ctx wpc = { };
- /*
- * Writing back data in a transaction context can result in recursive
- * transactions. This is bad, so issue a warning and get out of here.
- */
- if (WARN_ON_ONCE(current->journal_info))
- return 0;
-
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e64265bc0b33..74f1812b03cb 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2039,8 +2039,10 @@ xfs_inodegc_want_queue_work(
* - Memory shrinkers queued the inactivation worker and it hasn't finished.
* - The queue depth exceeds the maximum allowable percpu backlog.
*
- * Note: If the current thread is running a transaction, we don't ever want to
- * wait for other transactions because that could introduce a deadlock.
+ * Note: If we are in a NOFS context here (e.g. current thread is running a
+ * transaction) the we don't want to block here as inodegc progress may require
+ * filesystem resources we hold to make progress and that could result in a
+ * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
*/
static inline bool
xfs_inodegc_want_flush_work(
@@ -2048,7 +2050,7 @@ xfs_inodegc_want_flush_work(
unsigned int items,
unsigned int shrinker_hits)
{
- if (current->journal_info)
+ if (current->flags & PF_MEMALLOC_NOFS)
return false;
if (shrinker_hits > 0)
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 3f7e3a09a49f..1636663707dc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -268,19 +268,14 @@ static inline void
xfs_trans_set_context(
struct xfs_trans *tp)
{
- ASSERT(current->journal_info == NULL);
tp->t_pflags = memalloc_nofs_save();
- current->journal_info = tp;
}
static inline void
xfs_trans_clear_context(
struct xfs_trans *tp)
{
- if (current->journal_info == tp) {
- memalloc_nofs_restore(tp->t_pflags);
- current->journal_info = NULL;
- }
+ memalloc_nofs_restore(tp->t_pflags);
}
static inline void
@@ -288,10 +283,8 @@ xfs_trans_switch_context(
struct xfs_trans *old_tp,
struct xfs_trans *new_tp)
{
- ASSERT(current->journal_info == old_tp);
new_tp->t_pflags = old_tp->t_pflags;
old_tp->t_pflags = 0;
- current->journal_info = new_tp;
}
#endif /* __XFS_TRANS_H__ */