1 files changed, 128 insertions, 12 deletions
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index d0e92d948002..3f56b584f8ec 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -6,6 +6,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
+#include "disk_accounting.h"
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
@@ -134,7 +135,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
 
 static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
 			       struct btree_write_buffered_key *wb,
-			       bool *write_locked, size_t *fast)
+			       bool *write_locked,
+			       bool *accounting_accumulated,
+			       size_t *fast)
 {
 	struct btree_path *path;
 	int ret;
@@ -147,6 +150,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
 	if (ret)
 		return ret;
 
+	if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+		struct bkey u;
+		struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+		if (k.k->type == KEY_TYPE_accounting)
+			bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+						   bkey_s_c_to_accounting(k));
+	}
+	*accounting_accumulated = true;
+
 	/*
 	 * We can't clone a path that has write locks: unshare it now, before
 	 * set_pos and traverse():
@@ -259,8 +272,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 	struct journal *j = &c->journal;
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
 	struct btree_iter iter = { NULL };
-	size_t skipped = 0, fast = 0, slowpath = 0;
+	size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
 	bool write_locked = false;
+	bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
 	int ret = 0;
 
 	bch2_trans_unlock(trans);
@@ -301,11 +315,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 
 		BUG_ON(!k->journal_seq);
 
+		if (!accounting_replay_done &&
+		    k->k.k.type == KEY_TYPE_accounting) {
+			slowpath++;
+			continue;
+		}
+
 		if (i + 1 < &darray_top(wb->sorted) &&
 		    wb_key_eq(i, i + 1)) {
 			struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
 
-			skipped++;
+			if (k->k.k.type == KEY_TYPE_accounting &&
+			    n->k.k.type == KEY_TYPE_accounting)
+				bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+							   bkey_i_to_s_c_accounting(&k->k));
+
+			overwritten++;
 			n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
 			k->journal_seq = 0;
 			continue;
@@ -340,13 +365,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 		bch2_btree_iter_set_pos(&iter, k->k.k.p);
 		btree_iter_path(trans, &iter)->preserve = false;
 
+		bool accounting_accumulated = false;
 		do {
 			if (race_fault()) {
 				ret = -BCH_ERR_journal_reclaim_would_deadlock;
 				break;
 			}
 
-			ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
+			ret = wb_flush_one(trans, &iter, k, &write_locked,
+					   &accounting_accumulated, &fast);
 			if (!write_locked)
 				bch2_trans_begin(trans);
 		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -387,8 +414,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 			if (!i->journal_seq)
 				continue;
 
-			bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
-						bch2_btree_write_buffer_journal_flush);
+			if (!accounting_replay_done &&
+			    i->k.k.type == KEY_TYPE_accounting) {
+				could_not_insert++;
+				continue;
+			}
+
+			if (!could_not_insert)
+				bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+							bch2_btree_write_buffer_journal_flush);
 
 			bch2_trans_begin(trans);
 
@@ -401,13 +435,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 					btree_write_buffered_insert(trans, i));
 			if (ret)
 				goto err;
+
+			i->journal_seq = 0;
+		}
+
+		/*
+		 * If journal replay hasn't finished with accounting keys we
+		 * can't flush accounting keys at all - condense them and leave
+		 * them for next time.
+		 *
+		 * Q: Can the write buffer overflow?
+		 * A Shouldn't be any actual risk. It's just new accounting
+		 * updates that the write buffer can't flush, and those are only
+		 * going to be generated by interior btree node updates as
+		 * journal replay has to split/rewrite nodes to make room for
+		 * its updates.
+		 *
+		 * And for those new acounting updates, updates to the same
+		 * counters get accumulated as they're flushed from the journal
+		 * to the write buffer - see the patch for eytzingcer tree
+		 * accumulated. So we could only overflow if the number of
+		 * distinct counters touched somehow was very large.
+		 */
+		if (could_not_insert) {
+			struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+			darray_for_each(wb->flushing.keys, i)
+				if (i->journal_seq)
+					*dst++ = *i;
+			wb->flushing.keys.nr = dst - wb->flushing.keys.data;
 		}
 	}
 err:
+	if (ret || !could_not_insert) {
+		bch2_journal_pin_drop(j, &wb->flushing.pin);
+		wb->flushing.keys.nr = 0;
+	}
+
 	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
-	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
-	bch2_journal_pin_drop(j, &wb->flushing.pin);
-	wb->flushing.keys.nr = 0;
+	trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
 	return ret;
 }
 
@@ -494,7 +560,7 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
 	return ret;
 }
 
-/**
+/*
  * In check and repair code, when checking references to write buffer btrees we
  * need to issue a flush before we have a definitive error: this issues a flush
  * if this is a key we haven't yet checked.
@@ -544,6 +610,29 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
 	bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
 }
 
+static void wb_accounting_sort(struct btree_write_buffer *wb)
+{
+	eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
+			sizeof(wb->accounting.data[0]),
+			wb_key_cmp, NULL);
+}
+
+int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
+				       struct bkey_i_accounting *k)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	struct btree_write_buffered_key new = { .btree = btree };
+
+	bkey_copy(&new.k, &k->k_i);
+
+	int ret = darray_push(&wb->accounting, new);
+	if (ret)
+		return ret;
+
+	wb_accounting_sort(wb);
+	return 0;
+}
+
 int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
 			     struct journal_keys_to_wb *dst,
 			     enum btree_id btree, struct bkey_i *k)
@@ -613,11 +702,35 @@ void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_ke
 
 	bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
 			     bch2_btree_write_buffer_journal_flush);
+
+	darray_for_each(wb->accounting, i)
+		memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
 }
 
-void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
 {
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	unsigned live_accounting_keys = 0;
+	int ret = 0;
+
+	darray_for_each(wb->accounting, i)
+		if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
+			i->journal_seq = dst->seq;
+			live_accounting_keys++;
+			ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
+			if (ret)
+				break;
+		}
+
+	if (live_accounting_keys * 2 < wb->accounting.nr) {
+		struct btree_write_buffered_key *dst = wb->accounting.data;
+
+		darray_for_each(wb->accounting, src)
+			if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
+				*dst++ = *src;
+		wb->accounting.nr = dst - wb->accounting.data;
+		wb_accounting_sort(wb);
+	}
 
 	if (!dst->wb->keys.nr)
 		bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
@@ -630,6 +743,8 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
 	if (dst->wb == &wb->flushing)
 		mutex_unlock(&wb->flushing.lock);
 	mutex_unlock(&wb->inc.lock);
+
+	return ret;
 }
 
 static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
@@ -653,7 +768,7 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
 	buf->need_flush_to_write_buffer = false;
 	spin_unlock(&c->journal.lock);
 out:
-	bch2_journal_keys_to_write_buffer_end(c, &dst);
+	ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
 	return ret;
 }
 
@@ -685,6 +800,7 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
 	BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
 	       !bch2_journal_error(&c->journal));
 
+	darray_exit(&wb->accounting);
 	darray_exit(&wb->sorted);
 	darray_exit(&wb->flushing.keys);
 	darray_exit(&wb->inc.keys);