diff options
Diffstat (limited to 'fs/bcachefs/journal_reclaim.c')
-rw-r--r-- | fs/bcachefs/journal_reclaim.c | 396 |
1 files changed, 245 insertions, 151 deletions
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 98345dcd1e67..3b5b646859cb 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -2,15 +2,213 @@ #include "bcachefs.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +/* Free space calculations: */ + +unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + + /* + * Allocator startup needs some journal space before we can do journal + * replay: + */ + if (available && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) + available--; + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (available && + journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) + --available; + + return available; +} + +void bch2_journal_space_available(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned sectors_next_entry = UINT_MAX; + unsigned sectors_total = UINT_MAX; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs = 0; + unsigned unwritten_sectors = j->reservations.prev_buf_unwritten + ? journal_prev_buf(j)->sectors + : 0; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_this_device, sectors_this_device; + + if (!ja->nr) + continue; + + nr_online++; + + buckets_this_device = bch2_journal_dev_buckets_available(j, ja); + sectors_this_device = ja->sectors_free; + + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + if (unwritten_sectors >= sectors_this_device) { + if (!buckets_this_device) + continue; + + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + sectors_this_device -= unwritten_sectors; + + if (sectors_this_device < ca->mi.bucket_size && + buckets_this_device) { + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + if (!sectors_this_device) + continue; + + sectors_next_entry = min(sectors_next_entry, + sectors_this_device); + + sectors_total = min(sectors_total, + buckets_this_device * ca->mi.bucket_size + + sectors_this_device); + + max_entry_size = min_t(unsigned, max_entry_size, + ca->mi.bucket_size); + + nr_devs++; + } + rcu_read_unlock(); + + if (nr_online < c->opts.metadata_replicas_required) { + ret = -EROFS; + sectors_next_entry = 0; + } else if (!sectors_next_entry || + nr_devs < min_t(unsigned, nr_online, + c->opts.metadata_replicas)) { + ret = -ENOSPC; + sectors_next_entry = 0; + } else if (!fifo_free(&j->pin)) { + ret = -ENOSPC; + sectors_next_entry = 0; + } + + j->cur_entry_sectors = sectors_next_entry; + j->cur_entry_error = ret; + + if (!ret) + journal_wake(j); +} + +/* Discards - last part of journal reclaim: */ + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->nr && + ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk; + spin_unlock(&j->lock); + + return ret; +} + +/* + * Advance ja->last_idx as long as it points to buckets that are no longer + * dirty, issuing discards if necessary: + */ +static void journal_do_discards(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned iter; + + mutex_lock(&j->reclaim_lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { + if (ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->last_idx]), + ca->mi.bucket_size, GFP_NOIO); + + spin_lock(&j->lock); + ja->last_idx = (ja->last_idx + 1) % ja->nr; + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + } + } + + mutex_unlock(&j->reclaim_lock); +} + /* * Journal entry pinning - machinery for holding a reference on a given journal * entry, holding it open to ensure it gets replayed during recovery: */ +static void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + bch2_journal_space_available(j); +} + +void bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) { + spin_lock(&j->lock); + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + } +} + static inline void __journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, @@ -25,10 +223,7 @@ static inline void __journal_pin_add(struct journal *j, pin->seq = seq; pin->flush = flush_fn; - if (flush_fn) - list_add(&pin->list, &pin_list->list); - else - INIT_LIST_HEAD(&pin->list); + list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); /* * If the journal is currently full, we might want to call flush_fn @@ -130,86 +325,53 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) * data off of a specific device: */ -/** - * bch2_journal_reclaim_fast - do the fast part of journal reclaim - * - * Called from IO submission context, does not block. Cleans up after btree - * write completions by advancing the journal pin and each cache's last_idx, - * kicking off discards and background reclaim as necessary. - */ -void bch2_journal_reclaim_fast(struct journal *j) -{ - struct journal_entry_pin_list temp; - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!fifo_empty(&j->pin) && - !atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!fifo_pop(&j->pin, temp)); - popped = true; - } - - if (popped) - journal_wake(j); -} - -static void journal_pin_mark_flushing(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - lockdep_assert_held(&j->reclaim_lock); - - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; -} - -static void journal_pin_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - pin->flush(j, pin, seq); - - BUG_ON(j->flush_in_progress != pin); - j->flush_in_progress = NULL; - wake_up(&j->pin_flush_wait); -} - static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - /* no need to iterate over empty fifo entries: */ - bch2_journal_reclaim_fast(j); + spin_lock(&j->lock); + + BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count)); fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) - if (*seq > seq_to_flush || + if (*seq > max_seq || (ret = list_first_entry_or_null(&pin_list->list, struct journal_entry_pin, list))) break; + if (ret) { + list_move(&ret->list, &pin_list->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = ret; + j->last_flushed = jiffies; + } + + spin_unlock(&j->lock); + return ret; } -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static void journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) { - bool ret; + struct journal_entry_pin *pin; + u64 seq; - spin_lock(&j->lock); - ret = ja->nr && - (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); - spin_unlock(&j->lock); + lockdep_assert_held(&j->reclaim_lock); - return ret; + while ((pin = journal_get_next_pin(j, min_nr + ? U64_MAX : seq_to_flush, &seq))) { + if (min_nr) + min_nr--; + + pin->flush(j, pin, seq); + + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); + } } /** @@ -236,104 +398,44 @@ void bch2_journal_reclaim_work(struct work_struct *work) struct bch_fs, journal.reclaim_work); struct journal *j = &c->journal; struct bch_dev *ca; - struct journal_entry_pin *pin; - u64 seq, seq_to_flush = 0; - unsigned iter, bucket_to_flush; - unsigned long next_flush; - bool reclaim_lock_held = false, need_flush; + unsigned iter, bucket_to_flush, min_nr = 0; + u64 seq_to_flush = 0; + + journal_do_discards(j); + + mutex_lock(&j->reclaim_lock); + spin_lock(&j->lock); - /* - * Advance last_idx to point to the oldest journal entry containing - * btree node updates that have not yet been written out - */ for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; if (!ja->nr) continue; - while (should_discard_bucket(j, ja)) { - if (!reclaim_lock_held) { - /* - * ugh: - * might be called from __journal_res_get() - * under wait_event() - have to go back to - * TASK_RUNNING before doing something that - * would block, but only if we're doing work: - */ - __set_current_state(TASK_RUNNING); - - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - /* recheck under reclaim_lock: */ - continue; - } - if (ca->mi.discard && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->last_idx]), - ca->mi.bucket_size, GFP_NOIO); - - spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; - spin_unlock(&j->lock); - - journal_wake(j); - } - - /* - * Write out enough btree nodes to free up 50% journal - * buckets - */ - spin_lock(&j->lock); + /* Try to keep the journal at most half full: */ bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; seq_to_flush = max_t(u64, seq_to_flush, ja->bucket_seq[bucket_to_flush]); - spin_unlock(&j->lock); } /* Also flush if the pin fifo is more than half full */ - spin_lock(&j->lock); seq_to_flush = max_t(s64, seq_to_flush, (s64) journal_cur_seq(j) - (j->pin.size >> 1)); + spin_unlock(&j->lock); /* * If it's been longer than j->reclaim_delay_ms since we last flushed, * make sure to flush at least one journal pin: */ - next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - need_flush = time_after(jiffies, next_flush); - - while ((pin = journal_get_next_pin(j, need_flush - ? U64_MAX - : seq_to_flush, &seq))) { - if (!reclaim_lock_held) { - spin_unlock(&j->lock); - __set_current_state(TASK_RUNNING); - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - spin_lock(&j->lock); - continue; - } + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; - journal_pin_mark_flushing(j, pin, seq); - spin_unlock(&j->lock); - - journal_pin_flush(j, pin, seq); - - need_flush = false; - j->last_flushed = jiffies; + journal_flush_pins(j, seq_to_flush, min_nr); - spin_lock(&j->lock); - } - - spin_unlock(&j->lock); - - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); + mutex_unlock(&j->reclaim_lock); if (!test_bit(BCH_FS_RO, &c->flags)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, @@ -342,8 +444,6 @@ void bch2_journal_reclaim_work(struct work_struct *work) static int journal_flush_done(struct journal *j, u64 seq_to_flush) { - struct journal_entry_pin *pin; - u64 pin_seq; int ret; ret = bch2_journal_error(j); @@ -351,16 +451,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush) return ret; mutex_lock(&j->reclaim_lock); - spin_lock(&j->lock); - - while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) { - journal_pin_mark_flushing(j, pin, pin_seq); - spin_unlock(&j->lock); - journal_pin_flush(j, pin, pin_seq); + journal_flush_pins(j, seq_to_flush, 0); - spin_lock(&j->lock); - } + spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers |