Diffstat (limited to 'fs')
51 files changed, 541 insertions, 436 deletions
diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 15bb7989c387..3acf5e050072 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -512,7 +512,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); vnode->volume = as->volume; - vnode->fid.vid = as->volume->vid, + vnode->fid.vid = as->volume->vid; vnode->fid.vnode = 1; vnode->fid.unique = 1; inode->i_ino = 1; @@ -545,7 +545,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) BUG_ON(!(inode->i_state & I_NEW)); vnode = AFS_FS_I(inode); - vnode->cb_v_check = atomic_read(&as->volume->cb_v_break), + vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8dec2c6cbb7e..1de9fac3bcf4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -29,7 +29,7 @@ #include <linux/sched/task.h> #include <linux/sort.h> -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); +static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); /* Persistent alloc info: */ @@ -893,12 +893,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (statechange(a->data_type == BCH_DATA_need_discard) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(c, new.k->p); + bch2_discard_one_bucket_fast(ca, new.k->p.offset); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); @@ -1636,34 +1636,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } -static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { int ret; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } - ret = darray_push(&c->discard_buckets_in_flight, bucket); + ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); out: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); return ret; } -static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - darray_remove_item(&c->discard_buckets_in_flight, i); + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + BUG_ON(!i->in_progress); + darray_remove_item(&ca->discard_buckets_in_flight, i); goto found; } BUG(); found: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); } struct discard_buckets_state { @@ -1671,26 +1675,11 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - struct bch_dev 
*ca; u64 need_journal_commit_this_dev; }; -static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) -{ - if (s->ca == ca) - return; - - if (s->ca && s->need_journal_commit_this_dev > - bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) - bch2_journal_flush_async(&c->journal, NULL); - - if (s->ca) - percpu_ref_put(&s->ca->io_ref); - s->ca = ca; - s->need_journal_commit_this_dev = 0; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) @@ -1704,16 +1693,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, bool discard_locked = false; int ret = 0; - struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode - ? s->ca - : bch2_dev_get_ioref(c, pos.inode, WRITE); - if (!ca) { - bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); - return 0; - } - - discard_buckets_next_dev(c, s, ca); - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; @@ -1773,7 +1752,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; discard_locked = true; @@ -1811,7 +1790,7 @@ write: s->discarded++; out: if (discard_locked) - discard_in_flight_remove(c, iter.pos); + discard_in_flight_remove(ca, iter.pos.offset); s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1820,7 +1799,8 @@ out: static void bch2_do_discards_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1831,23 +1811,41 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - - discard_buckets_next_dev(c, &s, NULL); + for_each_btree_key_upto(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + POS(ca->dev_idx, U64_MAX), 0, k, + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_discard); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_discards(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_discards(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(c->write_ref_wq, &c->discard_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + for_each_member_device(c, ca) + bch2_dev_do_discards(ca); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) @@ -1876,68 +1874,69 @@ err: static void bch2_do_discards_fast_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + struct bch_dev *ca = container_of(work, 
struct bch_dev, discard_fast_work); + struct bch_fs *c = ca->fs; while (1) { bool got_bucket = false; - struct bpos bucket; - struct bch_dev *ca; + u64 bucket; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) { - if (i->snapshot) + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) continue; - ca = bch2_dev_get_ioref(c, i->inode, WRITE); - if (!ca) { - darray_remove_item(&c->discard_buckets_in_flight, i); - continue; - } - got_bucket = true; - bucket = *i; - i->snapshot = true; + bucket = i->bucket; + i->in_progress = true; break; } - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, - bucket.offset * ca->mi.bucket_size, + bucket_to_sector(ca, bucket), ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, bucket)); + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); bch_err_fn(c, ret); - percpu_ref_put(&ca->io_ref); - discard_in_flight_remove(c, bucket); + discard_in_flight_remove(ca, bucket); if (ret) break; } bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + percpu_ref_put(&ca->io_ref); } -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode); - bool dead = !ca || percpu_ref_is_dying(&ca->io_ref); - rcu_read_unlock(); + struct bch_fs *c = ca->fs; + + if (discard_in_flight_add(ca, bucket, false)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; - if (!dead && - !discard_in_flight_add(c, bucket) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && - !queue_work(c->write_ref_wq, &c->discard_fast_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +put_ioref: + percpu_ref_put(&ca->io_ref); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -2038,7 +2037,8 @@ again: static void bch2_do_invalidates_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; @@ -2046,52 +2046,63 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) goto err; - for_each_member_device(c, ca) { - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - struct btree_iter iter; - bool wrapped = false; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, - ((bch2_current_io_time(c, READ) + U32_MAX) & - LRU_TIME_MAX)), 0); - - while (true) { - bch2_trans_begin(trans); - - struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (!k.k) - break; + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct 
btree_iter iter; + bool wrapped = false; - ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); - if (ret) - break; + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + while (true) { + bch2_trans_begin(trans); - if (ret < 0) { - bch2_dev_put(ca); + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) break; - } + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_invalidates(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_invalidates(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(c->write_ref_wq, &c->invalidate_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + for_each_member_device(c, ca) + bch2_dev_do_invalidates(ca); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, @@ -2407,16 +2418,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_fs_allocator_background_exit(struct bch_fs *c) +void bch2_dev_allocator_background_exit(struct bch_dev *ca) +{ + darray_exit(&ca->discard_buckets_in_flight); +} + +void bch2_dev_allocator_background_init(struct bch_dev *ca) { - darray_exit(&c->discard_buckets_in_flight); + mutex_init(&ca->discard_buckets_in_flight_lock); + INIT_WORK(&ca->discard_work, bch2_do_discards_work); + INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); + INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - mutex_init(&c->discard_buckets_in_flight_lock); - INIT_WORK(&c->discard_work, bch2_do_discards_work); - INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c3cc3c5ba5b6..ba2c5557a3f0 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -275,6 +275,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, @@ -289,6 +290,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } +void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) @@ -312,7 +314,9 @@ u64 
bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_fs_allocator_background_exit(struct bch_fs *); +void bch2_dev_allocator_background_exit(struct bch_dev *); +void bch2_dev_allocator_background_init(struct bch_dev *); + void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 927a5f300b30..9d3d64746a5b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -621,13 +621,13 @@ again: avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) - bch2_do_discards(c); + bch2_dev_do_discards(ca); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (!avail) { if (cl && !waiting) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a6b83ecab7ce..1106fec6e155 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -493,6 +493,11 @@ struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct discard_in_flight { + bool in_progress:1; + u64 bucket:63; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -554,6 +559,12 @@ struct bch_dev { size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct discard_in_flight) discard_buckets_in_flight; + struct work_struct discard_fast_work; + atomic64_t rebalance_work; struct journal_device journal; @@ -915,11 +926,6 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct bpos) discard_buckets_in_flight; - struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct work_struct gc_gens_work; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3a1419d17888..0ed9e6574fcd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3130,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); - closure_init_stack(&trans->ref); seqmutex_lock(&c->btree_trans_lock); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { @@ -3150,18 +3149,12 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) BUG_ON(pos_task && pid == pos_task->pid && pos->locked); - - if (pos_task && pid < pos_task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; - } } } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: + + list_add(&trans->list, &c->btree_trans_list); seqmutex_unlock(&c->btree_trans_lock); got_trans: - trans->ref.closure_get_happened = false; trans->c = c; trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; @@ -3200,6 +3193,8 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + + closure_init_stack_release(&trans->ref); return trans; } @@ -3257,10 +3252,10 @@ void bch2_trans_put(struct btree_trans *trans) bch2_journal_keys_put(c); /* - * trans->ref protects 
trans->locking_wait.task, btree_paths arary; used + * trans->ref protects trans->locking_wait.task, btree_paths array; used * by cycle detector */ - closure_sync(&trans->ref); + closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; unsigned long *paths_allocated = trans->paths_allocated; @@ -3385,8 +3380,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 9e54323f0f5f..6d82e1165adc 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -216,7 +216,8 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(NULL, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; @@ -319,7 +320,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return ret; ret = bch2_dev_add(c, path); - kfree(path); + if (!IS_ERR(path)) + kfree(path); return ret; } @@ -850,7 +852,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(c, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 51cbf3928361..f0d4727c4dc2 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); + +static void list_sort(struct list_head *head, list_cmp_fn cmp) +{ + struct list_head *pos; + + list_for_each(pos, head) + while (!list_is_last(pos, head) && + cmp(pos, pos->next) > 0) { + struct list_head *pos2, *next = pos->next; + + list_del(next); + list_for_each(pos2, head) + if (cmp(next, pos2) < 0) + goto pos_found; + BUG(); +pos_found: + list_add_tail(next, pos2); + } +} + +static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) +{ + return cmp_int(l, r); +} + static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -575,41 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans < i->iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + i->iter = (ulong) trans; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto unlocked; - } + if (!closure_get_not_zero(&trans->ref)) + continue; + + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); 
printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = task->pid; - closure_put(&trans->ref); + ret = flush_buf(i); + if (ret) + goto unlocked; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } @@ -804,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = { .read = btree_transaction_stats_read, }; -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +/* walk btree transactions until we find a deadlock and print it */ +static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; struct btree_trans *trans; - ssize_t ret = 0; - u32 seq; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (i->iter) - goto out; + pid_t iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { struct task_struct *task = READ_ONCE(trans->locking_wait.task); - if (!task || task->pid <= i->iter) + if (!task || task->pid <= iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + iter = task->pid; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto out; - } + if (!closure_get_not_zero(&trans->ref)) + continue; - bch2_check_for_deadlock(trans, &i->buf); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); - i->iter = task->pid; + bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); + if (found) + return; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); -out: +} + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + btree_deadlock_to_text(&i->buf, c); + i->iter++; + } + if (i->buf.allocation_failure) ret = -ENOMEM; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dac2f498ae8b..13669dd0e375 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1521,6 +1521,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 struct journal_entry_pin *pin; spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + *seq = max(*seq, j->pin.front); if (*seq >= j->pin.back) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 492426c8d869..db24ce21b2ac 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1677,6 +1677,13 @@ static CLOSURE_CALLBACK(journal_write_done) mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + /* + * We don't typically trigger journal writes from her - the next journal + * write will be triggered immediately after the previous one is + * allocated, in bch2_journal_write() - but the journal write error path + * is special: + */ + bch2_journal_do_writes(j); spin_unlock(&j->lock); } diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index ed4846709611..1f25c111c54c 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) BUG_ON(nr != t->nr); unsigned i; - for (src = bl->start, i = eytzinger0_first(t->nr); + for (src = bl->start, i = t->nr == 0 ? 
0 : eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index bda33e59e226..c1270d790e43 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -110,19 +110,25 @@ out: void bch2_sb_errors_from_cpu(struct bch_fs *c) { bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst = - bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); + struct bch_sb_field_errors *dst; unsigned i; + mutex_lock(&c->fsck_error_counts_lock); + + dst = bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + if (!dst) - return; + goto err; for (i = 0; i < src->nr; i++) { SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); } + +err: + mutex_unlock(&c->fsck_error_counts_lock); } static int bch2_sb_errors_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h index c1860d8163fb..c4b3d8d3f414 100644 --- a/fs/bcachefs/seqmutex.h +++ b/fs/bcachefs/seqmutex.h @@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock) static inline void seqmutex_lock(struct seqmutex *lock) { mutex_lock(&lock->lock); -} - -static inline void seqmutex_unlock(struct seqmutex *lock) -{ lock->seq++; - mutex_unlock(&lock->lock); } -static inline u32 seqmutex_seq(struct seqmutex *lock) +static inline u32 seqmutex_unlock(struct seqmutex *lock) { - return lock->seq; + u32 seq = lock->seq; + mutex_unlock(&lock->lock); + return seq; } static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4ef98e696673..24023d6a9698 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); + if (unlikely(new_bytes > INT_MAX)) + return NULL; + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9083df82073a..fb906467201e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -536,7 +536,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); - bch2_fs_allocator_background_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -1195,6 +1194,7 @@ static void bch2_dev_free(struct bch_dev *ca) kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); @@ -1317,6 +1317,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + bch2_dev_allocator_background_init(ca); + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || @@ -1529,6 +1531,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); + bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } @@ 
-1540,6 +1543,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1a66be33bb04..60066822b532 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1924,8 +1924,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) next: if (ret) { /* Refcount held by the reclaim_bgs list after splice. */ - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); + spin_lock(&fs_info->unused_bgs_lock); + /* + * This block group might be added to the unused list + * during the above process. Move it back to the + * reclaim list otherwise. + */ + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &retry_list); + } + spin_unlock(&fs_info->unused_bgs_lock); } btrfs_put_block_group(bg); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3ab8dea5036b..dabc3d0793cf 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2697,7 +2697,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; - bool initial = (size == block_group->length); + bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fc2a7ea26354..bf0f81d59b6b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1351,7 +1351,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info) int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { - struct btrfs_root *quota_root; + struct btrfs_root *quota_root = NULL; struct btrfs_trans_handle *trans = NULL; int ret = 0; @@ -1449,9 +1449,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); - btrfs_put_root(quota_root); out: + btrfs_put_root(quota_root); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index afd6932f5e89..d7caa3732f07 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1688,20 +1688,24 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; - io_stripe.is_scrub = true; + stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; + /* + * For RST cases, we need to manually split the bbio to + * follow the RST boundary. 
+ */ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &stripe_len, &bioc, &io_stripe, - &mirror); + &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); - if (err) { - btrfs_bio_end_io(bbio, - errno_to_blk_status(err)); - return; + if (err < 0) { + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + continue; } + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 26a2e5aa08e9..0bce1d45e252 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ +static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +{ + unsigned int nofs_flag; + struct inode *inode; + + /* + * We're holding a transaction handle whether we are logging or + * replaying a log tree, so we must make sure NOFS semantics apply + * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL + * to allocate an inode, which can recurse back into the filesystem and + * attempt a transaction commit, resulting in a deadlock. + */ + nofs_flag = memalloc_nofs_save(); + inode = btrfs_iget(root->fs_info->sb, objectid, root); + memalloc_nofs_restore(nofs_flag); + + return inode; +} + /* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people @@ -600,7 +619,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, { struct inode *inode; - inode = btrfs_iget(root->fs_info->sb, objectid, root); + inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -5438,7 +5457,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = start_inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5499,7 +5517,7 @@ again: continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + di_inode = btrfs_iget_logging(di_key.objectid, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto out; @@ -5559,7 +5577,7 @@ again: btrfs_add_delayed_iput(curr_inode); curr_inode = NULL; - vfs_inode = btrfs_iget(fs_info->sb, ino, root); + vfs_inode = btrfs_iget_logging(ino, root); if (IS_ERR(vfs_inode)) { ret = PTR_ERR(vfs_inode); break; @@ -5654,7 +5672,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - inode = btrfs_iget(root->fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was deleted in * the current transaction then we either: @@ -5755,7 +5773,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; /* @@ -5786,7 +5803,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, list_del(&curr->list); kfree(curr); - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its 
parent @@ -5797,7 +5814,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (ret != -ENOENT) break; - inode = btrfs_iget(fs_info->sb, parent, root); + inode = btrfs_iget_logging(parent, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; @@ -6319,7 +6336,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { const bool orig_log_new_dentries = ctx->log_new_dentries; - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_item *item; int ret = 0; @@ -6345,7 +6361,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (key.type == BTRFS_ROOT_ITEM_KEY) continue; - di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + di_inode = btrfs_iget_logging(key.objectid, inode->root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); break; @@ -6729,7 +6745,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -6794,8 +6809,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, - root); + dir_inode = btrfs_iget_logging(inode_key.objectid, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent @@ -6857,7 +6871,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); while (true) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; int slot; struct btrfs_key search_key; @@ -6872,7 +6885,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, search_key.objectid = found_key.offset; search_key.type = BTRFS_INODE_ITEM_KEY; search_key.offset = 0; - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/dcache.c b/fs/dcache.c index 407095188f83..d58dc9e58f3b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3029,28 +3029,25 @@ EXPORT_SYMBOL(d_splice_alias); bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { - bool result; + bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; - do { - /* for restarting inner loop in case of seq retry */ - seq = read_seqbegin(&rename_lock); - /* - * Need rcu_readlock to protect against the d_parent trashing - * due to d_move - */ - rcu_read_lock(); - if (d_ancestor(old_dentry, new_dentry)) - result = true; - else - result = false; - rcu_read_unlock(); - } while (read_seqretry(&rename_lock, seq)); - - return result; + /* Access d_parent under rcu as d_move() may change it. */ + rcu_read_lock(); + seq = read_seqbegin(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + /* Try lockless once... */ + if (read_seqretry(&rename_lock, seq)) { + /* ...else acquire lock for progress even on deep chains. 
*/ + read_seqlock_excl(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + read_sequnlock_excl(&rename_lock); + } + rcu_read_unlock(); + return subdir; } EXPORT_SYMBOL(is_subdir); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c93bd24d2771..1b91d9513013 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -343,7 +343,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); ret = strscpy(sbi->volume_name, dsb->volume_name, sizeof(dsb->volume_name)); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9b248ee5fef2..74d3d7bffcf3 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -711,6 +711,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, err = z_erofs_do_map_blocks(inode, map, flags); out: + if (err) + map->m_llen = 0; trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); return err; } diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 036024bce9f7..b80f612867c2 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -148,7 +148,7 @@ int __init z_erofs_gbuf_init(void) void z_erofs_gbuf_exit(void) { - int i; + int i, j; for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) { struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i]; @@ -161,9 +161,9 @@ void z_erofs_gbuf_exit(void) if (!gbuf->pages) continue; - for (i = 0; i < gbuf->nrpages; ++i) - if (gbuf->pages[i]) - put_page(gbuf->pages[i]); + for (j = 0; j < gbuf->nrpages; ++j) + if (gbuf->pages[j]) + put_page(gbuf->pages[j]); kfree(gbuf->pages); gbuf->pages = NULL; } diff --git a/fs/locks.c b/fs/locks.c index 90c8746874de..c360d1992d21 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2448,8 +2448,9 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by releasing the - * lock that was just acquired. There is no need to do that when we're + * Detect close/fcntl races and recover by zapping all POSIX locks + * associated with this file and our files_struct, just like on + * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. 
*/ if (!error && file_lock->c.flc_type != F_UNLCK && @@ -2464,9 +2465,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->c.flc_type = F_UNLCK; - error = do_lock_file_wait(filp, cmd, file_lock); - WARN_ON_ONCE(error); + locks_remove_posix(filp, files); error = -EBADF; } } diff --git a/fs/namei.c b/fs/namei.c index 37fb0a8aa09a..1e05a0f3f04d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3572,8 +3572,12 @@ static const char *open_last_lookups(struct nameidata *nd, else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); - if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) - fsnotify_create(dir->d_inode, dentry); + if (!IS_ERR(dentry)) { + if (file->f_mode & FMODE_CREATED) + fsnotify_create(dir->d_inode, dentry); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); + } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else @@ -3700,6 +3704,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 07bc1fd43530..d583af7a2209 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -523,6 +523,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct netfs_group *group; struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_RETRY; @@ -534,6 +535,11 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr if (folio_lock_killable(folio) < 0) goto out; + if (folio->mapping != mapping) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } if (folio_wait_writeback_killable(folio)) { ret = VM_FAULT_LOCKED; @@ -549,9 +555,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr group = netfs_folio_group(folio); if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { folio_unlock(folio); - err = filemap_fdatawait_range(inode->i_mapping, - folio_pos(folio), - folio_pos(folio) + folio_size(folio)); + err = filemap_fdatawrite_range(mapping, + folio_pos(folio), + folio_pos(folio) + folio_size(folio)); switch (err) { case 0: ret = VM_FAULT_RETRY; diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index e14cd53ac9fd..88f2adfab75e 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -92,8 +92,9 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); if (async) wreq->iocb = iocb; + wreq->len = iov_iter_count(&wreq->io_iter); wreq->cleanup = netfs_cleanup_dio_write; - ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter)); + ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { _debug("begin = %zd", ret); goto out; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 95e281a8af78..acd9ca14e264 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -63,15 +63,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request 
*rreq) {} /* * misc.c */ -#define NETFS_FLAG_PUT_MARK BIT(0) -#define NETFS_FLAG_PAGECACHE_MARK BIT(1) -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask); -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask); -void netfs_clear_buffer(struct xarray *buffer); /* * objects.c diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index bc1fc54fb724..83e644bd518f 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,87 +8,6 @@ #include <linux/swap.h> #include "internal.h" -/* - * Attach a folio to the buffer and maybe set marks on it to say that we need - * to put the folio later and twiddle the pagecache flags. - */ -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask) -{ - XA_STATE_ORDER(xas, xa, index, folio_order(folio)); - -retry: - xas_lock(&xas); - for (;;) { - xas_store(&xas, folio); - if (!xas_error(&xas)) - break; - xas_unlock(&xas); - if (!xas_nomem(&xas, gfp_mask)) - return xas_error(&xas); - goto retry; - } - - if (flags & NETFS_FLAG_PUT_MARK) - xas_set_mark(&xas, NETFS_BUF_PUT_MARK); - if (flags & NETFS_FLAG_PAGECACHE_MARK) - xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); - xas_unlock(&xas); - return xas_error(&xas); -} - -/* - * Create the specified range of folios in the buffer attached to the read - * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that - * these need freeing later. - */ -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask) -{ - struct folio *folio; - int ret; - - if (to + 1 == index) /* Page range is inclusive */ - return 0; - - do { - /* TODO: Figure out what order folio can be allocated here */ - folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); - if (!folio) - return -ENOMEM; - folio->index = index; - ret = netfs_xa_store_and_mark(buffer, index, folio, - NETFS_FLAG_PUT_MARK, gfp_mask); - if (ret < 0) { - folio_put(folio); - return ret; - } - - index += folio_nr_pages(folio); - } while (index <= to && index != 0); - - return 0; -} - -/* - * Clear an xarray buffer, putting a ref on the folios that have - * NETFS_BUF_PUT_MARK set. - */ -void netfs_clear_buffer(struct xarray *buffer) -{ - struct folio *folio; - XA_STATE(xas, buffer, 0); - - rcu_read_lock(); - xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { - folio_put(folio); - } - rcu_read_unlock(); - xa_destroy(buffer); -} - /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. 
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 3aa86e268f40..ec6cf8707fb0 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -483,7 +483,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (!debug) kdebug("R=%x: No submit", wreq->debug_id); - if (flen < fsize) + if (foff + flen < fsize) for (int s = 0; s < NR_IO_STREAMS; s++) netfs_issue_write(wreq, &wreq->io_streams[s]); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 533b65057e18..c848ebe5d08f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2169,6 +2169,8 @@ static __net_init int nfsd_net_init(struct net *net) nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; + nn->nfsd_info.mutex = &nfsd_mutex; + nn->nfsd_serv = NULL; nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cd9a6a1a9fc8..89d7918de7b1 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -672,7 +672,6 @@ int nfsd_create_serv(struct net *net) return error; } spin_lock(&nfsd_notifier_lock); - nn->nfsd_info.mutex = &nfsd_mutex; nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 89caef7513db..ba50388ee4bf 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, * @target: offset number of an entry in the group (start point) * @bsize: size in bits * @lock: spin lock protecting @bitmap + * @wrap: whether to wrap around */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, unsigned int bsize, - spinlock_t *lock) + spinlock_t *lock, bool wrap) { int pos, end = bsize; @@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, end = target; } + if (!wrap) + return -ENOSPC; /* wrap around */ for (pos = 0; pos < end; pos++) { @@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation + * @wrap: whether to wrap around */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, - struct nilfs_palloc_req *req) + struct nilfs_palloc_req *req, bool wrap) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; @@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, entries_per_group = nilfs_palloc_entries_per_group(inode); for (i = 0; i < ngroups; i += n) { - if (group >= ngroups) { + if (group >= ngroups && wrap) { /* wrap around */ group = 0; maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, @@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( - bitmap, group_offset, entries_per_group, lock); + bitmap, group_offset, entries_per_group, lock, + wrap); + /* + * Since the search for a free slot in the second and + * subsequent bitmap blocks always starts from the + * beginning, the wrap flag only has an effect on the + * first search. 
+ */ kunmap_local(bitmap_kaddr); if (pos >= 0) goto found; diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index b667e869ac07..d825a9faca6d 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -50,8 +50,8 @@ struct nilfs_palloc_req { struct buffer_head *pr_entry_bh; }; -int nilfs_palloc_prepare_alloc_entry(struct inode *, - struct nilfs_palloc_req *); +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req, bool wrap); void nilfs_palloc_commit_alloc_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 180fc8d36213..fc1caf63a42a 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) { int ret; - ret = nilfs_palloc_prepare_alloc_entry(dat, req); + ret = nilfs_palloc_prepare_alloc_entry(dat, req, true); if (ret < 0) return ret; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 52e50b1b7f22..dddfa604491a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct folio *folio, char *kaddr) goto Enamelen; if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) goto Espan; + if (unlikely(p->inode && + NILFS_PRIVATE_INODE(le64_to_cpu(p->inode)))) + goto Einumber; } if (offs != limit) goto Eend; @@ -160,6 +163,9 @@ Enamelen: goto bad_entry; Espan: error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "disallowed inode number"; bad_entry: nilfs_error(sb, "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 612e609158b5..1e86b9303b7c 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct nilfs_palloc_req req; int ret; - req.pr_entry_nr = 0; /* - * 0 says find free inode from beginning - * of a group. dull code!! 
- */ + req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb); req.pr_entry_bh = NULL; - ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false); if (!ret) { ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, &req.pr_entry_bh); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 728e90be3570..4017f7856440 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -116,9 +116,15 @@ enum { #define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) #define NILFS_MDT_INODE(sb, ino) \ - ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino))) + ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino))) #define NILFS_VALID_INODE(sb, ino) \ - ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino))) + ((ino) >= NILFS_FIRST_INO(sb) || \ + ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino)))) + +#define NILFS_PRIVATE_INODE(ino) ({ \ + ino_t __ino = (ino); \ + ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \ + (__ino) != NILFS_SKETCH_INO); }) /** * struct nilfs_transaction_info: context information for synchronization diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index f41d7b6d432c..e44dde57ab65 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, } nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + if (nilfs->ns_first_ino < NILFS_USER_INO) { + nilfs_err(nilfs->ns_sb, + "too small lower limit for non-reserved inode numbers: %u", + nilfs->ns_first_ino); + return -EINVAL; + } nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 85da0629415d..1e829ed7b0ef 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -182,7 +182,7 @@ struct the_nilfs { unsigned long ns_nrsvsegs; unsigned long ns_first_data_block; int ns_inode_size; - int ns_first_ino; + unsigned int ns_first_ino; u32 ns_crc_seed; /* /sys/fs/<nilfs>/<device> */ diff --git a/fs/open.c b/fs/open.c index 89cafb572061..278b3edcda44 100644 --- a/fs/open.c +++ b/fs/open.c @@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) return error; } -SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } @@ -1004,11 +1004,6 @@ static int do_dentry_open(struct file *f, } } - /* - * Once we return a file with FMODE_OPENED, __fput() will call - * fsnotify_close(), so we need fsnotify_open() here for symmetry. - */ - fsnotify_open(f); return 0; cleanup_all: @@ -1085,8 +1080,19 @@ EXPORT_SYMBOL(file_path); */ int vfs_open(const struct path *path, struct file *file) { + int ret; + file->f_path = *path; - return do_dentry_open(file, NULL); + ret = do_dentry_open(file, NULL); + if (!ret) { + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for + * symmetry. 
+ */ + fsnotify_open(file); + } + return ret; } struct file *dentry_open(const struct path *path, int flags, @@ -1177,8 +1183,10 @@ struct file *kernel_file_open(const struct path *path, int flags, error = do_dentry_open(f, NULL); if (error) { fput(f); - f = ERR_PTR(error); + return ERR_PTR(error); } + + fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); diff --git a/fs/super.c b/fs/super.c index b72f1d288e95..095ba793e10c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1502,8 +1502,17 @@ static int fs_bdev_thaw(struct block_device *bdev) lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + /* + * The block device may have been frozen before it was claimed by a + * filesystem. Concurrently another process might try to mount that + * frozen block device and has temporarily claimed the block device for + * that purpose causing a concurrent fs_bdev_thaw() to end up here. The + * mounter is already about to abort mounting because they still saw an + * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return + * NULL in that case. + */ sb = get_bdev_super(bdev); - if (WARN_ON_ONCE(!sb)) + if (!sb) return -EINVAL; if (sb->s_op->thaw_super) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c101cf266bc4..6af6f744fdd6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4058,20 +4058,32 @@ xfs_bmapi_reserve_delalloc( xfs_extlen_t indlen; uint64_t fdblocks; int error; - xfs_fileoff_t aoff = off; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; +retry: /* * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ + aoff = off; alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) prealloc = alen - len; - /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) { + /* + * If we're targetting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. 
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c101cf266bc4..6af6f744fdd6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4058,20 +4058,32 @@ xfs_bmapi_reserve_delalloc(
        xfs_extlen_t            indlen;
        uint64_t                fdblocks;
        int                     error;
-       xfs_fileoff_t           aoff = off;
+       xfs_fileoff_t           aoff;
+       bool                    use_cowextszhint =
+                                       whichfork == XFS_COW_FORK && !prealloc;
 
+retry:
        /*
         * Cap the alloc length. Keep track of prealloc so we know whether to
         * tag the inode before we return.
         */
+       aoff = off;
        alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
        if (!eof)
                alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
        if (prealloc && alen >= len)
                prealloc = alen - len;
 
-       /* Figure out the extent size, adjust alen */
-       if (whichfork == XFS_COW_FORK) {
+       /*
+        * If we're targeting the COW fork but aren't creating a speculative
+        * posteof preallocation, try to expand the reservation to align with
+        * the COW extent size hint if there's sufficient free space.
+        *
+        * Unlike the data fork, the CoW cancellation functions will free all
+        * the reservations at inactivation, so we don't require that every
+        * delalloc reservation have a dirty pagecache.
+        */
+       if (use_cowextszhint) {
                struct xfs_bmbt_irec    prev;
                xfs_extlen_t            extsz = xfs_get_cowextsz_hint(ip);
@@ -4090,7 +4102,7 @@ xfs_bmapi_reserve_delalloc(
         */
        error = xfs_quota_reserve_blkres(ip, alen);
        if (error)
-               return error;
+               goto out;
 
        /*
         * Split changing sb for alen and indlen since they could be coming
@@ -4140,6 +4152,17 @@ out_unreserve_frextents:
 out_unreserve_quota:
        if (XFS_IS_QUOTA_ON(mp))
                xfs_quota_unreserve_blkres(ip, alen);
+out:
+       if (error == -ENOSPC || error == -EDQUOT) {
+               trace_xfs_delalloc_enospc(ip, off, len);
+
+               if (prealloc || use_cowextszhint) {
+                       /* retry without any preallocation */
+                       use_cowextszhint = false;
+                       prealloc = 0;
+                       goto retry;
+               }
+       }
        return error;
 }
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 97996cb79aaa..454b63ef7201 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -996,7 +996,7 @@ struct xfs_getparents_by_handle {
 #define XFS_IOC_FSGEOMETRY          _IOR ('X', 126, struct xfs_fsop_geom)
 #define XFS_IOC_BULKSTAT            _IOR ('X', 127, struct xfs_bulkstat_req)
 #define XFS_IOC_INUMBERS            _IOR ('X', 128, struct xfs_inumbers_req)
-#define XFS_IOC_EXCHANGE_RANGE      _IOWR('X', 129, struct xfs_exchange_range)
+#define XFS_IOC_EXCHANGE_RANGE      _IOW ('X', 129, struct xfs_exchange_range)
 
 /* XFS_IOC_GETFSUUID ---------- deprecated 140   */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index e7a7bfbe75b4..513b50da6215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -379,10 +379,13 @@ xfs_dinode_verify_fork(
        /*
         * A directory small enough to fit in the inode must be stored
         * in local format. The directory sf <-> extents conversion
-        * code updates the directory size accordingly.
+        * code updates the directory size accordingly. Directories
+        * being truncated have zero size and are not subject to this
+        * check.
         */
        if (S_ISDIR(mode)) {
-               if (be64_to_cpu(dip->di_size) <= fork_size &&
+               if (dip->di_size &&
+                   be64_to_cpu(dip->di_size) <= fork_size &&
                    fork_format != XFS_DINODE_FMT_LOCAL)
                        return __this_address;
        }
@@ -528,9 +531,19 @@ xfs_dinode_verify(
        if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
                return __this_address;
 
-       /* No zero-length symlinks/dirs. */
-       if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
-               return __this_address;
+       /*
+        * No zero-length symlinks/dirs unless they're unlinked and hence being
+        * inactivated.
+        */
+       if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) {
+               if (dip->di_version > 1) {
+                       if (dip->di_nlink)
+                               return __this_address;
+               } else {
+                       if (dip->di_onlink)
+                               return __this_address;
+               }
+       }
 
        fa = xfs_dinode_verify_nrext64(mp, dip);
        if (fa)
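The xfs_dinode_verify() hunk above now tolerates a zero-length directory or symlink only while its link count is zero, i.e. while the inode is being inactivated; a linked inode with di_size == 0 is still flagged as corruption. A sketch of that decision (illustrative only, not from the patch), with a hypothetical dinode_model struct standing in for the on-disk dinode fields the verifier reads.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        struct dinode_model {
                bool     is_dir_or_symlink;
                uint64_t di_size;
                int      di_version;
                uint32_t di_nlink;      /* v2/v3 link count */
                uint16_t di_onlink;     /* v1 link count */
        };

        /* Returns true when the verifier should flag the inode as corrupt. */
        static bool zero_size_corrupt(const struct dinode_model *d)
        {
                if (!d->is_dir_or_symlink || d->di_size != 0)
                        return false;
                /* Zero size is only legal while the inode is unlinked. */
                if (d->di_version > 1)
                        return d->di_nlink != 0;
                return d->di_onlink != 0;
        }

        int main(void)
        {
                struct dinode_model live  = { true, 0, 3, 1, 0 };
                struct dinode_model dying = { true, 0, 3, 0, 0 };

                printf("linked zero-size dir corrupt?   %d\n", zero_size_corrupt(&live));  /* 1 */
                printf("unlinked zero-size dir corrupt? %d\n", zero_size_corrupt(&dying)); /* 0 */
                return 0;
        }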
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index ac2e77ebb54c..a4d9fbc21b83 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -486,13 +486,11 @@ out_unlock:
 
 /*
  * Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
+ * blocks.
  */
 bool
 xfs_can_free_eofblocks(
-       struct xfs_inode        *ip,
-       bool                    force)
+       struct xfs_inode        *ip)
 {
        struct xfs_bmbt_irec    imap;
        struct xfs_mount        *mp = ip->i_mount;
@@ -526,11 +524,11 @@ xfs_can_free_eofblocks(
                return false;
 
        /*
-        * Do not free real preallocated or append-only files unless the file
-        * has delalloc blocks and we are forced to remove them.
+        * Only free real extents for inodes with persistent preallocations or
+        * the append-only flag.
         */
        if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
-               if (!force || ip->i_delayed_blks == 0)
+               if (ip->i_delayed_blks == 0)
                        return false;
 
        /*
@@ -584,6 +582,22 @@ xfs_free_eofblocks(
        /* Wait on dio to ensure i_size has settled. */
        inode_dio_wait(VFS_I(ip));
 
+       /*
+        * For preallocated files only free delayed allocations.
+        *
+        * Note that this means we also leave speculative preallocations in
+        * place for preallocated files.
+        */
+       if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
+               if (ip->i_delayed_blks) {
+                       xfs_bmap_punch_delalloc_range(ip,
+                               round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
+                               LLONG_MAX);
+               }
+               xfs_inode_clear_eofblocks_tag(ip);
+               return 0;
+       }
+
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error) {
                ASSERT(xfs_is_shutdown(mp));
@@ -891,7 +905,7 @@ xfs_prepare_shift(
         * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
         * into the accessible region of the file.
         */
-       if (xfs_can_free_eofblocks(ip, true)) {
+       if (xfs_can_free_eofblocks(ip)) {
                error = xfs_free_eofblocks(ip);
                if (error)
                        return error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 51f84d8ff372..eb0895bfb9da 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,7 +63,7 @@ int   xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
                              xfs_off_t len);
 
 /* EOF block manipulation functions */
-bool   xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+bool   xfs_can_free_eofblocks(struct xfs_inode *ip);
 int    xfs_free_eofblocks(struct xfs_inode *ip);
 
 int    xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0953163a2d84..9967334ea99f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1155,7 +1155,7 @@ xfs_inode_free_eofblocks(
        }
        *lockflags |= XFS_IOLOCK_EXCL;
 
-       if (xfs_can_free_eofblocks(ip, false))
+       if (xfs_can_free_eofblocks(ip))
                return xfs_free_eofblocks(ip);
 
        /* inode could be preallocated or append-only */
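With the force parameter gone, the policy in the xfs_bmap_util.c hunks above is uniform: inodes flagged PREALLOC or APPEND only qualify for post-EOF trimming when they still carry delalloc reservations, and xfs_free_eofblocks() then punches just that delalloc range instead of running a truncate transaction. A sketch of the resulting decision table (illustrative only, not from the patch); the struct and helper names are hypothetical.

        #include <stdbool.h>
        #include <stdio.h>

        struct inode_model {
                bool prealloc_or_append;        /* XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND */
                unsigned long delayed_blks;     /* outstanding delalloc reservations */
        };

        /* Mirrors the remaining check in xfs_can_free_eofblocks() for flagged inodes;
         * the other eligibility checks are elided. */
        static bool can_free_eofblocks(const struct inode_model *ip)
        {
                if (ip->prealloc_or_append && ip->delayed_blks == 0)
                        return false;
                return true;
        }

        /* Mirrors the new early return in xfs_free_eofblocks(): flagged inodes only
         * get their delalloc reservations punched; real preallocations stay. */
        static const char *free_eofblocks_action(const struct inode_model *ip)
        {
                if (ip->prealloc_or_append)
                        return ip->delayed_blks ? "punch delalloc range only"
                                                : "nothing to do";
                return "truncate post-EOF blocks";
        }

        int main(void)
        {
                struct inode_model plain    = { false, 0 };
                struct inode_model prealloc = { true, 4 };

                printf("plain file: can_free=%d, action=%s\n",
                       can_free_eofblocks(&plain), free_eofblocks_action(&plain));
                printf("prealloc file with delalloc: can_free=%d, action=%s\n",
                       can_free_eofblocks(&prealloc), free_eofblocks_action(&prealloc));
                return 0;
        }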
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f36091e1e7f5..a4e3cd8971fc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,6 +42,7 @@
 #include "xfs_pnfs.h"
 #include "xfs_parent.h"
 #include "xfs_xattr.h"
+#include "xfs_sb.h"
 
 struct kmem_cache *xfs_inode_cache;
 
@@ -870,9 +871,16 @@ xfs_init_new_inode(
         * this saves us from needing to run a separate transaction to set the
         * fork offset in the immediate future.
         */
-       if (init_xattrs && xfs_has_attr(mp)) {
+       if (init_xattrs) {
                ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
                xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+
+               if (!xfs_has_attr(mp)) {
+                       spin_lock(&mp->m_sb_lock);
+                       xfs_add_attr(mp);
+                       spin_unlock(&mp->m_sb_lock);
+                       xfs_log_sb(tp);
+               }
        }
 
        /*
@@ -1595,7 +1603,7 @@ xfs_release(
        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
                return 0;
 
-       if (xfs_can_free_eofblocks(ip, false)) {
+       if (xfs_can_free_eofblocks(ip)) {
                /*
                 * Check if the inode is being opened, written and closed
                 * frequently and we have delayed allocation blocks outstanding
@@ -1856,15 +1864,13 @@ xfs_inode_needs_inactive(
 
        /*
         * This file isn't being freed, so check if there are post-eof blocks
-        * to free. @force is true because we are evicting an inode from the
-        * cache. Post-eof blocks must be freed, lest we end up with broken
-        * free space accounting.
+        * to free.
         *
         * Note: don't bother with iolock here since lockdep complains about
         * acquiring it in reclaim context. We have the only reference to the
         * inode at this point anyways.
         */
-       return xfs_can_free_eofblocks(ip, true);
+       return xfs_can_free_eofblocks(ip);
 }
 
 /*
@@ -1947,15 +1953,11 @@ xfs_inactive(
 
        if (VFS_I(ip)->i_nlink != 0) {
                /*
-                * force is true because we are evicting an inode from the
-                * cache. Post-eof blocks must be freed, lest we end up with
-                * broken free space accounting.
-                *
                 * Note: don't bother with iolock here since lockdep complains
                 * about acquiring it in reclaim context. We have the only
                 * reference to the inode at this point anyways.
                 */
-               if (xfs_can_free_eofblocks(ip, true))
+               if (xfs_can_free_eofblocks(ip))
                        error = xfs_free_eofblocks(ip);
 
                goto out;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 378342673925..414903885ab9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1148,33 +1148,23 @@ xfs_buffered_write_iomap_begin(
                }
        }
 
-retry:
-       error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
-                       end_fsb - offset_fsb, prealloc_blocks,
-                       allocfork == XFS_DATA_FORK ? &imap : &cmap,
-                       allocfork == XFS_DATA_FORK ? &icur : &ccur,
-                       allocfork == XFS_DATA_FORK ? eof : cow_eof);
-       switch (error) {
-       case 0:
-               break;
-       case -ENOSPC:
-       case -EDQUOT:
-               /* retry without any preallocation */
-               trace_xfs_delalloc_enospc(ip, offset, count);
-               if (prealloc_blocks) {
-                       prealloc_blocks = 0;
-                       goto retry;
-               }
-               fallthrough;
-       default:
-               goto out_unlock;
-       }
-
        if (allocfork == XFS_COW_FORK) {
+               error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+                               end_fsb - offset_fsb, prealloc_blocks, &cmap,
+                               &ccur, cow_eof);
+               if (error)
+                       goto out_unlock;
+
                trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
                goto found_cow;
        }
 
+       error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+                       end_fsb - offset_fsb, prealloc_blocks, &imap, &icur,
+                       eof);
+       if (error)
+               goto out_unlock;
+
        /*
         * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
         * them out if the write happens to fail.