diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-15 11:39:44 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-15 11:39:44 -0700 |
commit | 2aae1d67fd1d9070f8f23a6e7d9a7a093cf35fbb (patch) | |
tree | 425d848ecbbd12856fe7ef7190b8497d47ed4c7e /fs | |
parent | b8fc1bd73a5a12e48f9fd2e7ccea60cadf718c93 (diff) | |
parent | dc99c0ff53f588bb210b1e8b3314c7581cde68a2 (diff) |
Merge tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs inode / dentry updates from Christian Brauner:
"This contains smaller performance improvements to inodes and dentries:
inode:
- Add rcu based inode lookup variants.
They avoid one inode hash lock acquire in the common case thereby
significantly reducing contention. We already support RCU-based
operations but didn't take advantage of them during inode
insertion.
Callers of iget_locked() get the improvement without any code
changes. Callers that need a custom callback can switch to
iget5_locked_rcu() as e.g., did btrfs.
With 20 threads each walking a dedicated 1000 dirs * 1000 files
directory tree to stat(2) on a 32 core + 24GB ram vm:
before: 3.54s user 892.30s system 1966% cpu 45.549 total
after: 3.28s user 738.66s system 1955% cpu 37.932 total (-16.7%)
Long-term we should pick up the effort to introduce more
fine-grained locking and possibly improve on the currently used
hash implementation.
- Start zeroing i_state in inode_init_always() instead of doing it in
individual filesystems.
This allows us to remove an unneeded lock acquire in new_inode()
and not burden individual filesystems with this.
dcache:
- Move d_lockref out of the area used by RCU lookup to avoid
cacheline ping poing because the embedded name is sharing a
cacheline with d_lockref.
- Fix dentry size on 32bit with CONFIG_SMP=y so it does actually end
up with 128 bytes in total"
* tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs: fix dentry size
vfs: move d_lockref out of the area used by RCU lookup
bcachefs: remove now spurious i_state initialization
xfs: remove now spurious i_state initialization in xfs_inode_alloc
vfs: partially sanitize i_state zeroing on inode creation
xfs: preserve i_state around inode_init_always in xfs_reinit_inode
btrfs: use iget5_locked_rcu
vfs: add rcu-based find_inode variants for iget ops
Diffstat (limited to 'fs')
-rw-r--r-- | fs/bcachefs/fs.c | 1 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 2 | ||||
-rw-r--r-- | fs/inode.c | 108 | ||||
-rw-r--r-- | fs/xfs/xfs_icache.c | 5 |
4 files changed, 86 insertions, 30 deletions
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index fa1fee05cf8f..0c7d1bc0548a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -244,7 +244,6 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) inode->ei_flags = 0; mutex_init(&inode->ei_quota_lock); memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - inode->v.i_state = 0; if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { kmem_cache_free(bch2_inode_cache, inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3a2b902b2d1f..d62c96f00ff8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, args.ino = ino; args.root = root; - inode = iget5_locked(s, hashval, btrfs_find_actor, + inode = iget5_locked_rcu(s, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; diff --git a/fs/inode.c b/fs/inode.c index e0815acc5abb..f356fe2ec2b6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -162,6 +162,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; + inode->i_state = 0; atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; @@ -231,6 +232,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) if (unlikely(security_inode_alloc(inode))) return -ENOMEM; + this_cpu_inc(nr_inodes); return 0; @@ -886,36 +888,45 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } -static void __wait_on_freeing_inode(struct inode *inode); +static void __wait_on_freeing_inode(struct inode *inode, bool locked); /* * Called with the inode lock held. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data) + void *data, bool locked) { struct inode *inode = NULL; + if (locked) + lockdep_assert_held(&inode_hash_lock); + else + lockdep_assert_not_held(&inode_hash_lock); + + rcu_read_lock(); repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(inode, locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); + rcu_read_unlock(); return inode; } + rcu_read_unlock(); return NULL; } @@ -924,29 +935,39 @@ repeat: * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) + struct hlist_head *head, unsigned long ino, + bool locked) { struct inode *inode = NULL; + if (locked) + lockdep_assert_held(&inode_hash_lock); + else + lockdep_assert_not_held(&inode_hash_lock); + + rcu_read_lock(); repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(inode, locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); + rcu_read_unlock(); return inode; } + rcu_read_unlock(); return NULL; } @@ -1004,14 +1025,7 @@ EXPORT_SYMBOL(get_next_ino); */ struct inode *new_inode_pseudo(struct super_block *sb) { - struct inode *inode = alloc_inode(sb); - - if (inode) { - spin_lock(&inode->i_lock); - inode->i_state = 0; - spin_unlock(&inode->i_lock); - } - return inode; + return alloc_inode(sb); } /** @@ -1161,7 +1175,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, again: spin_lock(&inode_hash_lock); - old = find_inode(inode->i_sb, head, test, data); + old = find_inode(inode->i_sb, head, test, data, true); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. @@ -1235,7 +1249,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, struct inode *new = alloc_inode(sb); if (new) { - new->i_state = 0; inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); @@ -1246,6 +1259,47 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, EXPORT_SYMBOL(iget5_locked); /** + * iget5_locked_rcu - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is equivalent to iget5_locked, except the @test callback must + * tolerate the inode not being stable, including being mid-teardown. + */ +struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode, *new; + +again: + inode = find_inode(sb, head, test, data, false); + if (inode) { + if (IS_ERR(inode)) + return NULL; + wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + return inode; + } + + new = alloc_inode(sb); + if (new) { + inode = inode_insert5(new, hashval, test, set, data); + if (unlikely(inode != new)) + destroy_inode(new); + } + return inode; +} +EXPORT_SYMBOL_GPL(iget5_locked_rcu); + +/** * iget_locked - obtain an inode from a mounted file system * @sb: super block of file system * @ino: inode number to get @@ -1263,9 +1317,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); - spin_unlock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; @@ -1283,7 +1335,7 @@ again: spin_lock(&inode_hash_lock); /* We released the lock, so.. */ - old = find_inode_fast(sb, head, ino); + old = find_inode_fast(sb, head, ino, true); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); @@ -1419,7 +1471,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, struct inode *inode; spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data); + inode = find_inode(sb, head, test, data, true); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; @@ -1474,7 +1526,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) struct inode *inode; again: spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); + inode = find_inode_fast(sb, head, ino, true); spin_unlock(&inode_hash_lock); if (inode) { @@ -2235,17 +2287,21 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ -static void __wait_on_freeing_inode(struct inode *inode) +static void __wait_on_freeing_inode(struct inode *inode, bool locked) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + rcu_read_unlock(); + if (locked) + spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq, &wait.wq_entry); - spin_lock(&inode_hash_lock); + if (locked) + spin_lock(&inode_hash_lock); + rcu_read_lock(); } static __initdata unsigned long ihash_entries; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9967334ea99f..cf629302d48e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -86,9 +86,8 @@ xfs_inode_alloc( return NULL; } - /* VFS doesn't initialise i_mode or i_state! */ + /* VFS doesn't initialise i_mode! */ VFS_I(ip)->i_mode = 0; - VFS_I(ip)->i_state = 0; mapping_set_large_folios(VFS_I(ip)->i_mapping); XFS_STATS_INC(mp, vn_active); @@ -314,6 +313,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; + unsigned long state = inode->i_state; error = inode_init_always(mp->m_super, inode); @@ -324,6 +324,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; + inode->i_state = state; mapping_set_large_folios(inode->i_mapping); return error; } |