Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/vfs_dentry.c | 9
-rw-r--r-- fs/9p/vfs_inode.c | 1
-rw-r--r-- fs/Kconfig.binfmt | 8
-rw-r--r-- fs/afs/inode.c | 5
-rw-r--r-- fs/afs/mntpt.c | 5
-rw-r--r-- fs/aio.c | 8
-rw-r--r-- fs/attr.c | 2
-rw-r--r-- fs/autofs/init.c | 1
-rw-r--r-- fs/autofs/inode.c | 16
-rw-r--r-- fs/bcachefs/alloc_background.c | 359
-rw-r--r-- fs/bcachefs/alloc_background.h | 14
-rw-r--r-- fs/bcachefs/alloc_foreground.c | 6
-rw-r--r-- fs/bcachefs/backpointers.c | 72
-rw-r--r-- fs/bcachefs/bcachefs.h | 68
-rw-r--r-- fs/bcachefs/bcachefs_format.h | 208
-rw-r--r-- fs/bcachefs/bkey.c | 7
-rw-r--r-- fs/bcachefs/bkey.h | 7
-rw-r--r-- fs/bcachefs/bkey_methods.c | 6
-rw-r--r-- fs/bcachefs/bkey_methods.h | 3
-rw-r--r-- fs/bcachefs/btree_cache.c | 9
-rw-r--r-- fs/bcachefs/btree_gc.c | 89
-rw-r--r-- fs/bcachefs/btree_gc.h | 44
-rw-r--r-- fs/bcachefs/btree_gc_types.h | 29
-rw-r--r-- fs/bcachefs/btree_io.c | 93
-rw-r--r-- fs/bcachefs/btree_iter.c | 42
-rw-r--r-- fs/bcachefs/btree_key_cache.c | 43
-rw-r--r-- fs/bcachefs/btree_locking.c | 11
-rw-r--r-- fs/bcachefs/btree_locking.h | 22
-rw-r--r-- fs/bcachefs/btree_node_scan.c | 9
-rw-r--r-- fs/bcachefs/btree_types.h | 17
-rw-r--r-- fs/bcachefs/btree_write_buffer.c | 37
-rw-r--r-- fs/bcachefs/btree_write_buffer.h | 3
-rw-r--r-- fs/bcachefs/buckets.c | 297
-rw-r--r-- fs/bcachefs/buckets.h | 25
-rw-r--r-- fs/bcachefs/buckets_types.h | 2
-rw-r--r-- fs/bcachefs/chardev.c | 9
-rw-r--r-- fs/bcachefs/clock.c | 7
-rw-r--r-- fs/bcachefs/data_update.c | 47
-rw-r--r-- fs/bcachefs/data_update.h | 5
-rw-r--r-- fs/bcachefs/debug.c | 113
-rw-r--r-- fs/bcachefs/disk_groups_format.h | 21
-rw-r--r-- fs/bcachefs/ec.c | 28
-rw-r--r-- fs/bcachefs/errcode.h | 3
-rw-r--r-- fs/bcachefs/error.c | 19
-rw-r--r-- fs/bcachefs/error.h | 7
-rw-r--r-- fs/bcachefs/extents.c | 9
-rw-r--r-- fs/bcachefs/eytzinger.h | 6
-rw-r--r-- fs/bcachefs/fs-io-buffered.c | 6
-rw-r--r-- fs/bcachefs/fs-io-direct.c | 4
-rw-r--r-- fs/bcachefs/fs-ioctl.c | 19
-rw-r--r-- fs/bcachefs/fs.c | 45
-rw-r--r-- fs/bcachefs/fsck.c | 54
-rw-r--r-- fs/bcachefs/io_misc.c | 2
-rw-r--r-- fs/bcachefs/io_read.c | 41
-rw-r--r-- fs/bcachefs/io_write.c | 19
-rw-r--r-- fs/bcachefs/journal.c | 26
-rw-r--r-- fs/bcachefs/journal.h | 2
-rw-r--r-- fs/bcachefs/journal_io.c | 32
-rw-r--r-- fs/bcachefs/journal_seq_blacklist.c | 2
-rw-r--r-- fs/bcachefs/journal_seq_blacklist_format.h | 15
-rw-r--r-- fs/bcachefs/lru.c | 39
-rw-r--r-- fs/bcachefs/lru.h | 6
-rw-r--r-- fs/bcachefs/mean_and_variance_test.c | 1
-rw-r--r-- fs/bcachefs/move.c | 41
-rw-r--r-- fs/bcachefs/movinggc.c | 7
-rw-r--r-- fs/bcachefs/opts.h | 2
-rw-r--r-- fs/bcachefs/recovery.c | 12
-rw-r--r-- fs/bcachefs/replicas_format.h | 31
-rw-r--r-- fs/bcachefs/sb-downgrade.c | 15
-rw-r--r-- fs/bcachefs/sb-downgrade_format.h | 17
-rw-r--r-- fs/bcachefs/sb-errors.c | 14
-rw-r--r-- fs/bcachefs/sb-errors_format.h | 310
-rw-r--r-- fs/bcachefs/sb-errors_types.h | 281
-rw-r--r-- fs/bcachefs/sb-members_format.h | 110
-rw-r--r-- fs/bcachefs/seqmutex.h | 11
-rw-r--r-- fs/bcachefs/snapshot.c | 100
-rw-r--r-- fs/bcachefs/snapshot.h | 1
-rw-r--r-- fs/bcachefs/str_hash.h | 2
-rw-r--r-- fs/bcachefs/super-io.c | 25
-rw-r--r-- fs/bcachefs/super.c | 40
-rw-r--r-- fs/bcachefs/util.c | 25
-rw-r--r-- fs/bcachefs/util.h | 1
-rw-r--r-- fs/befs/linuxvfs.c | 10
-rw-r--r-- fs/binfmt_elf.c | 101
-rw-r--r-- fs/binfmt_elf_fdpic.c | 5
-rw-r--r-- fs/binfmt_misc.c | 8
-rw-r--r-- fs/binfmt_script.c | 1
-rw-r--r-- fs/btrfs/bio.c | 4
-rw-r--r-- fs/btrfs/block-group.c | 20
-rw-r--r-- fs/btrfs/btrfs_inode.h | 10
-rw-r--r-- fs/btrfs/disk-io.c | 12
-rw-r--r-- fs/btrfs/extent_io.c | 62
-rw-r--r-- fs/btrfs/extent_map.c | 123
-rw-r--r-- fs/btrfs/file.c | 16
-rw-r--r-- fs/btrfs/free-space-cache.c | 2
-rw-r--r-- fs/btrfs/fs.h | 1
-rw-r--r-- fs/btrfs/inode.c | 4
-rw-r--r-- fs/btrfs/ioctl.c | 2
-rw-r--r-- fs/btrfs/ordered-data.c | 31
-rw-r--r-- fs/btrfs/qgroup.c | 14
-rw-r--r-- fs/btrfs/ref-verify.c | 9
-rw-r--r-- fs/btrfs/scrub.c | 24
-rw-r--r-- fs/btrfs/space-info.c | 24
-rw-r--r-- fs/btrfs/tree-log.c | 60
-rw-r--r-- fs/buffer.c | 7
-rw-r--r-- fs/cachefiles/cache.c | 45
-rw-r--r-- fs/cachefiles/daemon.c | 7
-rw-r--r-- fs/cachefiles/internal.h | 8
-rw-r--r-- fs/cachefiles/ondemand.c | 270
-rw-r--r-- fs/cachefiles/volume.c | 1
-rw-r--r-- fs/cachefiles/xattr.c | 5
-rw-r--r-- fs/coda/symlink.c | 10
-rw-r--r-- fs/configfs/dir.c | 10
-rw-r--r-- fs/cramfs/inode.c | 26
-rw-r--r-- fs/dcache.c | 105
-rw-r--r-- fs/debugfs/inode.c | 26
-rw-r--r-- fs/efivarfs/super.c | 12
-rw-r--r-- fs/efs/inode.c | 1
-rw-r--r-- fs/efs/symlink.c | 14
-rw-r--r-- fs/erofs/super.c | 2
-rw-r--r-- fs/erofs/zmap.c | 2
-rw-r--r-- fs/erofs/zutil.c | 8
-rw-r--r-- fs/exec.c | 63
-rw-r--r-- fs/exec_test.c | 141
-rw-r--r-- fs/exfat/super.c | 8
-rw-r--r-- fs/exportfs/expfs.c | 9
-rw-r--r-- fs/ext4/crypto.c | 10
-rw-r--r-- fs/ext4/ext4.h | 35
-rw-r--r-- fs/ext4/namei.c | 122
-rw-r--r-- fs/ext4/super.c | 26
-rw-r--r-- fs/f2fs/acl.c | 3
-rw-r--r-- fs/f2fs/dir.c | 105
-rw-r--r-- fs/f2fs/f2fs.h | 16
-rw-r--r-- fs/f2fs/file.c | 6
-rw-r--r-- fs/f2fs/namei.c | 10
-rw-r--r-- fs/f2fs/recovery.c | 9
-rw-r--r-- fs/f2fs/super.c | 8
-rw-r--r-- fs/fat/fat.h | 18
-rw-r--r-- fs/fat/fat_test.c | 1
-rw-r--r-- fs/fat/inode.c | 675
-rw-r--r-- fs/fat/namei_msdos.c | 38
-rw-r--r-- fs/fat/namei_vfat.c | 38
-rw-r--r-- fs/fhandle.c | 178
-rw-r--r-- fs/file.c | 4
-rw-r--r-- fs/fs_parser.c | 34
-rw-r--r-- fs/fsopen.c | 7
-rw-r--r-- fs/fuse/acl.c | 4
-rw-r--r-- fs/fuse/inode.c | 24
-rw-r--r-- fs/hfs/inode.c | 3
-rw-r--r-- fs/hfs/super.c | 1
-rw-r--r-- fs/hfsplus/bfind.c | 15
-rw-r--r-- fs/hfsplus/extents.c | 9
-rw-r--r-- fs/hfsplus/hfsplus_fs.h | 21
-rw-r--r-- fs/hfsplus/ioctl.c | 4
-rw-r--r-- fs/hfsplus/xattr.c | 2
-rw-r--r-- fs/hostfs/hostfs_kern.c | 106
-rw-r--r-- fs/hpfs/namei.c | 15
-rw-r--r-- fs/hpfs/super.c | 1
-rw-r--r-- fs/hugetlbfs/inode.c | 12
-rw-r--r-- fs/inode.c | 109
-rw-r--r-- fs/internal.h | 16
-rw-r--r-- fs/iomap/buffered-io.c | 33
-rw-r--r-- fs/isofs/inode.c | 16
-rw-r--r-- fs/isofs/rock.c | 17
-rw-r--r-- fs/jffs2/file.c | 14
-rw-r--r-- fs/jfs/xattr.c | 4
-rw-r--r-- fs/libfs.c | 74
-rw-r--r-- fs/locks.c | 11
-rw-r--r-- fs/minix/inode.c | 1
-rw-r--r-- fs/minix/namei.c | 3
-rw-r--r-- fs/mount.h | 3
-rw-r--r-- fs/mpage.c | 13
-rw-r--r-- fs/namei.c | 249
-rw-r--r-- fs/namespace.c | 524
-rw-r--r-- fs/netfs/buffered_read.c | 14
-rw-r--r-- fs/netfs/buffered_write.c | 26
-rw-r--r-- fs/netfs/direct_read.c | 2
-rw-r--r-- fs/netfs/direct_write.c | 13
-rw-r--r-- fs/netfs/fscache_cache.c | 4
-rw-r--r-- fs/netfs/fscache_cookie.c | 28
-rw-r--r-- fs/netfs/fscache_io.c | 12
-rw-r--r-- fs/netfs/fscache_main.c | 2
-rw-r--r-- fs/netfs/fscache_volume.c | 18
-rw-r--r-- fs/netfs/internal.h | 44
-rw-r--r-- fs/netfs/io.c | 12
-rw-r--r-- fs/netfs/main.c | 4
-rw-r--r-- fs/netfs/misc.c | 85
-rw-r--r-- fs/netfs/objects.c | 5
-rw-r--r-- fs/netfs/write_collect.c | 23
-rw-r--r-- fs/netfs/write_issue.c | 47
-rw-r--r-- fs/nfs/dir.c | 77
-rw-r--r-- fs/nfs/direct.c | 2
-rw-r--r-- fs/nfs/nfs4proc.c | 24
-rw-r--r-- fs/nfs/pagelist.c | 5
-rw-r--r-- fs/nfs/read.c | 2
-rw-r--r-- fs/nfs/symlink.c | 12
-rw-r--r-- fs/nfs/write.c | 1
-rw-r--r-- fs/nfsd/netlink.c | 2
-rw-r--r-- fs/nfsd/netlink.h | 3
-rw-r--r-- fs/nfsd/nfsctl.c | 50
-rw-r--r-- fs/nfsd/nfsfh.c | 2
-rw-r--r-- fs/nfsd/nfssvc.c | 1
-rw-r--r-- fs/nilfs2/alloc.c | 19
-rw-r--r-- fs/nilfs2/alloc.h | 4
-rw-r--r-- fs/nilfs2/dat.c | 2
-rw-r--r-- fs/nilfs2/dir.c | 40
-rw-r--r-- fs/nilfs2/ifile.c | 7
-rw-r--r-- fs/nilfs2/nilfs.h | 10
-rw-r--r-- fs/nilfs2/segment.c | 3
-rw-r--r-- fs/nilfs2/the_nilfs.c | 6
-rw-r--r-- fs/nilfs2/the_nilfs.h | 2
-rw-r--r-- fs/nls/mac-celtic.c | 1
-rw-r--r-- fs/nls/mac-centeuro.c | 1
-rw-r--r-- fs/nls/mac-croatian.c | 1
-rw-r--r-- fs/nls/mac-cyrillic.c | 1
-rw-r--r-- fs/nls/mac-gaelic.c | 1
-rw-r--r-- fs/nls/mac-greek.c | 1
-rw-r--r-- fs/nls/mac-iceland.c | 1
-rw-r--r-- fs/nls/mac-inuit.c | 1
-rw-r--r-- fs/nls/mac-roman.c | 1
-rw-r--r-- fs/nls/mac-romanian.c | 1
-rw-r--r-- fs/nls/mac-turkish.c | 1
-rw-r--r-- fs/nls/nls_ascii.c | 1
-rw-r--r-- fs/nls/nls_base.c | 1
-rw-r--r-- fs/nls/nls_cp1250.c | 1
-rw-r--r-- fs/nls/nls_cp1251.c | 1
-rw-r--r-- fs/nls/nls_cp1255.c | 1
-rw-r--r-- fs/nls/nls_cp437.c | 1
-rw-r--r-- fs/nls/nls_cp737.c | 1
-rw-r--r-- fs/nls/nls_cp775.c | 1
-rw-r--r-- fs/nls/nls_cp850.c | 1
-rw-r--r-- fs/nls/nls_cp852.c | 1
-rw-r--r-- fs/nls/nls_cp855.c | 1
-rw-r--r-- fs/nls/nls_cp857.c | 1
-rw-r--r-- fs/nls/nls_cp860.c | 1
-rw-r--r-- fs/nls/nls_cp861.c | 1
-rw-r--r-- fs/nls/nls_cp862.c | 1
-rw-r--r-- fs/nls/nls_cp863.c | 1
-rw-r--r-- fs/nls/nls_cp864.c | 1
-rw-r--r-- fs/nls/nls_cp865.c | 1
-rw-r--r-- fs/nls/nls_cp866.c | 1
-rw-r--r-- fs/nls/nls_cp869.c | 1
-rw-r--r-- fs/nls/nls_cp874.c | 1
-rw-r--r-- fs/nls/nls_cp932.c | 1
-rw-r--r-- fs/nls/nls_cp936.c | 1
-rw-r--r-- fs/nls/nls_cp949.c | 1
-rw-r--r-- fs/nls/nls_cp950.c | 1
-rw-r--r-- fs/nls/nls_euc-jp.c | 1
-rw-r--r-- fs/nls/nls_iso8859-1.c | 1
-rw-r--r-- fs/nls/nls_iso8859-13.c | 1
-rw-r--r-- fs/nls/nls_iso8859-14.c | 1
-rw-r--r-- fs/nls/nls_iso8859-15.c | 1
-rw-r--r-- fs/nls/nls_iso8859-2.c | 1
-rw-r--r-- fs/nls/nls_iso8859-3.c | 1
-rw-r--r-- fs/nls/nls_iso8859-4.c | 1
-rw-r--r-- fs/nls/nls_iso8859-5.c | 1
-rw-r--r-- fs/nls/nls_iso8859-6.c | 1
-rw-r--r-- fs/nls/nls_iso8859-7.c | 1
-rw-r--r-- fs/nls/nls_iso8859-9.c | 1
-rw-r--r-- fs/nls/nls_koi8-r.c | 1
-rw-r--r-- fs/nls/nls_koi8-ru.c | 1
-rw-r--r-- fs/nls/nls_koi8-u.c | 1
-rw-r--r-- fs/nls/nls_ucs2_utils.c | 1
-rw-r--r-- fs/nls/nls_utf8.c | 1
-rw-r--r-- fs/nsfs.c | 122
-rw-r--r-- fs/ntfs3/super.c | 12
-rw-r--r-- fs/ocfs2/aops.c | 5
-rw-r--r-- fs/ocfs2/journal.c | 209
-rw-r--r-- fs/ocfs2/journal.h | 2
-rw-r--r-- fs/ocfs2/ocfs2.h | 27
-rw-r--r-- fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r-- fs/ocfs2/super.c | 4
-rw-r--r-- fs/open.c | 43
-rw-r--r-- fs/openpromfs/inode.c | 1
-rw-r--r-- fs/orangefs/inode.c | 13
-rw-r--r-- fs/orangefs/orangefs-bufmap.c | 4
-rw-r--r-- fs/overlayfs/dir.c | 8
-rw-r--r-- fs/overlayfs/export.c | 6
-rw-r--r-- fs/pidfs.c | 90
-rw-r--r-- fs/proc/base.c | 2
-rw-r--r-- fs/proc/generic.c | 6
-rw-r--r-- fs/proc/proc_sysctl.c | 70
-rw-r--r-- fs/proc/task_mmu.c | 3
-rw-r--r-- fs/proc_namespace.c | 6
-rw-r--r-- fs/pstore/blk.c | 2
-rw-r--r-- fs/pstore/platform.c | 1
-rw-r--r-- fs/qnx4/inode.c | 1
-rw-r--r-- fs/qnx6/inode.c | 1
-rw-r--r-- fs/quota/dquot.c | 8
-rw-r--r-- fs/read_write.c | 18
-rw-r--r-- fs/readdir.c | 4
-rw-r--r-- fs/reiserfs/inode.c | 1
-rw-r--r-- fs/romfs/super.c | 22
-rw-r--r-- fs/signalfd.c | 6
-rw-r--r-- fs/smb/client/cifsfs.c | 3
-rw-r--r-- fs/smb/client/cifsglob.h | 7
-rw-r--r-- fs/smb/client/cifspdu.h | 2
-rw-r--r-- fs/smb/client/cifssmb.c | 8
-rw-r--r-- fs/smb/client/file.c | 57
-rw-r--r-- fs/smb/client/fs_context.c | 39
-rw-r--r-- fs/smb/client/inode.c | 4
-rw-r--r-- fs/smb/client/smb2ops.c | 3
-rw-r--r-- fs/smb/client/smb2pdu.c | 22
-rw-r--r-- fs/smb/client/smb2transport.c | 2
-rw-r--r-- fs/smb/common/cifs_arc4.c | 1
-rw-r--r-- fs/smb/common/cifs_md4.c | 1
-rw-r--r-- fs/smb/common/smb2pdu.h | 34
-rw-r--r-- fs/smb/server/smb2pdu.c | 44
-rw-r--r-- fs/smb/server/vfs.c | 17
-rw-r--r-- fs/smb/server/vfs.h | 3
-rw-r--r-- fs/smb/server/vfs_cache.c | 3
-rw-r--r-- fs/stat.c | 206
-rw-r--r-- fs/super.c | 11
-rw-r--r-- fs/sysv/super.c | 1
-rw-r--r-- fs/tracefs/inode.c | 16
-rw-r--r-- fs/ufs/dir.c | 1
-rw-r--r-- fs/userfaultfd.c | 7
-rw-r--r-- fs/vboxsf/file.c | 18
-rw-r--r-- fs/vboxsf/super.c | 16
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.c | 6
-rw-r--r-- fs/xfs/libxfs/xfs_attr.c | 40
-rw-r--r-- fs/xfs/libxfs/xfs_attr.h | 3
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 32
-rw-r--r-- fs/xfs/libxfs/xfs_fs.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c | 47
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 7
-rw-r--r-- fs/xfs/scrub/scrub.c | 2
-rw-r--r-- fs/xfs/scrub/xfarray.c | 9
-rw-r--r-- fs/xfs/xfs_attr_item.c | 17
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 30
-rw-r--r-- fs/xfs/xfs_bmap_util.h | 2
-rw-r--r-- fs/xfs/xfs_handle.c | 7
-rw-r--r-- fs/xfs/xfs_icache.c | 7
-rw-r--r-- fs/xfs/xfs_inode.c | 47
-rw-r--r-- fs/xfs/xfs_iomap.c | 34
-rw-r--r-- fs/xfs/xfs_iops.c | 15
-rw-r--r-- fs/xfs/xfs_iwalk.c | 5
-rw-r--r-- fs/xfs/xfs_reflink.c | 1
338 files changed, 5811 insertions, 3572 deletions
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f16f73581634..01338d4c2d9e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -48,12 +48,17 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
struct hlist_node *p, *n;
+ struct hlist_head head;
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
- hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
+
+ spin_lock(&dentry->d_lock);
+ hlist_move_list((struct hlist_head *)&dentry->d_fsdata, &head);
+ spin_unlock(&dentry->d_lock);
+
+ hlist_for_each_safe(p, n, &head)
p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
- dentry->d_fsdata = NULL;
}
static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7a3308d77606..fd72fc38c8f5 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode)
__le32 __maybe_unused version;
if (!is_bad_inode(inode)) {
+ netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
version = cpu_to_le32(v9inode->qid.version);
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f5693164ca9a..bd2f530e5740 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -176,4 +176,12 @@ config COREDUMP
certainly want to say Y here. Not necessary on systems that never
need debugging or only ever run flawless code.
+config EXEC_KUNIT_TEST
+ bool "Build execve tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the exec KUnit tests, which test boundary conditions
+ of various aspects of the exec internals.
+
endmenu
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 94fc049aff58..3acf5e050072 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -512,7 +512,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque)
struct afs_vnode *vnode = AFS_FS_I(inode);
vnode->volume = as->volume;
- vnode->fid.vid = as->volume->vid,
+ vnode->fid.vid = as->volume->vid;
vnode->fid.vnode = 1;
vnode->fid.unique = 1;
inode->i_ino = 1;
@@ -545,7 +545,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
BUG_ON(!(inode->i_state & I_NEW));
vnode = AFS_FS_I(inode);
- vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
+ vnode->cb_v_check = atomic_read(&as->volume->cb_v_break);
afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
@@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+ netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
afs_set_cache_aux(vnode, &aux);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 97f50e9fd9eb..297487ee8323 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -140,6 +140,11 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
put_page(page);
if (ret < 0)
return ret;
+
+ /* Don't cross a backup volume mountpoint from a backup volume */
+ if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL &&
+ ctx->type == AFSVL_BACKVOL)
+ return -ENODEV;
}
return 0;
diff --git a/fs/aio.c b/fs/aio.c
index 57c9f7c077e6..93ef59d358b3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1516,7 +1516,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
iocb_put(iocb);
}
-static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
int ret;
@@ -1542,7 +1542,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
} else
req->ki_ioprio = get_current_ioprio();
- ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type);
if (unlikely(ret))
return ret;
@@ -1594,7 +1594,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
struct file *file;
int ret;
- ret = aio_prep_rw(req, iocb);
+ ret = aio_prep_rw(req, iocb, READ);
if (ret)
return ret;
file = req->ki_filp;
@@ -1621,7 +1621,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
struct file *file;
int ret;
- ret = aio_prep_rw(req, iocb);
+ ret = aio_prep_rw(req, iocb, WRITE);
if (ret)
return ret;
file = req->ki_filp;
diff --git a/fs/attr.c b/fs/attr.c
index 960a310581eb..825007d5cda4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -17,8 +17,6 @@
#include <linux/filelock.h>
#include <linux/security.h>
-#include "internal.h"
-
/**
* setattr_should_drop_sgid - determine whether the setgid bit needs to be
* removed
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b5e4dfa04ed0..1d644a35ffa0 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -38,4 +38,5 @@ static void __exit exit_autofs_fs(void)
module_init(init_autofs_fs)
module_exit(exit_autofs_fs)
+MODULE_DESCRIPTION("Kernel automounter support");
MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 1f5db6863663..cf792d4de4f1 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -126,7 +126,7 @@ enum {
const struct fs_parameter_spec autofs_param_specs[] = {
fsparam_flag ("direct", Opt_direct),
fsparam_fd ("fd", Opt_fd),
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_flag ("ignore", Opt_ignore),
fsparam_flag ("indirect", Opt_indirect),
fsparam_u32 ("maxproto", Opt_maxproto),
@@ -134,7 +134,7 @@ const struct fs_parameter_spec autofs_param_specs[] = {
fsparam_flag ("offset", Opt_offset),
fsparam_u32 ("pgrp", Opt_pgrp),
fsparam_flag ("strictexpire", Opt_strictexpire),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -193,8 +193,6 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct autofs_fs_context *ctx = fc->fs_private;
struct autofs_sb_info *sbi = fc->s_fs_info;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, autofs_param_specs, param, &result);
@@ -205,16 +203,10 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_fd:
return autofs_parse_fd(fc, sbi, param, &result);
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return invalfc(fc, "Invalid uid");
- ctx->uid = uid;
+ ctx->uid = result.uid;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return invalfc(fc, "Invalid gid");
- ctx->gid = gid;
+ ctx->gid = result.gid;
break;
case Opt_pgrp:
ctx->pgrp = result.uint_32;
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 346cd91f91f9..658f11aebda1 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -3,6 +3,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
@@ -29,7 +30,7 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
/* Persistent alloc info: */
@@ -259,6 +260,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
"invalid data type (got %u should be %u)",
a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+ for (unsigned i = 0; i < 2; i++)
+ bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
+ c, err,
+ alloc_key_io_time_bad,
+ "invalid io_time[%s]: %llu, max %llu",
+ i == READ ? "read" : "write",
+ a.v->io_time[i], LRU_TIME_MAX);
+
switch (a.v->data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
@@ -741,6 +750,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
@@ -756,8 +766,8 @@ int bch2_trigger_alloc(struct btree_trans *trans,
alloc_data_type_set(new_a, new_a->data_type);
if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
+ new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
@@ -767,6 +777,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ alloc_data_type_set(new_a, new_a->data_type);
}
if (old_a->data_type != new_a->data_type ||
@@ -780,7 +791,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (new_a->data_type == BCH_DATA_cached &&
!new_a->io_time[READ])
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
u64 old_lru = alloc_lru_idx_read(*old_a);
u64 new_lru = alloc_lru_idx_read(*new_a);
@@ -860,8 +871,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
percpu_down_read(&c->mark_lock);
- if (new_a->gen != old_a->gen)
- *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+ if (new_a->gen != old_a->gen) {
+ u8 *gen = bucket_gen(ca, new.k->p.offset);
+ if (unlikely(!gen)) {
+ percpu_up_read(&c->mark_lock);
+ goto invalid_bucket;
+ }
+ *gen = new_a->gen;
+ }
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
percpu_up_read(&c->mark_lock);
@@ -875,14 +892,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
closure_wake_up(&c->freelist_wait);
if (statechange(a->data_type == BCH_DATA_need_discard) &&
- !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
bucket_flushed(new_a))
- bch2_discard_one_bucket_fast(c, new.k->p);
+ bch2_discard_one_bucket_fast(ca, new.k->p.offset);
if (statechange(a->data_type == BCH_DATA_cached) &&
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_gc_gens_async(c);
@@ -895,6 +912,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, new.k->p.offset);
+ if (unlikely(!g)) {
+ percpu_up_read(&c->mark_lock);
+ goto invalid_bucket;
+ }
+ g->gen_valid = 1;
bucket_lock(g);
@@ -910,8 +932,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
err:
+ printbuf_exit(&buf);
bch2_dev_put(ca);
return ret;
+invalid_bucket:
+ bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
+ ret = -EIO;
+ goto err;
}
/*
@@ -1526,13 +1554,13 @@ err:
}
static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
- struct btree_iter *alloc_iter)
+ struct btree_iter *alloc_iter,
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- struct btree_iter lru_iter;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
- struct bkey_s_c alloc_k, lru_k;
+ struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret;
@@ -1546,6 +1574,14 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ if (a->fragmentation_lru) {
+ ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+ a->fragmentation_lru,
+ alloc_k, last_flushed);
+ if (ret)
+ return ret;
+ }
+
if (a->data_type != BCH_DATA_cached)
return 0;
@@ -1561,7 +1597,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
goto err;
- a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
ret = bch2_trans_update(trans, alloc_iter,
&a_mut->k_i, BTREE_TRIGGER_norun);
if (ret)
@@ -1570,73 +1606,66 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a = &a_mut->v;
}
- lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ]), 0);
- ret = bkey_err(lru_k);
+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+ alloc_k, last_flushed);
if (ret)
- return ret;
-
- if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
- alloc_key_to_missing_lru_entry,
- "missing lru entry\n"
- " %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- ret = bch2_lru_set(trans,
- alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ]);
- if (ret)
- goto err;
- }
+ goto err;
err:
fsck_err:
- bch2_trans_iter_exit(trans, &lru_iter);
printbuf_exit(&buf);
return ret;
}
int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter)));
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
}
-static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
int ret;
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
- ret = -EEXIST;
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ ret = -BCH_ERR_EEXIST_discard_in_flight_add;
goto out;
}
- ret = darray_push(&c->discard_buckets_in_flight, bucket);
+ ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
+ .in_progress = in_progress,
+ .bucket = bucket,
+ }));
out:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
return ret;
}
-static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ BUG_ON(!i->in_progress);
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
goto found;
}
BUG();
found:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
}
struct discard_buckets_state {
@@ -1644,26 +1673,11 @@ struct discard_buckets_state {
u64 open;
u64 need_journal_commit;
u64 discarded;
- struct bch_dev *ca;
u64 need_journal_commit_this_dev;
};
-static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
-{
- if (s->ca == ca)
- return;
-
- if (s->ca && s->need_journal_commit_this_dev >
- bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (s->ca)
- percpu_ref_put(&s->ca->io_ref);
- s->ca = ca;
- s->need_journal_commit_this_dev = 0;
-}
-
static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
struct discard_buckets_state *s)
@@ -1677,16 +1691,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
bool discard_locked = false;
int ret = 0;
- struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
- ? s->ca
- : bch2_dev_get_ioref(c, pos.inode, WRITE);
- if (!ca) {
- bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
- return 0;
- }
-
- discard_buckets_next_dev(c, s, ca);
-
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
s->open++;
goto out;
@@ -1746,7 +1750,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
- if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+ if (discard_in_flight_add(ca, iter.pos.offset, true))
goto out;
discard_locked = true;
@@ -1770,8 +1774,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- alloc_data_type_set(&a->v, a->v.data_type);
write:
+ alloc_data_type_set(&a->v, a->v.data_type);
+
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
@@ -1783,7 +1788,7 @@ write:
s->discarded++;
out:
if (discard_locked)
- discard_in_flight_remove(c, iter.pos);
+ discard_in_flight_remove(ca, iter.pos.offset);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
@@ -1792,7 +1797,8 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
+ struct bch_fs *c = ca->fs;
struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1803,23 +1809,41 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
-
- discard_buckets_next_dev(c, &s, NULL);
+ for_each_btree_key_upto(trans, iter,
+ BTREE_ID_need_discard,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx, U64_MAX), 0, k,
+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_discards(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_discards(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
- !queue_work(c->write_ref_wq, &c->discard_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ for_each_member_device(c, ca)
+ bch2_dev_do_discards(ca);
}
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
@@ -1848,68 +1872,69 @@ err:
static void bch2_do_discards_fast_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
+ struct bch_fs *c = ca->fs;
while (1) {
bool got_bucket = false;
- struct bpos bucket;
- struct bch_dev *ca;
-
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i) {
- if (i->snapshot)
- continue;
+ u64 bucket;
- ca = bch2_dev_get_ioref(c, i->inode, WRITE);
- if (!ca) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i) {
+ if (i->in_progress)
continue;
- }
got_bucket = true;
- bucket = *i;
- i->snapshot = true;
+ bucket = i->bucket;
+ i->in_progress = true;
break;
}
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
if (!got_bucket)
break;
if (ca->mi.discard && !c->opts.nochanges)
blkdev_issue_discard(ca->disk_sb.bdev,
- bucket.offset * ca->mi.bucket_size,
+ bucket_to_sector(ca, bucket),
ca->mi.bucket_size,
GFP_KERNEL);
int ret = bch2_trans_do(c, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_bucket_needs_discard(trans, bucket));
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
bch_err_fn(c, ret);
- percpu_ref_put(&ca->io_ref);
- discard_in_flight_remove(c, bucket);
+ discard_in_flight_remove(ca, bucket);
if (ret)
break;
}
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ percpu_ref_put(&ca->io_ref);
}
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
- bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
- rcu_read_unlock();
+ struct bch_fs *c = ca->fs;
+
+ if (discard_in_flight_add(ca, bucket, false))
+ return;
- if (!dead &&
- !discard_in_flight_add(c, bucket) &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
- !queue_work(c->write_ref_wq, &c->discard_fast_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@@ -1957,8 +1982,8 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.data_type = 0;
a->v.dirty_sectors = 0;
a->v.cached_sectors = 0;
- a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
- a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
+ a->v.io_time[READ] = bch2_current_io_time(c, READ);
+ a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
ret = bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
@@ -1993,9 +2018,25 @@ err:
goto out;
}
+static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bch_dev *ca, bool *wrapped)
+{
+ struct bkey_s_c k;
+again:
+ k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ if (!k.k && !*wrapped) {
+ bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
+ *wrapped = true;
+ goto again;
+ }
+
+ return k;
+}
+
static void bch2_do_invalidates_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
+ struct bch_fs *c = ca->fs;
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
@@ -2003,31 +2044,63 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (ret)
goto err;
- for_each_member_device(c, ca) {
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ struct btree_iter iter;
+ bool wrapped = false;
- ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0, 0),
- lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
- BTREE_ITER_intent, k,
- invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0,
+ ((bch2_current_io_time(c, READ) + U32_MAX) &
+ LRU_TIME_MAX)), 0);
- if (ret < 0) {
- bch2_dev_put(ca);
+ while (true) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
break;
- }
+ if (!k.k)
+ break;
+
+ ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&iter);
}
+ bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_invalidates(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->invalidate_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_invalidates(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
- !queue_work(c->write_ref_wq, &c->invalidate_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ for_each_member_device(c, ca)
+ bch2_dev_do_invalidates(ca);
}
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
@@ -2186,7 +2259,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
if (ret)
return ret;
- now = atomic64_read(&c->io_clock[rw].now);
+ now = bch2_current_io_time(c, rw);
if (a->v.io_time[rw] == now)
goto out;
@@ -2343,16 +2416,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_fs_allocator_background_exit(struct bch_fs *c)
+void bch2_dev_allocator_background_exit(struct bch_dev *ca)
+{
+ darray_exit(&ca->discard_buckets_in_flight);
+}
+
+void bch2_dev_allocator_background_init(struct bch_dev *ca)
{
- darray_exit(&c->discard_buckets_in_flight);
+ mutex_init(&ca->discard_buckets_in_flight_lock);
+ INIT_WORK(&ca->discard_work, bch2_do_discards_work);
+ INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
+ INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- mutex_init(&c->discard_buckets_in_flight_lock);
- INIT_WORK(&c->discard_work, bch2_do_discards_work);
- INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
- INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index ae31a94be6f9..ba2c5557a3f0 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
!bch2_bucket_sectors_fragmented(ca, a))
return 0;
- u64 d = bch2_bucket_sectors_dirty(a);
+ /*
+ * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
+ * bucket_sectors_dirty is (much) bigger than bucket_size
+ */
+ u64 d = min(bch2_bucket_sectors_dirty(a),
+ ca->mi.bucket_size);
+
return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
@@ -269,6 +275,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
enum btree_iter_update_trigger_flags);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_dev_do_discards(struct bch_dev *);
void bch2_do_discards(struct bch_fs *);
static inline u64 should_invalidate_buckets(struct bch_dev *ca,
@@ -283,6 +290,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
+void bch2_dev_do_invalidates(struct bch_dev *);
void bch2_do_invalidates(struct bch_fs *);
static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
@@ -306,7 +314,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_fs_allocator_background_exit(struct bch_fs *);
+void bch2_dev_allocator_background_exit(struct bch_dev *);
+void bch2_dev_allocator_background_init(struct bch_dev *);
+
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 927a5f300b30..27d97c22ae27 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -621,13 +621,13 @@ again:
avail = dev_buckets_free(ca, *usage, watermark);
if (usage->d[BCH_DATA_need_discard].buckets > avail)
- bch2_do_discards(c);
+ bch2_dev_do_discards(ca);
if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (!avail) {
if (cl && !waiting) {
@@ -1703,6 +1703,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
+ printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 24);
percpu_down_read(&c->mark_lock);
@@ -1736,6 +1737,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
+ printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 12);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 692b1c7d5018..6d8b1bc90be0 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -434,13 +434,6 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
return ret;
}
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
- return bpos_eq(l.k->p, r.k->p) &&
- bkey_bytes(l.k) == bkey_bytes(r.k) &&
- !memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
struct extents_to_bp_state {
struct bpos bucket_start;
struct bpos bucket_end;
@@ -536,11 +529,8 @@ static int check_bp_exists(struct btree_trans *trans,
struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
- struct bkey_buf tmp;
int ret = 0;
- bch2_bkey_buf_init(&tmp);
-
struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket);
if (!ca) {
prt_str(&buf, "extent for nonexistent device:bucket ");
@@ -565,22 +555,9 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- bch2_bkey_buf_reassemble(&tmp, c, orig_k);
-
- if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
- if (bp.level) {
- bch2_trans_unlock(trans);
- bch2_btree_interior_updates_flush(c);
- }
-
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
- ret = -BCH_ERR_transaction_restart_write_buffer_flush;
- goto out;
- }
+ ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
+ if (ret)
+ goto err;
goto check_existing_bp;
}
@@ -589,7 +566,6 @@ err:
fsck_err:
bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
- bch2_bkey_buf_exit(&tmp, c);
bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
@@ -690,7 +666,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos;
+ struct bpos bucket_pos = POS_MIN;
struct bch_backpointer bp;
if (p.ptr.cached)
@@ -794,6 +770,8 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
!((1U << btree) & btree_interior_mask))
continue;
+ bch2_trans_begin(trans);
+
__for_each_btree_node(trans, iter, btree,
btree == start.btree ? start.pos : POS_MIN,
0, depth, BTREE_ITER_prefetch, b, ret) {
@@ -905,7 +883,7 @@ static int check_one_backpointer(struct btree_trans *trans,
struct bbpos start,
struct bbpos end,
struct bkey_s_c_backpointer bp,
- struct bpos *last_flushed_pos)
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -925,20 +903,18 @@ static int check_one_backpointer(struct btree_trans *trans,
if (ret)
return ret;
- if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
- *last_flushed_pos = bp.k->p;
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
- goto out;
- }
+ if (!k.k) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed);
+ if (ret)
+ goto out;
- if (fsck_err_on(!k.k, c,
- backpointer_to_missing_ptr,
- "backpointer for missing %s\n %s",
- bp.v->level ? "btree node" : "extent",
- (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
- ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
- goto out;
+ if (fsck_err(c, backpointer_to_missing_ptr,
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
+ (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
+ ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
+ goto out;
+ }
}
out:
fsck_err:
@@ -951,14 +927,20 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
- struct bpos last_flushed_pos = SPOS_MAX;
+ struct bkey_buf last_flushed;
- return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
- &last_flushed_pos));
+ &last_flushed));
+
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
+ return ret;
}
int bch2_check_backpointers_to_extents(struct bch_fs *c)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index bc0ea2c4efef..1106fec6e155 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -457,6 +457,7 @@ enum bch_time_stats {
};
#include "alloc_types.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
@@ -488,53 +489,15 @@ enum bch_time_stats {
struct btree;
-enum gc_phase {
- GC_PHASE_NOT_RUNNING,
- GC_PHASE_START,
- GC_PHASE_SB,
-
- GC_PHASE_BTREE_stripes,
- GC_PHASE_BTREE_extents,
- GC_PHASE_BTREE_inodes,
- GC_PHASE_BTREE_dirents,
- GC_PHASE_BTREE_xattrs,
- GC_PHASE_BTREE_alloc,
- GC_PHASE_BTREE_quotas,
- GC_PHASE_BTREE_reflink,
- GC_PHASE_BTREE_subvolumes,
- GC_PHASE_BTREE_snapshots,
- GC_PHASE_BTREE_lru,
- GC_PHASE_BTREE_freespace,
- GC_PHASE_BTREE_need_discard,
- GC_PHASE_BTREE_backpointers,
- GC_PHASE_BTREE_bucket_gens,
- GC_PHASE_BTREE_snapshot_trees,
- GC_PHASE_BTREE_deleted_inodes,
- GC_PHASE_BTREE_logged_ops,
- GC_PHASE_BTREE_rebalance_work,
- GC_PHASE_BTREE_subvolume_children,
-
- GC_PHASE_PENDING_DELETE,
-};
-
-struct gc_pos {
- enum gc_phase phase;
- u16 level;
- struct bpos pos;
-};
-
-struct reflink_gc {
- u64 offset;
- u32 size;
- u32 refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
+struct discard_in_flight {
+ bool in_progress:1;
+ u64 bucket:63;
+};
+
struct bch_dev {
struct kobject kobj;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -596,6 +559,12 @@ struct bch_dev {
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
+ struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct discard_in_flight) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
+
atomic64_t rebalance_work;
struct journal_device journal;
@@ -832,7 +801,8 @@ struct bch_fs {
/* BTREE CACHE */
struct bio_set btree_bio;
- struct workqueue_struct *io_complete_wq;
+ struct workqueue_struct *btree_read_complete_wq;
+ struct workqueue_struct *btree_write_submit_wq;
struct btree_root btree_roots_known[BTREE_ID_NR];
DARRAY(struct btree_root) btree_roots_extra;
@@ -956,11 +926,6 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct invalidate_work;
- struct work_struct discard_work;
- struct mutex discard_buckets_in_flight_lock;
- DARRAY(struct bpos) discard_buckets_in_flight;
- struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct work_struct gc_gens_work;
@@ -1255,6 +1220,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c)
return timespec_to_bch2_time(c, now);
}
+static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
+{
+ return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
+}
+
static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
{
struct stdio_redirect *stdio = c->stdio;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d801e19cb489..e3b1bde489c3 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -476,6 +476,9 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -503,16 +506,22 @@ struct bch_sb_field {
#include "alloc_background_format.h"
#include "extents_format.h"
-#include "reflink_format.h"
#include "ec_format.h"
-#include "inode_format.h"
#include "dirent_format.h"
-#include "xattr_format.h"
-#include "quota_format.h"
+#include "disk_groups_format.h"
+#include "inode_format.h"
+#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
+#include "quota_format.h"
+#include "reflink_format.h"
+#include "replicas_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
+#include "sb-downgrade_format.h"
+#include "sb-errors_format.h"
+#include "sb-members_format.h"
+#include "xattr_format.h"
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -545,107 +554,6 @@ struct bch_sb_field_journal_v2 {
} d[];
};
-/* BCH_SB_FIELD_members_v1: */
-
-#define BCH_MIN_NR_NBUCKETS (1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS() \
- x(seqread, 0) \
- x(seqwrite, 1) \
- x(randread, 2) \
- x(randwrite, 3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
- BCH_IOPS_MEASUREMENTS()
-#undef x
- BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES() \
- x(read, 0) \
- x(write, 1) \
- x(checksum, 2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
- BCH_MEMBER_ERROR_TYPES()
-#undef x
- BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
- __uuid_t uuid;
- __le64 nbuckets; /* device size */
- __le16 first_bucket; /* index of first bucket used */
- __le16 bucket_size; /* sectors */
- __u8 btree_bitmap_shift;
- __u8 pad[3];
- __le64 last_mount; /* time_t */
-
- __le64 flags;
- __le32 iops[4];
- __le64 errors[BCH_MEMBER_ERROR_NR];
- __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
- __le64 errors_reset_time;
- __le64 seq;
- __le64 btree_allocated_bitmap;
- /*
- * On recovery from a clean shutdown we don't normally read the journal,
- * but we still want to resume writing from where we left off so we
- * don't overwrite more than is necessary, for list journal debugging:
- */
- __le32 last_journal_bucket;
- __le32 last_journal_bucket_offset;
-};
-
-/*
- * This limit comes from the bucket_gens array - it's a single allocation, and
- * kernel allocation are limited to INT_MAX
- */
-#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
-
-#define BCH_MEMBER_V1_BYTES 56
-
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
- struct bch_member, flags, 30, 31)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES() \
- x(rw, 0) \
- x(ro, 1) \
- x(failed, 2) \
- x(spare, 3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
- BCH_MEMBER_STATES()
-#undef x
- BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
- struct bch_sb_field field;
- struct bch_member _members[]; //Members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
- struct bch_sb_field field;
- __le16 member_bytes; //size of single member entry
- u8 pad[6];
- struct bch_member _members[];
-};
-
/* BCH_SB_FIELD_crypt: */
struct nonce {
@@ -694,8 +602,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-/* BCH_SB_FIELD_replicas: */
-
#define BCH_DATA_TYPES() \
x(free, 0) \
x(sb, 1) \
@@ -738,50 +644,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
}
}
-struct bch_replicas_entry_v0 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 devs[];
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
- struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 nr_required;
- __u8 devs[];
-} __packed;
-
-#define replicas_entry_bytes(_i) \
- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-struct bch_sb_field_replicas {
- struct bch_sb_field field;
- struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_disk_groups: */
-
-#define BCH_SB_LABEL_SIZE 32
-
-struct bch_disk_group {
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
- struct bch_sb_field field;
- struct bch_disk_group entries[];
-} __packed __aligned(8);
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -809,27 +671,6 @@ struct bch_sb_field_clean {
__u64 _data[];
};
-struct journal_seq_blacklist_entry {
- __le64 start;
- __le64 end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
- struct bch_sb_field field;
- struct journal_seq_blacklist_entry start[];
-};
-
-struct bch_sb_field_errors {
- struct bch_sb_field field;
- struct bch_sb_field_error_entry {
- __le64 v;
- __le64 last_error_time;
- } entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
-
struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
@@ -837,18 +678,6 @@ struct bch_sb_field_ext {
__le64 btrees_lost_data;
};
-struct bch_sb_field_downgrade_entry {
- __le16 version;
- __le64 recovery_passes[2];
- __le16 nr_errors;
- __le16 errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
- struct bch_sb_field field;
- struct bch_sb_field_downgrade_entry entries[];
-};
-
/* Superblock: */
/*
@@ -909,7 +738,6 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
#define BCH_SB_SECTOR 8
-#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
@@ -1162,8 +990,9 @@ enum bch_version_upgrade_opts {
#define BCH_ERROR_ACTIONS() \
x(continue, 0) \
- x(ro, 1) \
- x(panic, 2)
+ x(fix_safe, 1) \
+ x(panic, 2) \
+ x(ro, 3)
enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
@@ -1557,9 +1386,10 @@ enum btree_id {
/*
* Maximum number of btrees that we will _ever_ have under the current scheme,
- * where we refer to them with bitfields
+ * where we refer to them with 64 bit bitfields - and we also need a bit for
+ * the interior btree node type:
*/
-#define BTREE_ID_NR_MAX 64
+#define BTREE_ID_NR_MAX 63
static inline bool btree_id_is_alloc(enum btree_id id)
{
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index f46978e5cb7c..587d7318a2e8 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -660,8 +660,9 @@ int bch2_bkey_format_invalid(struct bch_fs *c,
bch2_bkey_format_field_overflows(f, i)) {
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 packed_max = f->bits_per_field[i]
- ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ unsigned packed_bits = min(64, f->bits_per_field[i]);
+ u64 packed_max = packed_bits
+ ? ~((~0ULL << 1) << (packed_bits - 1))
: 0;
prt_printf(err, "field %u too large: %llu + %llu > %llu",
@@ -1064,7 +1065,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
{
const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
u8 *l = k->key_start;
- u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+ u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
while (l < h) {
swap(*l, *h);
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index fcd43915df07..936357149cf0 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -194,6 +194,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
return bkey_gt(l, r) ? l : r;
}
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index c2c3dae52186..bd32aac05192 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -398,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
for (i = 0; i < nr_compat; i++)
switch (!write ? i : nr_compat - 1 - i) {
case 0:
- if (big_endian != CPU_BIG_ENDIAN)
+ if (big_endian != CPU_BIG_ENDIAN) {
+ bch2_bkey_swab_key(f, k);
+ } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
bch2_bkey_swab_key(f, k);
+ bch2_bkey_swab_key(f, k);
+ }
break;
case 1:
if (version < bcachefs_metadata_version_bkey_renumber)
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 726ef7483763..baef0722f5fb 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
struct bkey_packed *k)
{
if (version < bcachefs_metadata_version_current ||
- big_endian != CPU_BIG_ENDIAN)
+ big_endian != CPU_BIG_ENDIAN ||
+ IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
__bch2_bkey_compat(level, btree_id, version,
big_endian, write, f, k);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 9e4ed75d3675..4f5e411771ba 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -91,10 +91,11 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, hash_val),
- .key_len = sizeof(u64),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .automatic_shrinking = true,
};
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
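
Both this table and the key cache's table below gain .automatic_shrinking = true, which lets the resizable hash table shrink in the background as entries are removed instead of only ever growing. A minimal sketch of the rhashtable API these params plug into (demo names are illustrative):

#include <linux/rhashtable.h>

struct demo_obj {
	u64			key;
	struct rhash_head	node;
};

static const struct rhashtable_params demo_params = {
	.head_offset		= offsetof(struct demo_obj, node),
	.key_offset		= offsetof(struct demo_obj, key),
	.key_len		= sizeof(u64),
	.automatic_shrinking	= true,
};

static int demo_table_use(struct rhashtable *ht, struct demo_obj *obj)
{
	int ret = rhashtable_init(ht, &demo_params);

	if (ret)
		return ret;
	return rhashtable_insert_fast(ht, &obj->node, demo_params);
}
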
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 8035c8b797ab..a0deb8266011 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -585,16 +585,17 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
bkey_version_in_future,
- "key version number higher than recorded: %llu > %llu",
- k.k->version.lo,
- atomic64_read(&c->key_version)))
+ "key version number higher than recorded %llu\n %s",
+ atomic64_read(&c->key_version),
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
atomic64_set(&c->key_version, k.k->version.lo);
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
c, btree_bitmap_not_marked,
"btree ptr not marked in member info btree allocated bitmap\n %s",
- (bch2_bkey_val_to_text(&buf, c, k),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
mutex_lock(&c->sb_lock);
bch2_dev_btree_bitmap_mark(c, k);
@@ -640,16 +641,30 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
target_depth = 0;
/* root */
- mutex_lock(&c->btree_root_lock);
- struct btree *b = bch2_btree_id_root(c, btree)->b;
- if (!btree_node_fake(b)) {
+ do {
+retry_root:
+ bch2_trans_begin(trans);
+
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
+ 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err_root;
+
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
+
gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
- ret = lockrestart_do(trans,
- bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
- NULL, NULL, bkey_i_to_s_c(&b->key), initial));
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
level = b->c.level;
- }
- mutex_unlock(&c->btree_root_lock);
+err_root:
+ bch2_trans_iter_exit(trans, &iter);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
if (ret)
return ret;
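
Marking the root now happens inside a transaction-restart loop instead of under btree_root_lock; this is the standard bcachefs idiom for retrying until an operation completes without a transaction restart, sketched in general form:

/* general shape of the restart loop used above (sketch) */
do {
	bch2_trans_begin(trans);
	ret = some_operation(trans);	/* hypothetical operation */
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
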
@@ -673,8 +688,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
- return (int) btree_id_to_gc_phase(l) -
- (int) btree_id_to_gc_phase(r);
+ return cmp_int(gc_btree_order(l), gc_btree_order(r));
}
static int bch2_gc_btrees(struct bch_fs *c)
@@ -711,7 +725,7 @@ fsck_err:
static int bch2_mark_superblocks(struct bch_fs *c)
{
mutex_lock(&c->sb_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_SB));
+ gc_pos_set(c, gc_phase(GC_PHASE_sb));
int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
mutex_unlock(&c->sb_lock);
@@ -874,6 +888,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
const struct bch_alloc_v4 *old;
int ret;
+ if (!bucket_valid(ca, k.k->p.offset))
+ return 0;
+
old = bch2_alloc_to_v4(k, &old_convert);
gc = new = *old;
@@ -900,6 +917,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
percpu_up_read(&c->mark_lock);
+ gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca);
+
if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
@@ -913,23 +932,19 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
#define copy_bucket_field(_errtype, _f) \
if (fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", \
+ ": got %llu, should be %llu", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
bch2_data_type_str(gc.data_type), \
- new._f, gc._f)) \
+ (u64) new._f, (u64) gc._f)) \
new._f = gc._f; \
- copy_bucket_field(alloc_key_gen_wrong,
- gen);
- copy_bucket_field(alloc_key_dirty_sectors_wrong,
- dirty_sectors);
- copy_bucket_field(alloc_key_cached_sectors_wrong,
- cached_sectors);
- copy_bucket_field(alloc_key_stripe_wrong,
- stripe);
- copy_bucket_field(alloc_key_stripe_redundancy_wrong,
- stripe_redundancy);
+ copy_bucket_field(alloc_key_gen_wrong, gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong, stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
+ copy_bucket_field(alloc_key_fragmentation_lru_wrong, fragmentation_lru);
#undef copy_bucket_field
if (!bch2_alloc_v4_cmp(*old, new))
@@ -943,7 +958,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
a->v = new;
/*
- * The trigger normally makes sure this is set, but we're not running
+ * The trigger normally makes sure these are set, but we're not running
* triggers:
*/
if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
@@ -990,6 +1005,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = ca->mi.nbuckets;
+ buckets->nbuckets_minus_first =
+ buckets->nbuckets - buckets->first_bucket;
rcu_assign_pointer(ca->buckets_gc, buckets);
}
@@ -1003,12 +1020,14 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
continue;
}
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ if (bucket_valid(ca, k.k->p.offset)) {
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
- g->gen_valid = 1;
- g->gen = a->gen;
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
+ g->gen_valid = 1;
+ g->gen = a->gen;
+ }
0;
})));
bch2_dev_put(ca);
@@ -1209,7 +1228,7 @@ int bch2_check_allocations(struct bch_fs *c)
if (ret)
goto out;
- gc_pos_set(c, gc_phase(GC_PHASE_START));
+ gc_pos_set(c, gc_phase(GC_PHASE_start));
ret = bch2_mark_superblocks(c);
BUG_ON(ret);
@@ -1231,7 +1250,7 @@ out:
percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+ __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
bch2_gc_free(c);
percpu_up_write(&c->mark_lock);
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 1b6489d8e0f4..876d81e2017d 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_BTREE_GC_H
#include "bkey.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
int bch2_check_topology(struct bch_fs *);
@@ -32,36 +33,15 @@ int bch2_check_allocations(struct bch_fs *);
/* Position of (the start of) a gc phase: */
static inline struct gc_pos gc_phase(enum gc_phase phase)
{
- return (struct gc_pos) {
- .phase = phase,
- .level = 0,
- .pos = POS_MIN,
- };
-}
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- return cmp_int(l.phase, r.phase) ?:
- -cmp_int(l.level, r.level) ?:
- bpos_cmp(l.pos, r.pos);
-}
-
-static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
-{
- switch (id) {
-#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
- BCH_BTREE_IDS()
-#undef x
- default:
- BUG();
- }
+ return (struct gc_pos) { .phase = phase, };
}
static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
struct bpos pos)
{
return (struct gc_pos) {
- .phase = btree_id_to_gc_phase(btree),
+ .phase = GC_PHASE_btree,
+ .btree = btree,
.level = level,
.pos = pos,
};
@@ -76,6 +56,22 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
}
+static inline int gc_btree_order(enum btree_id btree)
+{
+ if (btree == BTREE_ID_stripes)
+ return -1;
+ return btree;
+}
+
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+ return cmp_int(l.phase, r.phase) ?:
+ cmp_int(gc_btree_order(l.btree),
+ gc_btree_order(r.btree)) ?:
+ -cmp_int(l.level, r.level) ?:
+ bpos_cmp(l.pos, r.pos);
+}
+
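
The rewritten gc_pos_cmp() uses cmp_int() plus GNU C's binary ?: to chain comparisons lexicographically: the first nonzero comparison wins, and negating a term inverts that field's sort order. A user-space sketch of the idiom, with a local stand-in for the kernel's cmp_int():

#include <stdio.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct pos { int phase, btree, level; };

static int pos_cmp(struct pos l, struct pos r)
{
	return  cmp_int(l.phase, r.phase) ?:
		cmp_int(l.btree, r.btree) ?:
		-cmp_int(l.level, r.level);	/* higher level sorts first */
}

int main(void)
{
	struct pos a = { 1, 2, 3 }, b = { 1, 2, 1 };

	printf("%d\n", pos_cmp(a, b));	/* -1: ties until level, a's level is higher */
	return 0;
}
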
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
new file mode 100644
index 000000000000..b82c24bcc088
--- /dev/null
+++ b/fs/bcachefs/btree_gc_types.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_TYPES_H
+#define _BCACHEFS_BTREE_GC_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+enum gc_phase {
+ GC_PHASE_not_running,
+ GC_PHASE_start,
+ GC_PHASE_sb,
+ GC_PHASE_btree,
+};
+
+struct gc_pos {
+ enum gc_phase phase:8;
+ enum btree_id btree:8;
+ u16 level;
+ struct bpos pos;
+};
+
+struct reflink_gc {
+ u64 offset;
+ u32 size;
+ u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
+#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
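
reflink_gc_table is a generic radix tree of fixed-size entries; a hedged sketch of how such a table is typically populated with the genradix helpers (function and flow are illustrative):

static reflink_gc_table demo_table;	/* zero-initialized: an empty tree */

static int demo_ref(u64 idx)
{
	/* allocates intermediate nodes and the entry itself as needed */
	struct reflink_gc *r = genradix_ptr_alloc(&demo_table, idx, GFP_KERNEL);

	if (!r)
		return -ENOMEM;
	r->refcount++;
	return 0;
}
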
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index cbf8f5d90602..7bca15c604f5 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -519,7 +519,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
- struct btree *b, struct bset *i,
+ struct btree *b, struct bset *i, struct bkey_packed *k,
unsigned offset, int write)
{
prt_printf(out, bch2_log_msg(c, "%s"),
@@ -537,15 +537,20 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
b->written, btree_ptr_sectors_written(&b->key));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ if (k)
+ prt_printf(out, " bset byte offset %lu",
+ (unsigned long)(void *)k -
+ ((unsigned long)(void *)i & ~511UL));
prt_str(out, ": ");
}
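
The added "bset byte offset" is the key's distance from the enclosing 512-byte sector boundary, obtained by masking off the low 9 bits of the bset pointer. The align-down trick, standalone:

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0x12345;

	printf("%#lx\n", addr & ~511UL);		/* 0x12200: rounded down to a sector */
	printf("%lu\n", addr - (addr & ~511UL));	/* 325: byte offset within the sector */
	return 0;
}
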
-__printf(9, 10)
+__printf(10, 11)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
+ struct bkey_packed *k,
int write,
bool have_retry,
enum bch_sb_error_id err_type,
@@ -555,7 +560,7 @@ static int __btree_err(int ret,
bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
va_list args;
- btree_err_msg(&out, c, ca, b, i, b->written, write);
+ btree_err_msg(&out, c, ca, b, i, k, b->written, write);
va_start(args, fmt);
prt_vprintf(&out, fmt, args);
@@ -611,9 +616,9 @@ fsck_err:
return ret;
}
-#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
+#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
+ int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
BCH_FSCK_ERR_##_err_type, \
msg, ##__VA_ARGS__); \
\
@@ -690,7 +695,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bch2_version_compatible(version),
-BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_unsupported_version,
"unsupported bset version %u.%u",
BCH_VERSION_MAJOR(version),
@@ -698,7 +703,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(version < c->sb.version_min,
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, NULL,
btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
@@ -711,7 +716,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(BCH_VERSION_MAJOR(version) >
BCH_VERSION_MAJOR(c->sb.version),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, NULL,
btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
@@ -723,13 +728,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
if (btree_err_on(offset + sectors > btree_sectors(c),
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node")) {
i->u64s = 0;
@@ -739,13 +744,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_empty,
"empty bset");
btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_wrong_sector_offset,
"bset at wrong sector offset");
@@ -761,20 +766,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
bset_bad_seq,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_btree,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_level,
"incorrect level");
@@ -793,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_min_key,
"incorrect min_key: got %s should be %s",
(printbuf_reset(&buf1),
@@ -804,7 +809,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_max_key,
"incorrect max key %s",
(printbuf_reset(&buf1),
@@ -816,7 +821,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
-BCH_ERR_btree_node_read_err_bad_node,
- c, ca, b, i,
+ c, ca, b, i, NULL,
btree_node_bad_format,
"invalid bkey format: %s\n %s", buf1.buf,
(printbuf_reset(&buf2),
@@ -883,7 +888,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_past_bset_end,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -892,14 +897,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_bad_format,
"invalid bkey format %u", k->format))
goto drop_this_key;
if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_bad_u64s,
"bad k->u64s %u (min %u max %zu)", k->u64s,
bkeyp_key_u64s(&b->format, k),
@@ -921,7 +926,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_bkey_val_to_text(&buf, c, u.s_c);
btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bad_bkey,
"invalid bkey: %s", buf.buf);
goto drop_this_key;
@@ -942,7 +947,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_bkey_to_text(&buf, u.k);
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bkey_out_of_order,
"%s", buf.buf))
goto drop_this_key;
@@ -1011,13 +1016,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (bch2_meta_read_fault("btree"))
btree_err(-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_magic,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
@@ -1032,7 +1037,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_seq,
"got wrong btree node: got\n%s",
(printbuf_reset(&buf),
@@ -1041,7 +1046,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bad_seq,
"bad btree header: seq 0\n%s",
(printbuf_reset(&buf),
@@ -1060,7 +1065,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_unknown_csum,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
@@ -1073,7 +1078,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(csum_bad,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_bad_csum,
"%s",
(printbuf_reset(&buf),
@@ -1088,7 +1093,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-BCH_ERR_btree_node_read_err_incompatible,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
@@ -1102,7 +1107,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_unknown_csum,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
@@ -1114,7 +1119,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(csum_bad,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_bad_csum,
"%s",
(printbuf_reset(&buf),
@@ -1152,14 +1157,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(blacklisted && first,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
bset_blacklisted_journal_seq,
"first btree node bset has blacklisted journal seq (%llu)",
le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
-BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i,
+ c, ca, b, i, NULL,
first_bset_blacklisted_journal_seq,
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
le64_to_cpu(i->journal_seq),
@@ -1178,7 +1183,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (ptr_written) {
btree_err_on(b->written < ptr_written,
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_data_missing,
"btree node data missing: expected %u sectors, found %u",
ptr_written, b->written);
@@ -1191,7 +1196,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le64_to_cpu(bne->keys.journal_seq),
true),
-BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL,
+ c, ca, b, NULL, NULL,
btree_node_bset_after_end,
"found bset signature after last bset");
}
@@ -1235,7 +1240,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bch2_bkey_val_to_text(&buf, c, u.s_c);
btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i,
+ c, NULL, b, i, k,
btree_node_bad_bkey,
"%s", buf.buf);
@@ -1384,7 +1389,7 @@ static void btree_node_read_endio(struct bio *bio)
bch2_latency_acct(ca, rb->start_time, READ);
}
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
struct btree_node_read_all {
@@ -1471,18 +1476,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
written2 = btree_node_sectors_written(c, ra->buf[i]);
if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_replicas_sectors_written_mismatch,
"btree node sectors written mismatch: %u != %u",
written, written2) ||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_bset_after_end,
"found bset signature after last bset") ||
btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL,
+ c, NULL, b, NULL, NULL,
btree_node_replicas_data_mismatch,
"btree node replicas content mismatch"))
dump_bset_maps = true;
@@ -1651,7 +1656,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
- c->io_complete_wq);
+ c->btree_read_complete_wq);
}
return 0;
@@ -1732,7 +1737,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
if (sync)
btree_node_read_work(&rb->work);
else
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
}
@@ -2224,7 +2229,7 @@ do_write:
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
- queue_work(c->io_complete_wq, &wbio->work);
+ queue_work(c->btree_write_submit_wq, &wbio->work);
return;
err:
set_btree_node_noevict(b);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index d3bcb4e4e230..19352a08ea20 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
- unsigned i;
-
- EBUG_ON(path->btree_id >= BTREE_ID_NR);
- for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
if (!path->l[i].b) {
BUG_ON(!path->cached &&
bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
@@ -251,8 +248,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- BUG_ON(iter->btree_id >= BTREE_ID_NR);
-
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -1001,7 +996,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
- trans->locked = true;
+ trans_set_locked(trans);
if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -3094,7 +3089,8 @@ u32 bch2_trans_begin(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
- trans->locked = true;
+
+ trans_set_locked(trans);
if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
@@ -3135,7 +3131,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
memset(trans, 0, sizeof(*trans));
- closure_init_stack(&trans->ref);
seqmutex_lock(&c->btree_trans_lock);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
@@ -3155,22 +3150,16 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
BUG_ON(pos_task &&
pid == pos_task->pid &&
pos->locked);
-
- if (pos_task && pid < pos_task->pid) {
- list_add_tail(&trans->list, &pos->list);
- goto list_add_done;
- }
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
+
+ list_add(&trans->list, &c->btree_trans_list);
seqmutex_unlock(&c->btree_trans_lock);
got_trans:
trans->c = c;
trans->last_begin_time = local_clock();
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
- trans->locked = true;
trans->journal_replay_not_finished =
unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
@@ -3204,6 +3193,9 @@ got_trans:
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
+ trans_set_locked(trans);
+
+ closure_init_stack_release(&trans->ref);
return trans;
}
@@ -3240,7 +3232,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans_for_each_update(trans, i)
__btree_path_put(trans->paths + i->path, true);
trans->nr_updates = 0;
- trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans);
@@ -3261,6 +3252,13 @@ void bch2_trans_put(struct btree_trans *trans)
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
+ /*
+ * trans->ref protects trans->locking_wait.task and the btree_paths
+ * array; both are used by the cycle detector
+ */
+ closure_return_sync(&trans->ref);
+ trans->locking_wait.task = NULL;
+
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
trans->paths = NULL;
@@ -3278,8 +3276,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3385,8 +3381,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3406,8 +3400,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
bch2_time_stats_exit(&s->lock_hold_times);
}
- if (c->btree_trans_barrier_initialized)
+ if (c->btree_trans_barrier_initialized) {
+ synchronize_srcu_expedited(&c->btree_trans_barrier);
cleanup_srcu_struct(&c->btree_trans_barrier);
+ }
mempool_exit(&c->btree_trans_mem_pool);
mempool_exit(&c->btree_trans_pool);
}
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 75f5e6fe4634..2d3c0d45c37f 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -32,10 +32,11 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
- .head_offset = offsetof(struct bkey_cached, hash),
- .key_offset = offsetof(struct bkey_cached, key),
- .key_len = sizeof(struct bkey_cached_key),
- .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .head_offset = offsetof(struct bkey_cached, hash),
+ .key_offset = offsetof(struct bkey_cached, key),
+ .key_len = sizeof(struct bkey_cached_key),
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .automatic_shrinking = true,
};
__flatten
@@ -424,16 +425,16 @@ static int btree_key_cache_fill(struct btree_trans *trans,
goto err;
}
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ ret = bch2_trans_relock(trans);
+ if (ret) {
kfree(new_k);
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
goto err;
}
- ret = bch2_trans_relock(trans);
- if (ret) {
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
kfree(new_k);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
goto err;
}
}
@@ -840,7 +841,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- freed++;
bc->nr_freed_nonpcpu--;
bc->freed++;
}
@@ -854,7 +854,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- freed++;
bc->nr_freed_pcpu--;
bc->freed++;
}
@@ -876,23 +875,22 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
bc->skipped_dirty++;
- goto next;
} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
bc->skipped_accessed++;
- goto next;
- } else if (bkey_cached_lock_for_evict(ck)) {
+ } else if (!bkey_cached_lock_for_evict(ck)) {
+ bc->skipped_lock_fail++;
+ } else {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
bc->moved_to_freelist++;
- } else {
- bc->skipped_lock_fail++;
+ freed++;
}
scanned++;
if (scanned >= nr)
break;
-next:
+
pos = next;
}
@@ -917,6 +915,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
+ /*
+ * Avoid hammering our shrinker too much when the cache is nearly empty:
+ * the shrinker code doesn't take into account how big our cache is, and
+ * if it's mostly empty but the system is under memory pressure, that
+ * causes nasty lock contention:
+ */
+ nr -= 128;
+
return max(0L, nr);
}
@@ -1025,9 +1031,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
bc->shrink = shrink;
- shrink->seeks = 0;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->batch = 1 << 14;
+ shrink->seeks = 0;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
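
This uses the shrinker_alloc()/shrinker_register() API; batch = 1 << 14 asks the core shrinker code to hand us work in large chunks, and seeks = 0 marks the objects as cheap to recreate. A minimal sketch of that registration flow (callback bodies omitted, names illustrative):

#include <linux/shrinker.h>

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc);

static int demo_shrinker_register(void *private)
{
	struct shrinker *s = shrinker_alloc(0, "demo-shrinker");

	if (!s)
		return -ENOMEM;

	s->count_objects = demo_count;
	s->scan_objects	 = demo_scan;
	s->batch	 = 1 << 14;
	s->seeks	 = 0;
	s->private_data	 = private;

	shrinker_register(s);
	return 0;
}
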
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index c3e9b0cc7bbd..c51826fd557f 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -215,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
if (unlikely(!best)) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
@@ -230,7 +231,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
prt_newline(&buf);
}
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
printbuf_exit(&buf);
BUG();
}
@@ -791,7 +792,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
return bch2_trans_relock_fail(trans, path, &f, trace);
}
- trans->locked = true;
+ trans_set_locked(trans);
out:
bch2_trans_verify_locks(trans);
return 0;
@@ -811,16 +812,14 @@ void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
__bch2_trans_unlock(trans);
- trans->locked = false;
- trans->last_unlock_ip = _RET_IP_;
+ trans_set_unlocked(trans);
}
void bch2_trans_unlock(struct btree_trans *trans)
{
__bch2_trans_unlock(trans);
- trans->locked = false;
- trans->last_unlock_ip = _RET_IP_;
+ trans_set_unlocked(trans);
}
void bch2_trans_unlock_long(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 7f41545b9147..75a6274c7d27 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -193,6 +193,28 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
/* lock: */
+static inline void trans_set_locked(struct btree_trans *trans)
+{
+ if (!trans->locked) {
+ trans->locked = true;
+ trans->last_unlock_ip = 0;
+
+ trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
+ current->flags |= PF_MEMALLOC_NOFS;
+ }
+}
+
+static inline void trans_set_unlocked(struct btree_trans *trans)
+{
+ if (trans->locked) {
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
+
+ if (!trans->pf_memalloc_nofs)
+ current->flags &= ~PF_MEMALLOC_NOFS;
+ }
+}
+
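
trans_set_locked()/trans_set_unlocked() hand-roll a save/restore of PF_MEMALLOC_NOFS across the locked region, keyed off trans->locked so the calls stay balanced. The core kernel exposes the same pattern as memalloc_nofs_save()/memalloc_nofs_restore(); an equivalent sketch with those helpers:

#include <linux/sched/mm.h>

static void demo_locked_section(void)
{
	/* allocations in this section implicitly behave as GFP_NOFS */
	unsigned int flags = memalloc_nofs_save();

	/* ... take btree locks, do work ... */

	memalloc_nofs_restore(flags);	/* restore previous PF_MEMALLOC_NOFS state */
}
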
static inline int __btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type,
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 45cb8149d374..2cb0442f6cc9 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -72,10 +72,11 @@ static bool found_btree_node_is_readable(struct btree_trans *trans,
struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
bool ret = !IS_ERR_OR_NULL(b);
- if (ret) {
- f->sectors_written = b->written;
- six_unlock_read(&b->c.lock);
- }
+ if (!ret)
+ return ret;
+
+ f->sectors_written = b->written;
+ six_unlock_read(&b->c.lock);
/*
* We might update this node's range; if that happens, we need the node
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index d63db4fefe73..48cb1a7d31c5 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -484,6 +484,7 @@ struct btree_trans {
bool lock_may_not_fail:1;
bool srcu_held:1;
bool locked:1;
+ bool pf_memalloc_nofs:1;
bool write_locked:1;
bool used_mempool:1;
bool in_traverse_all:1;
@@ -761,13 +762,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
BCH_BTREE_IDS()
#undef x
;
- return (1U << type) & mask;
+ return BIT_ULL(type) & mask;
}
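
Widening these masks matters once a btree ID or node type can reach 32: 1U << id is undefined behaviour for shifts of 32 or more, and a 32-bit mask cannot represent the flag bits for those IDs at all, while BIT_ULL() keeps everything in a well-defined u64. Standalone:

#include <stdio.h>
#include <stdint.h>

#define BIT_ULL(nr)	(1ULL << (nr))

int main(void)
{
	uint64_t mask = BIT_ULL(40) | BIT_ULL(3);

	printf("id 3 set:  %d\n", !!(mask & BIT_ULL(3)));	/* 1 */
	printf("id 40 set: %d\n", !!(mask & BIT_ULL(40)));	/* 1: needs the 64-bit mask */
	/* with a 32-bit mask, 1U << 40 would be undefined behaviour */
	return 0;
}
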
static inline bool btree_id_is_extents(enum btree_id btree)
@@ -777,35 +778,35 @@ static inline bool btree_id_is_extents(enum btree_id btree)
static inline bool btree_type_has_snapshots(enum btree_id id)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(id) & mask;
}
static inline bool btree_type_has_snapshot_field(enum btree_id id)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(id) & mask;
}
static inline bool btree_type_has_ptrs(enum btree_id id)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(id) & mask;
}
struct btree_root {
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 75c8a196b3f6..d0e92d948002 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
+#include "extents.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
@@ -492,6 +494,41 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
return ret;
}
+/**
+ * bch2_btree_write_buffer_maybe_flush - flush the write buffer if the
+ * referring key hasn't been checked against one yet
+ *
+ * In check and repair code, when checking references to write buffer btrees we
+ * need to issue a flush before we have a definitive error: this issues a flush
+ * if this is a key we haven't yet checked.
+ */
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf tmp;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
+ bch2_bkey_buf_reassemble(&tmp, c, referring_k);
+
+ if (bkey_is_btree_ptr(referring_k.k)) {
+ bch2_trans_unlock(trans);
+ bch2_btree_interior_updates_flush(c);
+ }
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ bch2_bkey_buf_copy(last_flushed, c, tmp.k);
+ ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+ }
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
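
Fsck-style callers are expected to keep a bkey_buf of the last key they triggered a flush for and loop on the restart error. A hedged sketch of the intended calling convention (demo_backpointer_exists is illustrative, not a real helper):

static bool demo_backpointer_exists(struct btree_trans *, struct bkey_s_c);

static int demo_check_ref(struct btree_trans *trans, struct bkey_s_c k,
			  struct bkey_buf *last_flushed)
{
	if (!demo_backpointer_exists(trans, k)) {
		/* maybe the write buffer just hasn't been flushed yet: */
		int ret = bch2_btree_write_buffer_maybe_flush(trans, k, last_flushed);
		if (ret)
			return ret;	/* includes the transaction restart error */

		/* a flush was already done for this key - now it's a real error */
	}
	return 0;
}
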
static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index eebcd2b15249..dd5e64218b50 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -23,6 +23,9 @@ int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+struct bkey_buf;
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
+
struct journal_keys_to_wb {
struct btree_write_buffer_keys *wb;
size_t room;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index b469586517a8..314ee3e0187f 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -465,143 +465,172 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64
return bch2_update_replicas_list(trans, &r.e, sectors);
}
-int bch2_check_fix_ptrs(struct btree_trans *trans,
- enum btree_id btree, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
+static int bch2_check_fix_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ bool *do_update)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
- percpu_down_read(&c->mark_lock);
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (!ca) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ return 0;
+ }
- bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (!ca) {
- if (fsck_err(c, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
- continue;
- }
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ if (!g) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to invalid bucket on device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ goto out;
+ }
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
- if (fsck_err_on(!g->gen_valid,
- c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- do_update = true;
- }
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ *do_update = true;
}
+ }
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- (g->data_type != BCH_DATA_btree ||
- data_type == BCH_DATA_btree)) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- do_update = true;
- }
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached &&
+ (g->data_type != BCH_DATA_btree ||
+ data_type == BCH_DATA_btree)) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
+ }
+ }
+
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ goto out;
+
+ if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
+ c, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = data_type;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
}
+ }
+
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ if (fsck_err_on(!m || !m->alive,
+ c, ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
+ (u64) p.ec.idx,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
+ *do_update = true;
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
+ c, ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
+ (u64) p.ec.idx,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
+ *do_update = true;
+ }
+out:
+fsck_err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
+}
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- goto next;
+int bch2_check_fix_ptrs(struct btree_trans *trans,
+ enum btree_id btree, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
- c, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = data_type;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- do_update = true;
- }
- }
+ percpu_down_read(&c->mark_lock);
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive, c,
- ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
- ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
- }
-next:
- bch2_dev_put(ca);
+ bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
+ ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
+ if (ret)
+ goto err;
}
if (do_update) {
@@ -716,7 +745,6 @@ found:
bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
}
err:
-fsck_err:
percpu_up_read(&c->mark_lock);
printbuf_exit(&buf);
return ret;
@@ -777,7 +805,7 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- *bucket_gen(ca, bucket_nr),
+ bucket_gen_get(ca, bucket_nr),
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
@@ -987,6 +1015,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_fs *c = trans->c;
@@ -1019,6 +1048,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ p.ptr.dev,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ goto err_unlock;
+ }
+
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
@@ -1027,10 +1063,12 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
bucket_unlock(g);
+err_unlock:
percpu_up_read(&c->mark_lock);
}
err:
bch2_dev_put(ca);
+ printbuf_exit(&buf);
return ret;
}
@@ -1134,7 +1172,7 @@ static int __trigger_extent(struct btree_trans *trans,
r.e.nr_required = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors;
+ s64 disk_sectors = 0;
ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
if (ret < 0)
return ret;
@@ -1318,10 +1356,11 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 b, enum bch_data_type data_type, unsigned sectors,
enum btree_iter_update_trigger_flags flags)
{
- int ret = 0;
-
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, b);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
+ ca->dev_idx, bch2_data_type_str(data_type)))
+ goto err_unlock;
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
@@ -1330,29 +1369,27 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type))) {
- ret = -EIO;
+ bch2_data_type_str(data_type)))
goto err;
- }
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors)) {
- ret = -EIO;
+ g->dirty_sectors, sectors))
goto err;
- }
g->data_type = data_type;
g->dirty_sectors += sectors;
struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return 0;
err:
bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+err_unlock:
percpu_up_read(&c->mark_lock);
- return ret;
+ return -EIO;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
@@ -1595,6 +1632,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
if (resize) {
down_write(&c->gc_lock);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 617ffde2fb7a..8ad4be73860c 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -93,7 +93,8 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = gc_bucket_array(ca);
- BUG_ON(!bucket_valid(ca, b));
+ if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
+ return NULL;
return buckets->b + b;
}
@@ -110,10 +111,19 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
{
struct bucket_gens *gens = bucket_gens(ca);
- BUG_ON(!bucket_valid(ca, b));
+ if (b - gens->first_bucket >= gens->nbuckets_minus_first)
+ return NULL;
return gens->b + b;
}
+static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b)
+{
+ rcu_read_lock();
+ u8 gen = *bucket_gen(ca, b);
+ rcu_read_unlock();
+ return gen;
+}
+
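
bucket_gens is an RCU-protected array, so readers bracket the lookup with rcu_read_lock() and bounds-check against the precomputed nbuckets_minus_first. The general shape of the pattern, sketched with illustrative names (the pointer is assumed to be published with rcu_assign_pointer()):

#include <linux/rcupdate.h>

struct demo_gens {
	struct rcu_head	rcu;
	size_t		nr;
	u8		b[];
};

static struct demo_gens __rcu *demo_gens_ptr;

static int demo_gen_get(size_t idx)
{
	rcu_read_lock();
	struct demo_gens *g = rcu_dereference(demo_gens_ptr);
	int ret = idx < g->nr ? g->b[idx] : -1;	/* -1: out of range, like dev_ptr_stale_rcu() */
	rcu_read_unlock();
	return ret;
}
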
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
@@ -170,19 +180,22 @@ static inline int gen_after(u8 a, u8 b)
return r > 0 ? r : 0;
}
-static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr));
+ if (!gen)
+ return -1;
+ return gen_after(*gen, ptr->gen);
}
/**
* dev_ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
-static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
rcu_read_lock();
- u8 ret = dev_ptr_stale_rcu(ca, ptr);
+ int ret = dev_ptr_stale_rcu(ca, ptr);
rcu_read_unlock();
return ret;
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 6a31740222a7..f636e17c4caf 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -22,6 +22,7 @@ struct bucket_array {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
+ size_t nbuckets_minus_first;
struct bucket b[];
};
@@ -29,6 +30,7 @@ struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
+ size_t nbuckets_minus_first;
u8 b[];
};
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 9e54323f0f5f..6d82e1165adc 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -216,7 +216,8 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
ret = PTR_ERR_OR_ZERO(optstr) ?:
bch2_parse_mount_opts(NULL, &thr->opts, optstr);
- kfree(optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
@@ -319,7 +320,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
return ret;
ret = bch2_dev_add(c, path);
- kfree(path);
+ if (!IS_ERR(path))
+ kfree(path);
return ret;
}
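
The bug fixed across these chardev.c hunks: the string comes from a strndup_user()-style helper that returns an ERR_PTR on failure, and kfree() tolerates NULL but not ERR_PTRs, so the free must be guarded. The safe idiom, sketched (demo_parse is illustrative):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

static long demo_parse(const char *optstr);

static long demo_ioctl_opts(const char __user *uptr)
{
	char *optstr = strndup_user(uptr, PAGE_SIZE);	/* ERR_PTR on failure, never NULL */
	long ret = PTR_ERR_OR_ZERO(optstr);

	if (!ret)
		ret = demo_parse(optstr);

	if (!IS_ERR(optstr))	/* kfree() handles NULL, not ERR_PTRs */
		kfree(optstr);
	return ret;
}
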
@@ -850,7 +852,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
ret = PTR_ERR_OR_ZERO(optstr) ?:
bch2_parse_mount_opts(c, &thr->opts, optstr);
- kfree(optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 363644451106..0f40b585ce2b 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -132,14 +132,9 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
{
struct io_timer *ret = NULL;
- spin_lock(&clock->timer_lock);
-
if (clock->timers.used &&
time_after_eq(now, clock->timers.data[0]->expire))
heap_pop(&clock->timers, ret, io_timer_cmp, NULL);
-
- spin_unlock(&clock->timer_lock);
-
return ret;
}
@@ -148,8 +143,10 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
struct io_timer *timer;
unsigned long now = atomic64_add_return(sectors, &clock->now);
+ spin_lock(&clock->timer_lock);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
+ spin_unlock(&clock->timer_lock);
}
void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0d807c2ce9c6..0087b8555ead 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -5,7 +5,9 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "compress.h"
#include "data_update.h"
+#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
@@ -202,9 +204,8 @@ restart_drop_conflicting_replicas:
bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
/* Now, drop excess replicas: */
-restart_drop_extra_replicas:
-
rcu_read_lock();
+restart_drop_extra_replicas:
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
@@ -455,6 +456,38 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
}
}
+void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target:\t");
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression:\t");
+ bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ prt_newline(out);
+
+ prt_str(out, "extra replicas:\t");
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
+{
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+ prt_newline(out);
+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+}
+
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -644,6 +677,16 @@ int bch2_data_update_init(struct btree_trans *trans,
if (!(durability_have + durability_removing))
m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+ if (!m->op.nr_replicas) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_data_update_to_text(&buf, m);
+ WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_data_update_done;
+ goto done;
+ }
+
m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 991095bbd469..8d36365bdea8 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -17,6 +17,9 @@ struct data_update_opts {
unsigned write_flags;
};
+void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
+ struct bch_io_opts *, struct data_update_opts *);
+
struct data_update {
/* extent being updated: */
enum btree_id btree_id;
@@ -27,6 +30,8 @@ struct data_update {
struct bch_write_op op;
};
+void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+
int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 51cbf3928361..ebabab171fe5 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
+typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
+
+static void list_sort(struct list_head *head, list_cmp_fn cmp)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, head)
+ while (!list_is_last(pos, head) &&
+ cmp(pos, pos->next) > 0) {
+ struct list_head *pos2, *next = pos->next;
+
+ list_del(next);
+ list_for_each(pos2, head)
+ if (cmp(next, pos2) < 0)
+ goto pos_found;
+ BUG();
+pos_found:
+ list_add_tail(next, pos2);
+ }
+}
+
+static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
+{
+ return cmp_int(l, r);
+}
+
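
Why sort by pointer value at all: once the list is ordered by address, (ulong) trans becomes a stable, strictly increasing cursor, so the read loop below can drop the seqmutex, flush to userspace, and on relock resume past everything already emitted. A standalone sketch of that resume-by-key idea (an array stands in for the list; all names illustrative):

#include <stdio.h>

/* 'cursor' holds the last key emitted; after the lock is retaken (and
 * the list possibly re-sorted), skip everything at or below it. */
static void emit_from(const unsigned long *keys, int n, unsigned long *cursor)
{
	for (int i = 0; i < n; i++) {
		if (keys[i] <= *cursor)
			continue;
		*cursor = keys[i];
		printf("visit %#lx\n", keys[i]);
		/* ...the lock may be dropped and the walk restarted here... */
	}
}

int main(void)
{
	unsigned long keys[] = { 0x10, 0x28, 0x40 }; /* sorted, like the list */
	unsigned long cursor = 0;

	emit_from(keys, 3, &cursor); /* full pass */
	emit_from(keys, 3, &cursor); /* a resumed pass emits nothing new */
	return 0;
}
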
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -575,41 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
struct bch_fs *c = i->c;
struct btree_trans *trans;
ssize_t ret = 0;
- u32 seq;
i->ubuf = buf;
i->size = size;
i->ret = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= i->iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ i->iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto unlocked;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
+
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
prt_printf(&i->buf, "backtrace:\n");
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = task->pid;
-
closure_put(&trans->ref);
+ ret = flush_buf(i);
+ if (ret)
+ goto unlocked;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
@@ -804,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = {
.read = btree_transaction_stats_read,
};
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+/* walk btree transactions until we find a deadlock and print it */
+static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
struct btree_trans *trans;
- ssize_t ret = 0;
- u32 seq;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (i->iter)
- goto out;
+ ulong iter = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto out;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
- bch2_check_for_deadlock(trans, &i->buf);
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
- i->iter = task->pid;
+ bool found = bch2_check_for_deadlock(trans, out) != 0;
closure_put(&trans->ref);
+ if (found)
+ return;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
seqmutex_unlock(&c->btree_trans_lock);
-out:
+}
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ btree_deadlock_to_text(&i->buf, c);
+ i->iter++;
+ }
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
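
The rewritten read side is a one-shot pattern: the first read generates the whole deadlock report into i->buf and bumps i->iter, so subsequent reads only drain what is already buffered. A small userspace sketch of the shape (hypothetical names):

#include <stdio.h>
#include <string.h>

struct dump { char buf[256]; size_t pos; int generated; };

static size_t dump_read(struct dump *d, char *out, size_t len)
{
	if (!d->generated) {	/* i->iter == 0: first read generates all */
		snprintf(d->buf, sizeof(d->buf), "deadlock report...\n");
		d->generated = 1;	/* i->iter++ */
	}

	size_t n = strlen(d->buf) - d->pos;
	if (n > len)
		n = len;
	memcpy(out, d->buf + d->pos, n);
	d->pos += n;
	return n;		/* 0 => EOF */
}

int main(void)
{
	struct dump d = { .pos = 0 };
	char out[64];
	size_t n;

	while ((n = dump_read(&d, out, sizeof(out))) > 0)
		fwrite(out, 1, n, stdout);
	return 0;
}
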
diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h
new file mode 100644
index 000000000000..698990bbf1d2
--- /dev/null
+++ b/fs/bcachefs/disk_groups_format.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
+#define _BCACHEFS_DISK_GROUPS_FORMAT_H
+
+#define BCH_SB_LABEL_SIZE 32
+
+struct bch_disk_group {
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 flags[2];
+} __packed __aligned(8);
+
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+ struct bch_sb_field field;
+ struct bch_disk_group entries[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
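
LE64_BITMASK(NAME, type, field, start, end) generates get/set accessors for bits [start, end) of a little-endian field. A simplified sketch of what the BCH_GROUP_PARENT accessors (bits 6..24 of flags[0]) work out to, with the __le64 byte-swapping omitted:

#include <stdint.h>
#include <stdio.h>

#define GROUP_PARENT_START 6
#define GROUP_PARENT_END   24

static uint64_t group_parent_get(uint64_t flags)
{
	return (flags << (64 - GROUP_PARENT_END)) >>
	       (64 - GROUP_PARENT_END + GROUP_PARENT_START);
}

static uint64_t group_parent_set(uint64_t flags, uint64_t v)
{
	uint64_t mask = (~0ULL >> (64 - (GROUP_PARENT_END - GROUP_PARENT_START)))
			<< GROUP_PARENT_START;

	return (flags & ~mask) | ((v << GROUP_PARENT_START) & mask);
}

int main(void)
{
	uint64_t flags = group_parent_set(0, 1234);

	printf("parent = %llu\n", (unsigned long long) group_parent_get(flags));
	return 0;
}
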
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index b26dc7424662..83e279d41829 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -268,6 +268,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
@@ -289,6 +290,13 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ ptr->dev,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -EIO;
+ goto err_unlock;
+ }
+
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
@@ -297,10 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
bucket_unlock(g);
+err_unlock:
percpu_up_read(&c->mark_lock);
}
err:
bch2_dev_put(ca);
+ printbuf_exit(&buf);
return ret;
}
@@ -714,10 +724,12 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
- if (dev_ptr_stale(ca, ptr)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
bch_err_ratelimited(ca->fs,
- "error %s stripe: stale pointer after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ "error %s stripe: stale/invalid pointer (%i) after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to",
+ stale);
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
@@ -743,10 +755,12 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
return;
}
- if (dev_ptr_stale(ca, ptr)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
bch_err_ratelimited(c,
- "error %s stripe: stale pointer",
- rw == READ ? "reading from" : "writing to");
+ "error %s stripe: stale pointer (%i)",
+ rw == READ ? "reading from" : "writing to",
+ stale);
clear_bit(idx, buf->valid);
return;
}
@@ -908,7 +922,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
- if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+ if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index dbe35b80bc0b..58612abf7927 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -116,6 +116,9 @@
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
+ x(EEXIST, EEXIST_str_hash_set) \
+ x(EEXIST, EEXIST_discard_in_flight_add) \
+ x(EEXIST, EEXIST_subvolume_create) \
x(0, open_buckets_empty) \
x(0, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
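
These x() entries extend the private error-code table: each code records exactly where an error originated (see -BCH_ERR_EEXIST_subvolume_create in the fs-ioctl.c hunk below) while bch2_err_class() folds it back to the base errno before it reaches userspace. A toy sketch of the scheme (flat table here; the real one supports chained classes):

#include <errno.h>
#include <stdio.h>

enum {
	MYFS_ERR_START = 2048,			/* above all standard errnos */
	MYFS_ERR_EEXIST_subvolume_create,	/* class: EEXIST */
	MYFS_ERR_EEXIST_str_hash_set,		/* class: EEXIST */
};

static int err_class(int err)	/* fold private code -> base errno */
{
	switch (err) {
	case MYFS_ERR_EEXIST_subvolume_create:
	case MYFS_ERR_EEXIST_str_hash_set:
		return EEXIST;
	default:
		return err;
	}
}

int main(void)
{
	int ret  = -MYFS_ERR_EEXIST_subvolume_create;	/* internal, specific */
	int user = -err_class(-ret);			/* what userspace sees */

	printf("internal %d -> userspace %d (EEXIST == %d)\n", ret, user, EEXIST);
	return 0;
}
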
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index c66eeffcd7f2..d95c40f1b6af 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
switch (c->opts.errors) {
case BCH_ON_ERROR_continue:
return false;
+ case BCH_ON_ERROR_fix_safe:
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
@@ -191,6 +192,12 @@ static void prt_actioning(struct printbuf *out, const char *action)
prt_str(out, "ing");
}
+static const u8 fsck_flags_extra[] = {
+#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
+ BCH_SB_ERRS()
+#undef x
+};
+
int bch2_fsck_err(struct bch_fs *c,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
@@ -203,6 +210,9 @@ int bch2_fsck_err(struct bch_fs *c,
int ret = -BCH_ERR_fsck_ignore;
const char *action_orig = "fix?", *action = action_orig;
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ flags |= fsck_flags_extra[err];
+
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix;
@@ -265,7 +275,14 @@ int bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ if ((flags & FSCK_CAN_FIX) &&
+ (flags & FSCK_AUTOFIX) &&
+ (c->opts.errors == BCH_ON_ERROR_continue ||
+ c->opts.errors == BCH_ON_ERROR_fix_safe)) {
+ prt_str(out, ", ");
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
+ } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down");
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 36caedf72d89..777711504c35 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -108,13 +108,6 @@ struct fsck_err_state {
char *last_msg;
};
-enum bch_fsck_flags {
- FSCK_CAN_FIX = 1 << 0,
- FSCK_CAN_IGNORE = 1 << 1,
- FSCK_NEED_FSCK = 1 << 2,
- FSCK_NO_RATELIMIT = 1 << 3,
-};
-
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
__printf(4, 5) __cold
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 469037929685..410b8bd81b5a 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr)))
+ if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
@@ -999,7 +999,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
(ca = bch2_dev_rcu(c, ptr->dev)) &&
- dev_ptr_stale_rcu(ca, ptr));
+ dev_ptr_stale_rcu(ca, ptr) > 0);
rcu_read_unlock();
return bkey_deleted(k.k);
@@ -1024,8 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
prt_str(out, " cached");
if (ptr->unwritten)
prt_str(out, " unwritten");
- if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr))
+ int stale = dev_ptr_stale_rcu(ca, ptr);
+ if (stale > 0)
prt_printf(out, " stale");
+ else if (stale)
+ prt_printf(out, " invalid");
}
rcu_read_unlock();
--out->atomic;
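
This hunk, together with the ec.c and io_write.c changes, relies on dev_ptr_stale_rcu() now returning an int instead of a bool: positive means the bucket generation has moved past the pointer's (stale), negative means the pointer does not resolve to a valid bucket at all, zero means live. A hedged sketch of that convention (the bucket lookup and gen comparison are simplified stand-ins):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* >0: stale (gen moved on), <0: invalid bucket, 0: pointer is live */
static int ptr_stale_sketch(const uint8_t *bucket_gens, size_t nbuckets,
			    size_t bucket, uint8_t ptr_gen)
{
	if (bucket >= nbuckets)
		return -1;				 /* no such bucket */
	return (int8_t) (bucket_gens[bucket] - ptr_gen); /* wrapping compare */
}

int main(void)
{
	uint8_t gens[4] = { 3, 3, 3, 3 };

	printf("%d %d %d\n",
	       ptr_stale_sketch(gens, 4, 1, 3),		/* 0: live */
	       ptr_stale_sketch(gens, 4, 1, 2),		/* 1: stale */
	       ptr_stale_sketch(gens, 4, 9, 3));	/* -1: invalid */
	return 0;
}

Callers that only drop cached pointers test > 0, as bch2_extent_normalize now does; callers that must flag corruption also act on the negative case.
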
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 24840aee335c..795f4fc0bab1 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -48,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i)
static inline unsigned eytzinger1_first(unsigned size)
{
- return rounddown_pow_of_two(size);
+ return size ? rounddown_pow_of_two(size) : 0;
}
static inline unsigned eytzinger1_last(unsigned size)
@@ -101,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
static inline unsigned eytzinger1_extra(unsigned size)
{
- return (size + 1 - rounddown_pow_of_two(size)) << 1;
+ return size
+ ? (size + 1 - rounddown_pow_of_two(size)) << 1
+ : 0;
}
static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
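
Context for the new size == 0 guards: in the 1-indexed Eytzinger (BFS) layout the in-order first element sits at the largest power of two <= size, and rounddown_pow_of_two(0) is undefined (it bottoms out in ilog2(0)), so an empty array previously invoked undefined behaviour. A tiny check of the rule:

#include <assert.h>
#include <stdio.h>

/* Leftmost node of a 1-indexed BFS-order tree: follow left children
 * (i -> 2i) from the root until the next step would leave the array. */
static unsigned eytzinger1_first_sketch(unsigned size)
{
	unsigned i = 1;

	if (!size)
		return 0;	/* the guard this patch adds */
	while (i * 2 <= size)
		i *= 2;
	return i;		/* == rounddown_pow_of_two(size) */
}

int main(void)
{
	assert(eytzinger1_first_sketch(10) == 8);
	assert(eytzinger1_first_sketch(1) == 1);
	assert(eytzinger1_first_sketch(0) == 0);
	printf("ok\n");
	return 0;
}
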
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 6b69e5cd68dd..54873ecc635c 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -437,8 +437,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
*/
/*
- * PageWriteback is effectively our ref on the inode - fixup i_blocks
- * before calling end_page_writeback:
+ * The writeback flag is effectively our ref on the inode -
+ * fixup i_blocks before calling folio_end_writeback:
*/
bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
@@ -898,7 +898,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_for_each(fs, fi) {
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
- f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+ f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
if (!f_copied) {
folios_trunc(&fs, fi);
break;
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 09d21aef879a..049b61bc9a5b 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -609,8 +609,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
if (unlikely(ret))
goto err_put_write_ref;
- if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+ ret = -EINVAL;
goto err_put_write_ref;
+ }
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 205a323ffc6d..79a0c8732bce 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -308,8 +308,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
return ret;
}
-static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
{
struct inode *dir;
struct bch_inode_info *inode;
@@ -373,7 +373,7 @@ retry:
}
if (dst_dentry->d_inode) {
- error = -EEXIST;
+ error = -BCH_ERR_EEXIST_subvolume_create;
goto err3;
}
@@ -406,9 +406,12 @@ retry:
!arg.src_ptr)
snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
+ down_write(&c->snapshot_create_lock);
inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
dst_dentry, arg.mode|S_IFDIR,
0, snapshot_src, create_flags);
+ up_write(&c->snapshot_create_lock);
+
error = PTR_ERR_OR_ZERO(inode);
if (error)
goto err3;
@@ -429,16 +432,6 @@ err1:
return error;
}
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- down_write(&c->snapshot_create_lock);
- long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
- up_write(&c->snapshot_create_lock);
-
- return ret;
-}
-
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 96040a95cf46..0c7d1bc0548a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -188,6 +188,18 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
BUG_ON(!old);
if (unlikely(old != inode)) {
+ /*
+ * bcachefs doesn't use I_NEW; we have no use for it since we
+ * only insert fully created inodes in the inode hash table. But
+ * discard_new_inode() expects it to be set...
+ */
+ inode->v.i_flags |= I_NEW;
+ /*
+ * We don't want bch2_evict_inode() to delete the inode on disk,
+ * we just raced and had another inode in cache. Normally new
+ * inodes don't have nlink == 0 - except tmpfiles do...
+ */
+ set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
inode = old;
} else {
@@ -195,8 +207,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
+ * Again, I_NEW makes no sense for bcachefs. The call below is
+ * only needed to clear I_NEW - the inode was already fully
+ * created and initialized, so we never wanted inode_insert5()
+ * to set it for us in the first place.
*/
unlock_new_inode(&inode->v);
}
@@ -227,8 +241,9 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
mutex_init(&inode->ei_update_lock);
two_state_lock_init(&inode->ei_pagecache_lock);
INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ inode->ei_flags = 0;
mutex_init(&inode->ei_quota_lock);
- inode->v.i_state = 0;
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
kmem_cache_free(bch2_inode_cache, inode);
@@ -1155,6 +1170,7 @@ static const struct file_operations bch_file_operations = {
.read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
.mmap = bch2_mmap,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
@@ -1486,11 +1502,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
- if (BCH_SUBVOLUME_SNAP(subvol))
- set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
- else
- clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev;
@@ -1502,6 +1513,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_qid = bch_qid(bi);
inode->ei_subvol = inum.subvol;
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
inode->v.i_mapping->a_ops = &bch_address_space_operations;
switch (inode->v.i_mode & S_IFMT) {
@@ -1939,8 +1953,7 @@ got_sb:
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
- ret = bch2_err_class(ret);
- return ERR_PTR(ret);
+ goto err;
}
c = sb->s_fs_info;
@@ -1968,6 +1981,7 @@ got_sb:
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
sb->s_uuid = c->sb.user_uuid;
+ sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -2016,6 +2030,17 @@ out:
err_put_super:
__bch2_fs_stop(c);
deactivate_locked_super(sb);
+err:
+ if (ret)
+ pr_err("error: %s", bch2_err_str(ret));
+ /*
+ * On an inconsistency error in recovery we might see an -EROFS-derived
+ * error code (from the journal), but we don't want to return that to
+ * userspace as that causes util-linux to retry the mount RO - which is
+ * confusing:
+ */
+ if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
+ ret = -EIO;
return ERR_PTR(bch2_err_class(ret));
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index c8f57465131c..921bcdb3e5e4 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -77,21 +77,17 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
- POS(0, inode_nr),
- BTREE_ITER_all_snapshots);
- k = bch2_btree_iter_peek(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode_nr)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+ ret = bch2_inode_unpack(k, inode);
+ goto found;
}
-
- ret = bch2_inode_unpack(k, inode);
-err:
+ ret = -BCH_ERR_ENOENT_inode;
+found:
bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -770,25 +766,6 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
-static int check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
- bkey_in_missing_snapshot,
- "key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
static int hash_redo_key(struct btree_trans *trans,
const struct bch_hash_desc desc,
struct bch_hash_info *hash_info,
@@ -983,7 +960,7 @@ static int check_inode(struct btree_trans *trans,
bool do_update = false;
int ret;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
goto err;
if (ret)
@@ -1487,7 +1464,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct printbuf buf = PRINTBUF;
int ret = 0;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
@@ -1700,6 +1677,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
trans_was_restarted(trans, restart_count);
}
+noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
@@ -1796,6 +1774,7 @@ out_noiter:
return ret;
}
+noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
@@ -1870,6 +1849,7 @@ found:
return ret;
}
+noinline_for_stack
static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_dirent d)
{
@@ -2010,7 +1990,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct printbuf buf = PRINTBUF;
int ret = 0;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
@@ -2165,7 +2145,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker_entry *i;
int ret;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
return ret;
if (ret)
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 4ec979b4b23e..4583c9386e8c 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -125,7 +125,7 @@ err_noprint:
bch2_bkey_buf_exit(&old, c);
if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock(trans);
+ bch2_trans_unlock_long(trans);
closure_sync(&cl);
}
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index f57486794484..ebf39ef72fb2 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -84,9 +84,10 @@ struct promote_op {
};
static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+ .automatic_shrinking = true,
};
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
@@ -388,7 +389,6 @@ retry:
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(trans);
if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
@@ -776,18 +776,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
PTR_BUCKET_POS(ca, &ptr),
BTREE_ITER_cached);
- prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
- printbuf_indent_add(&buf, 2);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
+ u8 *gen = bucket_gen(ca, iter.pos.offset);
+ if (gen) {
- prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
+ printbuf_indent_add(&buf, 2);
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (!ret) {
+ bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", *gen);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+ } else {
+ prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
+ iter.pos.inode, iter.pos.offset);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "first bucket %u nbuckets %llu\n",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
}
bch2_fs_inconsistent(c, "%s", buf.buf);
@@ -989,6 +1003,9 @@ get_bio:
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
+ if (flags & BCH_READ_NODECODE)
+ orig->pick = pick;
+
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 9401d13e31bb..05e0cbef420b 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1220,7 +1220,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
u32 snapshot;
struct bucket_to_lock *stale_at;
- int ret;
+ int stale, ret;
if (op->flags & BCH_WRITE_MOVE)
return;
@@ -1299,7 +1299,8 @@ retry:
BUCKET_NOCOW_LOCK_UPDATE);
rcu_read_lock();
- bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
+ u8 *gen = bucket_gen(ca, i->b.offset);
+ stale = !gen ? -1 : gen_after(*gen, i->gen);
rcu_read_unlock();
if (unlikely(stale)) {
@@ -1380,8 +1381,18 @@ err_bucket_stale:
break;
}
- /* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ struct printbuf buf = PRINTBUF;
+ if (bch2_fs_inconsistent_on(stale < 0, c,
+ "pointer to invalid bucket in nocow path on device %llu\n %s",
+ stale_at->b.inode,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ } else {
+ /* We can retry this: */
+ ret = -BCH_ERR_transaction_restart;
+ }
+ printbuf_exit(&buf);
+
goto err_get_ioref;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index adec8e1ea73e..10b19791ec98 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1095,7 +1095,7 @@ unlock:
return ret;
}
-int bch2_dev_journal_alloc(struct bch_dev *ca)
+int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
unsigned nr;
int ret;
@@ -1117,7 +1117,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
- ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+ ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
err:
bch_err_fn(ca, ret);
return ret;
@@ -1129,7 +1129,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c)
if (ca->journal.nr)
continue;
- int ret = bch2_dev_journal_alloc(ca);
+ int ret = bch2_dev_journal_alloc(ca, true);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
@@ -1167,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
+ if (!test_bit(JOURNAL_running, &j->flags))
+ return;
+
bch2_journal_reclaim_stop(j);
bch2_journal_flush_all_pins(j);
@@ -1181,9 +1184,11 @@ void bch2_fs_journal_stop(struct journal *j)
journal_quiesce(j);
cancel_delayed_work_sync(&j->write_work);
- BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_replay_done, &j->flags) &&
- j->last_empty_seq != journal_cur_seq(j));
+ WARN(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_replay_done, &j->flags) &&
+ j->last_empty_seq != journal_cur_seq(j),
+ "journal shutdown error: cur seq %llu but last empty seq %llu",
+ journal_cur_seq(j), j->last_empty_seq);
if (!bch2_journal_error(j))
clear_bit(JOURNAL_running, &j->flags);
@@ -1415,8 +1420,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
unsigned long now = jiffies;
u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 28);
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 28);
out->atomic++;
rcu_read_lock();
@@ -1518,6 +1523,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
struct journal_entry_pin *pin;
spin_lock(&j->lock);
+ if (!test_bit(JOURNAL_running, &j->flags)) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
*seq = max(*seq, j->pin.front);
if (*seq >= j->pin.back) {
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index fd1f7cdaa8bc..bc6b9c39dcb4 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -433,7 +433,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
-int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_dev_journal_alloc(struct bch_dev *, bool);
int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index cdcb1ad49af4..2326e2cb9cd2 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -415,6 +415,8 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
flags|BCH_VALIDATE_journal);
if (ret == FSCK_DELETED_KEY)
continue;
+ else if (ret)
+ return ret;
k = bkey_next(k);
}
@@ -1677,6 +1679,13 @@ static CLOSURE_CALLBACK(journal_write_done)
mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+ /*
+ * We don't typically trigger journal writes from here - the next journal
+ * write will be triggered immediately after the previous one is
+ * allocated, in bch2_journal_write() - but the journal write error path
+ * is special:
+ */
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
}
@@ -1755,11 +1764,13 @@ static CLOSURE_CALLBACK(journal_write_preflush)
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
- closure_wait(&j->async_wait, cl);
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ closure_wait(&j->async_wait, cl);
+ spin_unlock(&j->lock);
+ continue_at(cl, journal_write_preflush, j->wq);
+ return;
+ }
spin_unlock(&j->lock);
-
- continue_at(cl, journal_write_preflush, j->wq);
- return;
}
if (w->separate_flush) {
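
The preflush rework is the standard check/lock/re-check pattern: the unlocked test is only a fast path, and the decision to park the closure on async_wait must be re-validated under j->lock, otherwise a journal write completing between the two tests could leave the closure waiting forever. A generic pthread sketch of the shape (all names illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long seq_ondisk = 0, want_seq = 2;

static bool need_wait(void) { return seq_ondisk + 1 != want_seq; }

static bool maybe_queue_wait(void)
{
	if (!need_wait())		/* unlocked fast path, may race */
		return false;

	pthread_mutex_lock(&lock);
	if (need_wait()) {		/* re-check: still true under lock */
		/* closure_wait(&j->async_wait, cl) goes here */
		pthread_mutex_unlock(&lock);
		return true;		/* caller reschedules itself */
	}
	pthread_mutex_unlock(&lock);
	return false;			/* lost the race: proceed now */
}

int main(void)
{
	printf("must wait: %d\n", maybe_queue_wait());	/* 1: 0 + 1 != 2 */
	return 0;
}
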
@@ -1967,7 +1978,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
- struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
@@ -2011,11 +2021,15 @@ CLOSURE_CALLBACK(bch2_journal_write)
}
if (ret) {
- __bch2_journal_debug_to_text(&journal_debug_buf, j);
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"),
+ bch2_err_str(ret));
+ __bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
- bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf.buf);
- printbuf_exit(&journal_debug_buf);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
goto err;
}
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index ed4846709611..1f25c111c54c 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
BUG_ON(nr != t->nr);
unsigned i;
- for (src = bl->start, i = eytzinger0_first(t->nr);
+ for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h
new file mode 100644
index 000000000000..2566b12dbc04
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist_format.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+
+struct journal_seq_blacklist_entry {
+ __le64 start;
+ __le64 end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+ struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
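
Each entry records a range of journal sequence numbers to be skipped on replay. A self-contained sketch of the membership test the runtime table answers, assuming inclusive bounds for illustration (the real lookup runs against an Eytzinger-sorted table, as bch2_blacklist_entries_gc above suggests):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct blacklist_entry { uint64_t start, end; };

static bool seq_is_blacklisted(const struct blacklist_entry *t, int nr,
			       uint64_t seq)
{
	for (int i = 0; i < nr; i++)
		if (t[i].start <= seq && seq <= t[i].end)
			return true;
	return false;
}

int main(void)
{
	struct blacklist_entry t[] = { { 10, 12 }, { 100, 100 } };

	printf("%d %d\n", seq_is_blacklisted(t, 2, 11),		/* 1 */
			  seq_is_blacklisted(t, 2, 13));	/* 0 */
	return 0;
}
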
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index a40d116224ed..b12894ef44f3 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -77,6 +77,45 @@ static const char * const bch2_lru_types[] = {
NULL
};
+int bch2_lru_check_set(struct btree_trans *trans,
+ u16 lru_id, u64 time,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k =
+ bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
+ lru_pos(lru_id,
+ bucket_to_u64(referring_k.k->p),
+ time), 0);
+ int ret = bkey_err(lru_k);
+ if (ret)
+ return ret;
+
+ if (lru_k.k->type != KEY_TYPE_set) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
+ if (ret)
+ goto err;
+
+ if (fsck_err(c, alloc_key_to_missing_lru_entry,
+ "missing %s lru entry\n"
+ " %s",
+ bch2_lru_types[lru_type(lru_k)],
+ (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
+ ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index fb11ab0dd00e..ed75bcf59d47 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -2,9 +2,6 @@
#ifndef _BCACHEFS_LRU_H
#define _BCACHEFS_LRU_H
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
static inline u64 lru_pos_id(struct bpos pos)
{
return pos.inode >> LRU_TIME_BITS;
@@ -64,6 +61,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+struct bkey_buf;
+int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
+
int bch2_check_lrus(struct bch_fs *);
#endif /* _BCACHEFS_LRU_H */
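
For orientation: lru_pos_id() above reverses a packing in which the 16-bit LRU id and a 48-bit time share pos.inode, while the encoded device:bucket lives in pos.offset - this is what bch2_lru_check_set() builds via lru_pos(lru_id, bucket_to_u64(...), time). A sketch of that packing, under those assumptions:

#include <assert.h>
#include <stdint.h>

#define LRU_TIME_BITS 48	/* moved to a shared header by this patch */

struct pos { uint64_t inode, offset; };

static struct pos lru_pos_sketch(uint16_t lru_id, uint64_t dev_bucket,
				 uint64_t time)
{
	assert(!(time >> LRU_TIME_BITS));	/* i.e. time <= LRU_TIME_MAX */
	return (struct pos) {
		.inode	= ((uint64_t) lru_id << LRU_TIME_BITS) | time,
		.offset	= dev_bucket,
	};
}

int main(void)
{
	struct pos p = lru_pos_sketch(7, 1234, 99);

	assert(p.inode >> LRU_TIME_BITS == 7);	/* lru_pos_id() */
	assert(p.offset == 1234);
	return 0;
}
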
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index 4c298e74723d..e9d9c0212e44 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -217,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = {
kunit_test_suite(mean_and_variance_test_suite);
MODULE_AUTHOR("Daniel B. Hill");
+MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 8171f947fac8..e714e3bd5bbb 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -36,31 +36,6 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- printbuf_tabstop_push(out, 20);
- prt_str(out, "rewrite ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
- prt_newline(out);
-
- prt_str(out, "kill ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->kill_ptrs);
- prt_newline(out);
-
- prt_str(out, "target:\t");
- bch2_target_to_text(out, c, data_opts->target);
- prt_newline(out);
-
- prt_str(out, "compression:\t");
- bch2_compression_opt_to_text(out, background_compression(*io_opts));
- prt_newline(out);
-
- prt_str(out, "extra replicas:\t");
- prt_u64(out, data_opts->extra_replicas);
-}
-
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
@@ -547,6 +522,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}
+ bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots);
@@ -920,7 +896,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- if (!nr_good || nr_good >= replicas)
+ rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ptr->cached &&
+ (!ca || !ca->mi.durability))
+ data_opts->kill_ptrs |= BIT(i);
+ i++;
+ }
+ rcu_read_unlock();
+
+ if (!data_opts->kill_ptrs &&
+ (!nr_good || nr_good >= replicas))
return false;
data_opts->target = 0;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 10bfb31c151b..eb49dd045eff 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -35,9 +35,10 @@ struct buckets_in_flight {
};
static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
- .key_len = sizeof(struct move_bucket_key),
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+ .automatic_shrinking = true,
};
static struct move_bucket_in_flight *
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 25530e0bb2f3..b197ec90d4cb 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -137,7 +137,7 @@ enum fsck_err_opts {
x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index cf513fc79ce4..1f9d044ed920 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -326,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_btree_root: {
struct btree_root *r;
+ if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
+ c, invalid_btree_id,
+ "invalid btree id %u (max %u)",
+ entry->btree_id, BTREE_ID_NR_MAX))
+ return 0;
+
while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
if (ret)
@@ -415,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
}
}
-
+fsck_err:
return ret;
}
@@ -658,10 +664,10 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
if (write_sb)
bch2_write_super(c);
-
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
new file mode 100644
index 000000000000..b97208195d06
--- /dev/null
+++ b/fs/bcachefs/replicas_format.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_FORMAT_H
+#define _BCACHEFS_REPLICAS_FORMAT_H
+
+struct bch_replicas_entry_v0 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 devs[];
+} __packed;
+
+struct bch_sb_field_replicas_v0 {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
+
+struct bch_replicas_entry_v1 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 nr_required;
+ __u8 devs[];
+} __packed;
+
+struct bch_sb_field_replicas {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v1 entries[];
+} __packed __aligned(8);
+
+#define replicas_entry_bytes(_i) \
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
+#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
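
replicas_entry_bytes() exists because entries are variable length: nr_devs device indices trail each fixed header, so walking a section means advancing by the per-entry byte count. A self-contained sketch of that walk (superblock field padding omitted):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct entry_v1 {
	uint8_t data_type, nr_devs, nr_required;
	uint8_t devs[];
};

#define entry_bytes(e) (offsetof(struct entry_v1, devs) + (e)->nr_devs)

int main(void)
{
	/* two packed entries: 2 devs {0,1}, then 1 dev {3} */
	uint8_t buf[] = { 1, 2, 1, 0, 1,   1, 1, 1, 3 };
	uint8_t *p = buf, *end = buf + sizeof(buf);

	while (p < end) {
		struct entry_v1 *e = (void *) p;

		printf("type %u, %u devs\n", e->data_type, e->nr_devs);
		p += entry_bytes(e);	/* replicas_entry_bytes() */
	}
	return 0;
}
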
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 390a1bbd2567..4710b61631f0 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -146,10 +146,17 @@ static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
for (const struct bch_sb_field_downgrade_entry *i = e->entries;
(void *) i < vstruct_end(&e->field);
i = downgrade_entry_next_c(i)) {
+ /*
+ * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
+ * section sizes are 8 byte aligned - an empty entry spanning
+ * the end of the section is allowed (and ignored):
+ */
+ if ((void *) &i->errors[0] > vstruct_end(&e->field))
+ break;
+
if (flags & BCH_VALIDATE_write &&
- ((void *) &i->errors[0] > vstruct_end(&e->field) ||
- (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) {
- prt_printf(err, "downgrade entry overruns end of superblock section)");
+ (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
+ prt_printf(err, "downgrade entry overruns end of superblock section");
return -BCH_ERR_invalid_sb_downgrade;
}
@@ -221,7 +228,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
dst = (void *) &darray_top(table);
dst->version = cpu_to_le16(src->version);
- dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes);
+ dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
dst->recovery_passes[1] = 0;
dst->nr_errors = cpu_to_le16(src->nr_errors);
for (unsigned i = 0; i < src->nr_errors; i++)
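
The arithmetic behind the relaxed check above: superblock sections are sized in whole u64s, while these entries are only 2-byte aligned, so up to six bytes of zero tail padding can look like the start of one more entry whose errors[] array would begin past the section end - hence a truncated trailing entry is now treated as end-of-list rather than corruption. The header size the check pivots on:

#include <stddef.h>
#include <stdio.h>

struct entry {	/* mirrors bch_sb_field_downgrade_entry above */
	unsigned short version;
	unsigned long long recovery_passes[2];
	unsigned short nr_errors;
	unsigned short errors[];
} __attribute__((packed, aligned(2)));

int main(void)
{
	/* 20 bytes of header must fit before errors[] may be read */
	printf("bytes before errors[]: %zu\n", offsetof(struct entry, errors));
	return 0;
}
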
diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h
new file mode 100644
index 000000000000..cffd932be3ec
--- /dev/null
+++ b/fs/bcachefs/sb-downgrade_format.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+
+struct bch_sb_field_downgrade_entry {
+ __le16 version;
+ __le64 recovery_passes[2];
+ __le16 nr_errors;
+ __le16 errors[] __counted_by(nr_errors);
+} __packed __aligned(2);
+
+struct bch_sb_field_downgrade {
+ struct bch_sb_field field;
+ struct bch_sb_field_downgrade_entry entries[];
+};
+
+#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index bda33e59e226..c1270d790e43 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -110,19 +110,25 @@ out:
void bch2_sb_errors_from_cpu(struct bch_fs *c)
{
bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst =
- bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
+ struct bch_sb_field_errors *dst;
unsigned i;
+ mutex_lock(&c->fsck_error_counts_lock);
+
+ dst = bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+
if (!dst)
- return;
+ goto err;
for (i = 0; i < src->nr; i++) {
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
}
+
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
}
static int bch2_sb_errors_to_cpu(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
new file mode 100644
index 000000000000..d54121ec093f
--- /dev/null
+++ b/fs/bcachefs/sb-errors_format.h
@@ -0,0 +1,310 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
+#define _BCACHEFS_SB_ERRORS_FORMAT_H
+
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NEED_FSCK = 1 << 2,
+ FSCK_NO_RATELIMIT = 1 << 3,
+ FSCK_AUTOFIX = 1 << 4,
+};
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0, 0) \
+ x(dirty_but_no_journal_entries, 1, 0) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
+ x(sb_clean_journal_seq_mismatch, 3, 0) \
+ x(sb_clean_btree_root_mismatch, 4, 0) \
+ x(sb_clean_missing, 5, 0) \
+ x(jset_unsupported_version, 6, 0) \
+ x(jset_unknown_csum, 7, 0) \
+ x(jset_last_seq_newer_than_seq, 8, 0) \
+ x(jset_past_bucket_end, 9, 0) \
+ x(jset_seq_blacklisted, 10, 0) \
+ x(journal_entries_missing, 11, 0) \
+ x(journal_entry_replicas_not_marked, 12, 0) \
+ x(journal_entry_past_jset_end, 13, 0) \
+ x(journal_entry_replicas_data_mismatch, 14, 0) \
+ x(journal_entry_bkey_u64s_0, 15, 0) \
+ x(journal_entry_bkey_past_end, 16, 0) \
+ x(journal_entry_bkey_bad_format, 17, 0) \
+ x(journal_entry_bkey_invalid, 18, 0) \
+ x(journal_entry_btree_root_bad_size, 19, 0) \
+ x(journal_entry_blacklist_bad_size, 20, 0) \
+ x(journal_entry_blacklist_v2_bad_size, 21, 0) \
+ x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
+ x(journal_entry_usage_bad_size, 23, 0) \
+ x(journal_entry_data_usage_bad_size, 24, 0) \
+ x(journal_entry_clock_bad_size, 25, 0) \
+ x(journal_entry_clock_bad_rw, 26, 0) \
+ x(journal_entry_dev_usage_bad_size, 27, 0) \
+ x(journal_entry_dev_usage_bad_dev, 28, 0) \
+ x(journal_entry_dev_usage_bad_pad, 29, 0) \
+ x(btree_node_unreadable, 30, 0) \
+ x(btree_node_fault_injected, 31, 0) \
+ x(btree_node_bad_magic, 32, 0) \
+ x(btree_node_bad_seq, 33, 0) \
+ x(btree_node_unsupported_version, 34, 0) \
+ x(btree_node_bset_older_than_sb_min, 35, 0) \
+ x(btree_node_bset_newer_than_sb, 36, 0) \
+ x(btree_node_data_missing, 37, 0) \
+ x(btree_node_bset_after_end, 38, 0) \
+ x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
+ x(btree_node_replicas_data_mismatch, 40, 0) \
+ x(bset_unknown_csum, 41, 0) \
+ x(bset_bad_csum, 42, 0) \
+ x(bset_past_end_of_btree_node, 43, 0) \
+ x(bset_wrong_sector_offset, 44, 0) \
+ x(bset_empty, 45, 0) \
+ x(bset_bad_seq, 46, 0) \
+ x(bset_blacklisted_journal_seq, 47, 0) \
+ x(first_bset_blacklisted_journal_seq, 48, 0) \
+ x(btree_node_bad_btree, 49, 0) \
+ x(btree_node_bad_level, 50, 0) \
+ x(btree_node_bad_min_key, 51, 0) \
+ x(btree_node_bad_max_key, 52, 0) \
+ x(btree_node_bad_format, 53, 0) \
+ x(btree_node_bkey_past_bset_end, 54, 0) \
+ x(btree_node_bkey_bad_format, 55, 0) \
+ x(btree_node_bad_bkey, 56, 0) \
+ x(btree_node_bkey_out_of_order, 57, 0) \
+ x(btree_root_bkey_invalid, 58, 0) \
+ x(btree_root_read_error, 59, 0) \
+ x(btree_root_bad_min_key, 60, 0) \
+ x(btree_root_bad_max_key, 61, 0) \
+ x(btree_node_read_error, 62, 0) \
+ x(btree_node_topology_bad_min_key, 63, 0) \
+ x(btree_node_topology_bad_max_key, 64, 0) \
+ x(btree_node_topology_overwritten_by_prev_node, 65, 0) \
+ x(btree_node_topology_overwritten_by_next_node, 66, 0) \
+ x(btree_node_topology_interior_node_empty, 67, 0) \
+ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
+ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
+ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
+ x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
+ x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
+ x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
+ x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
+ x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
+ x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
+ x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
+ x(bkey_version_in_future, 80, 0) \
+ x(bkey_u64s_too_small, 81, 0) \
+ x(bkey_invalid_type_for_btree, 82, 0) \
+ x(bkey_extent_size_zero, 83, 0) \
+ x(bkey_extent_size_greater_than_offset, 84, 0) \
+ x(bkey_size_nonzero, 85, 0) \
+ x(bkey_snapshot_nonzero, 86, 0) \
+ x(bkey_snapshot_zero, 87, 0) \
+ x(bkey_at_pos_max, 88, 0) \
+ x(bkey_before_start_of_btree_node, 89, 0) \
+ x(bkey_after_end_of_btree_node, 90, 0) \
+ x(bkey_val_size_nonzero, 91, 0) \
+ x(bkey_val_size_too_small, 92, 0) \
+ x(alloc_v1_val_size_bad, 93, 0) \
+ x(alloc_v2_unpack_error, 94, 0) \
+ x(alloc_v3_unpack_error, 95, 0) \
+ x(alloc_v4_val_size_bad, 96, 0) \
+ x(alloc_v4_backpointers_start_bad, 97, 0) \
+ x(alloc_key_data_type_bad, 98, 0) \
+ x(alloc_key_empty_but_have_data, 99, 0) \
+ x(alloc_key_dirty_sectors_0, 100, 0) \
+ x(alloc_key_data_type_inconsistency, 101, 0) \
+ x(alloc_key_to_missing_dev_bucket, 102, 0) \
+ x(alloc_key_cached_inconsistency, 103, 0) \
+ x(alloc_key_cached_but_read_time_zero, 104, 0) \
+ x(alloc_key_to_missing_lru_entry, 105, 0) \
+ x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
+ x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
+ x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
+ x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
+ x(bucket_sector_count_overflow, 112, 0) \
+ x(bucket_metadata_type_mismatch, 113, 0) \
+ x(need_discard_key_wrong, 114, 0) \
+ x(freespace_key_wrong, 115, 0) \
+ x(freespace_hole_missing, 116, 0) \
+ x(bucket_gens_val_size_bad, 117, 0) \
+ x(bucket_gens_key_wrong, 118, 0) \
+ x(bucket_gens_hole_wrong, 119, 0) \
+ x(bucket_gens_to_invalid_dev, 120, 0) \
+ x(bucket_gens_to_invalid_buckets, 121, 0) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
+ x(need_discard_freespace_key_bad, 124, 0) \
+ x(backpointer_bucket_offset_wrong, 125, 0) \
+ x(backpointer_to_missing_device, 126, 0) \
+ x(backpointer_to_missing_alloc, 127, 0) \
+ x(backpointer_to_missing_ptr, 128, 0) \
+ x(lru_entry_at_time_0, 129, 0) \
+ x(lru_entry_to_invalid_bucket, 130, 0) \
+ x(lru_entry_bad, 131, 0) \
+ x(btree_ptr_val_too_big, 132, 0) \
+ x(btree_ptr_v2_val_too_big, 133, 0) \
+ x(btree_ptr_has_non_ptr, 134, 0) \
+ x(extent_ptrs_invalid_entry, 135, 0) \
+ x(extent_ptrs_no_ptrs, 136, 0) \
+ x(extent_ptrs_too_many_ptrs, 137, 0) \
+ x(extent_ptrs_redundant_crc, 138, 0) \
+ x(extent_ptrs_redundant_stripe, 139, 0) \
+ x(extent_ptrs_unwritten, 140, 0) \
+ x(extent_ptrs_written_and_unwritten, 141, 0) \
+ x(ptr_to_invalid_device, 142, 0) \
+ x(ptr_to_duplicate_device, 143, 0) \
+ x(ptr_after_last_bucket, 144, 0) \
+ x(ptr_before_first_bucket, 145, 0) \
+ x(ptr_spans_multiple_buckets, 146, 0) \
+ x(ptr_to_missing_backpointer, 147, 0) \
+ x(ptr_to_missing_alloc_key, 148, 0) \
+ x(ptr_to_missing_replicas_entry, 149, 0) \
+ x(ptr_to_missing_stripe, 150, 0) \
+ x(ptr_to_incorrect_stripe, 151, 0) \
+ x(ptr_gen_newer_than_bucket_gen, 152, 0) \
+ x(ptr_too_stale, 153, 0) \
+ x(stale_dirty_ptr, 154, 0) \
+ x(ptr_bucket_data_type_mismatch, 155, 0) \
+ x(ptr_cached_and_erasure_coded, 156, 0) \
+ x(ptr_crc_uncompressed_size_too_small, 157, 0) \
+ x(ptr_crc_csum_type_unknown, 158, 0) \
+ x(ptr_crc_compression_type_unknown, 159, 0) \
+ x(ptr_crc_redundant, 160, 0) \
+ x(ptr_crc_uncompressed_size_too_big, 161, 0) \
+ x(ptr_crc_nonce_mismatch, 162, 0) \
+ x(ptr_stripe_redundant, 163, 0) \
+ x(reservation_key_nr_replicas_invalid, 164, 0) \
+ x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(stripe_pos_bad, 167, 0) \
+ x(stripe_val_size_bad, 168, 0) \
+ x(stripe_sector_count_wrong, 169, 0) \
+ x(snapshot_tree_pos_bad, 170, 0) \
+ x(snapshot_tree_to_missing_snapshot, 171, 0) \
+ x(snapshot_tree_to_missing_subvol, 172, 0) \
+ x(snapshot_tree_to_wrong_subvol, 173, 0) \
+ x(snapshot_tree_to_snapshot_subvol, 174, 0) \
+ x(snapshot_pos_bad, 175, 0) \
+ x(snapshot_parent_bad, 176, 0) \
+ x(snapshot_children_not_normalized, 177, 0) \
+ x(snapshot_child_duplicate, 178, 0) \
+ x(snapshot_child_bad, 179, 0) \
+ x(snapshot_skiplist_not_normalized, 180, 0) \
+ x(snapshot_skiplist_bad, 181, 0) \
+ x(snapshot_should_not_have_subvol, 182, 0) \
+ x(snapshot_to_bad_snapshot_tree, 183, 0) \
+ x(snapshot_bad_depth, 184, 0) \
+ x(snapshot_bad_skiplist, 185, 0) \
+ x(subvol_pos_bad, 186, 0) \
+ x(subvol_not_master_and_not_snapshot, 187, 0) \
+ x(subvol_to_missing_root, 188, 0) \
+ x(subvol_root_wrong_bi_subvol, 189, 0) \
+ x(bkey_in_missing_snapshot, 190, 0) \
+ x(inode_pos_inode_nonzero, 191, 0) \
+ x(inode_pos_blockdev_range, 192, 0) \
+ x(inode_unpack_error, 193, 0) \
+ x(inode_str_hash_invalid, 194, 0) \
+ x(inode_v3_fields_start_bad, 195, 0) \
+ x(inode_snapshot_mismatch, 196, 0) \
+ x(inode_unlinked_but_clean, 197, 0) \
+ x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_checksum_type_invalid, 199, 0) \
+ x(inode_compression_type_invalid, 200, 0) \
+ x(inode_subvol_root_but_not_dir, 201, 0) \
+ x(inode_i_size_dirty_but_clean, 202, 0) \
+ x(inode_i_sectors_dirty_but_clean, 203, 0) \
+ x(inode_i_sectors_wrong, 204, 0) \
+ x(inode_dir_wrong_nlink, 205, 0) \
+ x(inode_dir_multiple_links, 206, 0) \
+ x(inode_multiple_links_but_nlink_0, 207, 0) \
+ x(inode_wrong_backpointer, 208, 0) \
+ x(inode_wrong_nlink, 209, 0) \
+ x(inode_unreachable, 210, 0) \
+ x(deleted_inode_but_clean, 211, 0) \
+ x(deleted_inode_missing, 212, 0) \
+ x(deleted_inode_is_dir, 213, 0) \
+ x(deleted_inode_not_unlinked, 214, 0) \
+ x(extent_overlapping, 215, 0) \
+ x(extent_in_missing_inode, 216, 0) \
+ x(extent_in_non_reg_inode, 217, 0) \
+ x(extent_past_end_of_inode, 218, 0) \
+ x(dirent_empty_name, 219, 0) \
+ x(dirent_val_too_big, 220, 0) \
+ x(dirent_name_too_long, 221, 0) \
+ x(dirent_name_embedded_nul, 222, 0) \
+ x(dirent_name_dot_or_dotdot, 223, 0) \
+ x(dirent_name_has_slash, 224, 0) \
+ x(dirent_d_type_wrong, 225, 0) \
+ x(inode_bi_parent_wrong, 226, 0) \
+ x(dirent_in_missing_dir_inode, 227, 0) \
+ x(dirent_in_non_dir_inode, 228, 0) \
+ x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_missing_subvol, 230, 0) \
+ x(dirent_to_itself, 231, 0) \
+ x(quota_type_invalid, 232, 0) \
+ x(xattr_val_size_too_small, 233, 0) \
+ x(xattr_val_size_too_big, 234, 0) \
+ x(xattr_invalid_type, 235, 0) \
+ x(xattr_name_invalid_chars, 236, 0) \
+ x(xattr_in_missing_inode, 237, 0) \
+ x(root_subvol_missing, 238, 0) \
+ x(root_dir_missing, 239, 0) \
+ x(root_inode_not_dir, 240, 0) \
+ x(dir_loop, 241, 0) \
+ x(hash_table_key_duplicate, 242, 0) \
+ x(hash_table_key_wrong_offset, 243, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, 0) \
+ x(reflink_p_front_pad_bad, 245, 0) \
+ x(journal_entry_dup_same_device, 246, 0) \
+ x(inode_bi_subvol_missing, 247, 0) \
+ x(inode_bi_subvol_wrong, 248, 0) \
+ x(inode_points_to_missing_dirent, 249, 0) \
+ x(inode_points_to_wrong_dirent, 250, 0) \
+ x(inode_bi_parent_nonzero, 251, 0) \
+ x(dirent_to_missing_parent_subvol, 252, 0) \
+ x(dirent_not_visible_in_parent_subvol, 253, 0) \
+ x(subvol_fs_path_parent_wrong, 254, 0) \
+ x(subvol_root_fs_path_parent_nonzero, 255, 0) \
+ x(subvol_children_not_set, 256, 0) \
+ x(subvol_children_bad, 257, 0) \
+ x(subvol_loop, 258, 0) \
+ x(subvol_unreachable, 259, 0) \
+ x(btree_node_bkey_bad_u64s, 260, 0) \
+ x(btree_node_topology_empty_interior_node, 261, 0) \
+ x(btree_ptr_v2_min_key_bad, 262, 0) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
+ x(snapshot_node_missing, 264, 0) \
+ x(dup_backpointer_to_bad_csum_extent, 265, 0) \
+ x(btree_bitmap_not_marked, 266, 0) \
+ x(sb_clean_entry_overrun, 267, 0) \
+ x(btree_ptr_v2_written_0, 268, 0) \
+ x(subvol_snapshot_bad, 269, 0) \
+ x(subvol_inode_bad, 270, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, 0) \
+ x(accounting_mismatch, 272, 0) \
+ x(accounting_replicas_not_marked, 273, 0) \
+ x(invalid_btree_id, 274, 0) \
+ x(alloc_key_io_time_bad, 275, 0) \
+ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX)
+
+enum bch_sb_error_id {
+#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
+ BCH_SB_ERRS()
+#undef x
+ BCH_SB_ERR_MAX
+};
+
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
+};
+
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
+#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 666599d3fb9d..40325239c3b0 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -4,286 +4,6 @@
#include "darray.h"
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0) \
- x(dirty_but_no_journal_entries, 1) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
- x(sb_clean_journal_seq_mismatch, 3) \
- x(sb_clean_btree_root_mismatch, 4) \
- x(sb_clean_missing, 5) \
- x(jset_unsupported_version, 6) \
- x(jset_unknown_csum, 7) \
- x(jset_last_seq_newer_than_seq, 8) \
- x(jset_past_bucket_end, 9) \
- x(jset_seq_blacklisted, 10) \
- x(journal_entries_missing, 11) \
- x(journal_entry_replicas_not_marked, 12) \
- x(journal_entry_past_jset_end, 13) \
- x(journal_entry_replicas_data_mismatch, 14) \
- x(journal_entry_bkey_u64s_0, 15) \
- x(journal_entry_bkey_past_end, 16) \
- x(journal_entry_bkey_bad_format, 17) \
- x(journal_entry_bkey_invalid, 18) \
- x(journal_entry_btree_root_bad_size, 19) \
- x(journal_entry_blacklist_bad_size, 20) \
- x(journal_entry_blacklist_v2_bad_size, 21) \
- x(journal_entry_blacklist_v2_start_past_end, 22) \
- x(journal_entry_usage_bad_size, 23) \
- x(journal_entry_data_usage_bad_size, 24) \
- x(journal_entry_clock_bad_size, 25) \
- x(journal_entry_clock_bad_rw, 26) \
- x(journal_entry_dev_usage_bad_size, 27) \
- x(journal_entry_dev_usage_bad_dev, 28) \
- x(journal_entry_dev_usage_bad_pad, 29) \
- x(btree_node_unreadable, 30) \
- x(btree_node_fault_injected, 31) \
- x(btree_node_bad_magic, 32) \
- x(btree_node_bad_seq, 33) \
- x(btree_node_unsupported_version, 34) \
- x(btree_node_bset_older_than_sb_min, 35) \
- x(btree_node_bset_newer_than_sb, 36) \
- x(btree_node_data_missing, 37) \
- x(btree_node_bset_after_end, 38) \
- x(btree_node_replicas_sectors_written_mismatch, 39) \
- x(btree_node_replicas_data_mismatch, 40) \
- x(bset_unknown_csum, 41) \
- x(bset_bad_csum, 42) \
- x(bset_past_end_of_btree_node, 43) \
- x(bset_wrong_sector_offset, 44) \
- x(bset_empty, 45) \
- x(bset_bad_seq, 46) \
- x(bset_blacklisted_journal_seq, 47) \
- x(first_bset_blacklisted_journal_seq, 48) \
- x(btree_node_bad_btree, 49) \
- x(btree_node_bad_level, 50) \
- x(btree_node_bad_min_key, 51) \
- x(btree_node_bad_max_key, 52) \
- x(btree_node_bad_format, 53) \
- x(btree_node_bkey_past_bset_end, 54) \
- x(btree_node_bkey_bad_format, 55) \
- x(btree_node_bad_bkey, 56) \
- x(btree_node_bkey_out_of_order, 57) \
- x(btree_root_bkey_invalid, 58) \
- x(btree_root_read_error, 59) \
- x(btree_root_bad_min_key, 60) \
- x(btree_root_bad_max_key, 61) \
- x(btree_node_read_error, 62) \
- x(btree_node_topology_bad_min_key, 63) \
- x(btree_node_topology_bad_max_key, 64) \
- x(btree_node_topology_overwritten_by_prev_node, 65) \
- x(btree_node_topology_overwritten_by_next_node, 66) \
- x(btree_node_topology_interior_node_empty, 67) \
- x(fs_usage_hidden_wrong, 68) \
- x(fs_usage_btree_wrong, 69) \
- x(fs_usage_data_wrong, 70) \
- x(fs_usage_cached_wrong, 71) \
- x(fs_usage_reserved_wrong, 72) \
- x(fs_usage_persistent_reserved_wrong, 73) \
- x(fs_usage_nr_inodes_wrong, 74) \
- x(fs_usage_replicas_wrong, 75) \
- x(dev_usage_buckets_wrong, 76) \
- x(dev_usage_sectors_wrong, 77) \
- x(dev_usage_fragmented_wrong, 78) \
- x(dev_usage_buckets_ec_wrong, 79) \
- x(bkey_version_in_future, 80) \
- x(bkey_u64s_too_small, 81) \
- x(bkey_invalid_type_for_btree, 82) \
- x(bkey_extent_size_zero, 83) \
- x(bkey_extent_size_greater_than_offset, 84) \
- x(bkey_size_nonzero, 85) \
- x(bkey_snapshot_nonzero, 86) \
- x(bkey_snapshot_zero, 87) \
- x(bkey_at_pos_max, 88) \
- x(bkey_before_start_of_btree_node, 89) \
- x(bkey_after_end_of_btree_node, 90) \
- x(bkey_val_size_nonzero, 91) \
- x(bkey_val_size_too_small, 92) \
- x(alloc_v1_val_size_bad, 93) \
- x(alloc_v2_unpack_error, 94) \
- x(alloc_v3_unpack_error, 95) \
- x(alloc_v4_val_size_bad, 96) \
- x(alloc_v4_backpointers_start_bad, 97) \
- x(alloc_key_data_type_bad, 98) \
- x(alloc_key_empty_but_have_data, 99) \
- x(alloc_key_dirty_sectors_0, 100) \
- x(alloc_key_data_type_inconsistency, 101) \
- x(alloc_key_to_missing_dev_bucket, 102) \
- x(alloc_key_cached_inconsistency, 103) \
- x(alloc_key_cached_but_read_time_zero, 104) \
- x(alloc_key_to_missing_lru_entry, 105) \
- x(alloc_key_data_type_wrong, 106) \
- x(alloc_key_gen_wrong, 107) \
- x(alloc_key_dirty_sectors_wrong, 108) \
- x(alloc_key_cached_sectors_wrong, 109) \
- x(alloc_key_stripe_wrong, 110) \
- x(alloc_key_stripe_redundancy_wrong, 111) \
- x(bucket_sector_count_overflow, 112) \
- x(bucket_metadata_type_mismatch, 113) \
- x(need_discard_key_wrong, 114) \
- x(freespace_key_wrong, 115) \
- x(freespace_hole_missing, 116) \
- x(bucket_gens_val_size_bad, 117) \
- x(bucket_gens_key_wrong, 118) \
- x(bucket_gens_hole_wrong, 119) \
- x(bucket_gens_to_invalid_dev, 120) \
- x(bucket_gens_to_invalid_buckets, 121) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
- x(need_discard_freespace_key_bad, 124) \
- x(backpointer_bucket_offset_wrong, 125) \
- x(backpointer_to_missing_device, 126) \
- x(backpointer_to_missing_alloc, 127) \
- x(backpointer_to_missing_ptr, 128) \
- x(lru_entry_at_time_0, 129) \
- x(lru_entry_to_invalid_bucket, 130) \
- x(lru_entry_bad, 131) \
- x(btree_ptr_val_too_big, 132) \
- x(btree_ptr_v2_val_too_big, 133) \
- x(btree_ptr_has_non_ptr, 134) \
- x(extent_ptrs_invalid_entry, 135) \
- x(extent_ptrs_no_ptrs, 136) \
- x(extent_ptrs_too_many_ptrs, 137) \
- x(extent_ptrs_redundant_crc, 138) \
- x(extent_ptrs_redundant_stripe, 139) \
- x(extent_ptrs_unwritten, 140) \
- x(extent_ptrs_written_and_unwritten, 141) \
- x(ptr_to_invalid_device, 142) \
- x(ptr_to_duplicate_device, 143) \
- x(ptr_after_last_bucket, 144) \
- x(ptr_before_first_bucket, 145) \
- x(ptr_spans_multiple_buckets, 146) \
- x(ptr_to_missing_backpointer, 147) \
- x(ptr_to_missing_alloc_key, 148) \
- x(ptr_to_missing_replicas_entry, 149) \
- x(ptr_to_missing_stripe, 150) \
- x(ptr_to_incorrect_stripe, 151) \
- x(ptr_gen_newer_than_bucket_gen, 152) \
- x(ptr_too_stale, 153) \
- x(stale_dirty_ptr, 154) \
- x(ptr_bucket_data_type_mismatch, 155) \
- x(ptr_cached_and_erasure_coded, 156) \
- x(ptr_crc_uncompressed_size_too_small, 157) \
- x(ptr_crc_csum_type_unknown, 158) \
- x(ptr_crc_compression_type_unknown, 159) \
- x(ptr_crc_redundant, 160) \
- x(ptr_crc_uncompressed_size_too_big, 161) \
- x(ptr_crc_nonce_mismatch, 162) \
- x(ptr_stripe_redundant, 163) \
- x(reservation_key_nr_replicas_invalid, 164) \
- x(reflink_v_refcount_wrong, 165) \
- x(reflink_p_to_missing_reflink_v, 166) \
- x(stripe_pos_bad, 167) \
- x(stripe_val_size_bad, 168) \
- x(stripe_sector_count_wrong, 169) \
- x(snapshot_tree_pos_bad, 170) \
- x(snapshot_tree_to_missing_snapshot, 171) \
- x(snapshot_tree_to_missing_subvol, 172) \
- x(snapshot_tree_to_wrong_subvol, 173) \
- x(snapshot_tree_to_snapshot_subvol, 174) \
- x(snapshot_pos_bad, 175) \
- x(snapshot_parent_bad, 176) \
- x(snapshot_children_not_normalized, 177) \
- x(snapshot_child_duplicate, 178) \
- x(snapshot_child_bad, 179) \
- x(snapshot_skiplist_not_normalized, 180) \
- x(snapshot_skiplist_bad, 181) \
- x(snapshot_should_not_have_subvol, 182) \
- x(snapshot_to_bad_snapshot_tree, 183) \
- x(snapshot_bad_depth, 184) \
- x(snapshot_bad_skiplist, 185) \
- x(subvol_pos_bad, 186) \
- x(subvol_not_master_and_not_snapshot, 187) \
- x(subvol_to_missing_root, 188) \
- x(subvol_root_wrong_bi_subvol, 189) \
- x(bkey_in_missing_snapshot, 190) \
- x(inode_pos_inode_nonzero, 191) \
- x(inode_pos_blockdev_range, 192) \
- x(inode_unpack_error, 193) \
- x(inode_str_hash_invalid, 194) \
- x(inode_v3_fields_start_bad, 195) \
- x(inode_snapshot_mismatch, 196) \
- x(inode_unlinked_but_clean, 197) \
- x(inode_unlinked_but_nlink_nonzero, 198) \
- x(inode_checksum_type_invalid, 199) \
- x(inode_compression_type_invalid, 200) \
- x(inode_subvol_root_but_not_dir, 201) \
- x(inode_i_size_dirty_but_clean, 202) \
- x(inode_i_sectors_dirty_but_clean, 203) \
- x(inode_i_sectors_wrong, 204) \
- x(inode_dir_wrong_nlink, 205) \
- x(inode_dir_multiple_links, 206) \
- x(inode_multiple_links_but_nlink_0, 207) \
- x(inode_wrong_backpointer, 208) \
- x(inode_wrong_nlink, 209) \
- x(inode_unreachable, 210) \
- x(deleted_inode_but_clean, 211) \
- x(deleted_inode_missing, 212) \
- x(deleted_inode_is_dir, 213) \
- x(deleted_inode_not_unlinked, 214) \
- x(extent_overlapping, 215) \
- x(extent_in_missing_inode, 216) \
- x(extent_in_non_reg_inode, 217) \
- x(extent_past_end_of_inode, 218) \
- x(dirent_empty_name, 219) \
- x(dirent_val_too_big, 220) \
- x(dirent_name_too_long, 221) \
- x(dirent_name_embedded_nul, 222) \
- x(dirent_name_dot_or_dotdot, 223) \
- x(dirent_name_has_slash, 224) \
- x(dirent_d_type_wrong, 225) \
- x(inode_bi_parent_wrong, 226) \
- x(dirent_in_missing_dir_inode, 227) \
- x(dirent_in_non_dir_inode, 228) \
- x(dirent_to_missing_inode, 229) \
- x(dirent_to_missing_subvol, 230) \
- x(dirent_to_itself, 231) \
- x(quota_type_invalid, 232) \
- x(xattr_val_size_too_small, 233) \
- x(xattr_val_size_too_big, 234) \
- x(xattr_invalid_type, 235) \
- x(xattr_name_invalid_chars, 236) \
- x(xattr_in_missing_inode, 237) \
- x(root_subvol_missing, 238) \
- x(root_dir_missing, 239) \
- x(root_inode_not_dir, 240) \
- x(dir_loop, 241) \
- x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243) \
- x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245) \
- x(journal_entry_dup_same_device, 246) \
- x(inode_bi_subvol_missing, 247) \
- x(inode_bi_subvol_wrong, 248) \
- x(inode_points_to_missing_dirent, 249) \
- x(inode_points_to_wrong_dirent, 250) \
- x(inode_bi_parent_nonzero, 251) \
- x(dirent_to_missing_parent_subvol, 252) \
- x(dirent_not_visible_in_parent_subvol, 253) \
- x(subvol_fs_path_parent_wrong, 254) \
- x(subvol_root_fs_path_parent_nonzero, 255) \
- x(subvol_children_not_set, 256) \
- x(subvol_children_bad, 257) \
- x(subvol_loop, 258) \
- x(subvol_unreachable, 259) \
- x(btree_node_bkey_bad_u64s, 260) \
- x(btree_node_topology_empty_interior_node, 261) \
- x(btree_ptr_v2_min_key_bad, 262) \
- x(btree_root_unreadable_and_scan_found_nothing, 263) \
- x(snapshot_node_missing, 264) \
- x(dup_backpointer_to_bad_csum_extent, 265) \
- x(btree_bitmap_not_marked, 266) \
- x(sb_clean_entry_overrun, 267) \
- x(btree_ptr_v2_written_0, 268) \
- x(subvol_snapshot_bad, 269) \
- x(subvol_inode_bad, 270)
-
-enum bch_sb_error_id {
-#define x(t, n) BCH_FSCK_ERR_##t = n,
- BCH_SB_ERRS()
-#undef x
- BCH_SB_ERR_MAX
-};
-
struct bch_sb_error_entry_cpu {
u64 id:16,
nr:48;
@@ -293,4 +13,3 @@ struct bch_sb_error_entry_cpu {
typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
-
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
new file mode 100644
index 000000000000..e2630548c0f6
--- /dev/null
+++ b/fs/bcachefs/sb-members_format.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
+#define _BCACHEFS_SB_MEMBERS_FORMAT_H
+
+/*
+ * We refer to members with bitmasks in various places - but we need to get rid
+ * of this limit:
+ */
+#define BCH_SB_MEMBERS_MAX 64
+
+#define BCH_MIN_NR_NBUCKETS (1 << 6)
+
+#define BCH_IOPS_MEASUREMENTS() \
+ x(seqread, 0) \
+ x(seqwrite, 1) \
+ x(randread, 2) \
+ x(randwrite, 3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+ BCH_IOPS_MEASUREMENTS()
+#undef x
+ BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
+
+struct bch_member {
+ __uuid_t uuid;
+ __le64 nbuckets; /* device size */
+ __le16 first_bucket; /* index of first bucket used */
+ __le16 bucket_size; /* sectors */
+ __u8 btree_bitmap_shift;
+ __u8 pad[3];
+ __le64 last_mount; /* time_t */
+
+ __le64 flags;
+ __le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
+ __le64 seq;
+ __le64 btree_allocated_bitmap;
+ /*
+ * On recovery from a clean shutdown we don't normally read the journal,
+ * but we still want to resume writing from where we left off so we
+ * don't overwrite more than is necessary, for list journal debugging:
+ */
+ __le32 last_journal_bucket;
+ __le32 last_journal_bucket_offset;
+};
+
+/*
+ * This limit comes from the bucket_gens array - it's a single allocation, and
+ * kernel allocations are limited to INT_MAX
+ */
+#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
+
+#define BCH_MEMBER_V1_BYTES 56
+
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags, 30, 31)
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+#define BCH_MEMBER_STATES() \
+ x(rw, 0) \
+ x(ro, 1) \
+ x(failed, 2) \
+ x(spare, 3)
+
+enum bch_member_state {
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+ BCH_MEMBER_STATES()
+#undef x
+ BCH_MEMBER_STATE_NR
+};
+
+struct bch_sb_field_members_v1 {
+ struct bch_sb_field field;
+ struct bch_member _members[]; // members are now variable size
+};
+
+struct bch_sb_field_members_v2 {
+ struct bch_sb_field field;
+ __le16 member_bytes; // size of a single member entry
+ u8 pad[6];
+ struct bch_member _members[];
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
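The LE64_BITMASK() declarations above name bit ranges [start, end) inside the little-endian flags word; the macro generates typed getter/setter helpers. A hedged userspace sketch of just the bit arithmetic (byte-order conversion and the generated function names are omitted):

	#include <assert.h>
	#include <stdint.h>

	/* Extract/insert bits [lo, hi) of a 64-bit word; sketch only. */
	static inline uint64_t bits_get(uint64_t v, unsigned lo, unsigned hi)
	{
		return (v << (64 - hi)) >> (64 - hi + lo);
	}

	static inline uint64_t bits_set(uint64_t v, unsigned lo, unsigned hi,
					uint64_t field)
	{
		uint64_t mask = (~0ULL >> (64 - (hi - lo))) << lo;

		return (v & ~mask) | ((field << lo) & mask);
	}

	int main(void)
	{
		uint64_t flags = 0;

		/* BCH_MEMBER_STATE lives in bits [0, 4) of bch_member.flags */
		flags = bits_set(flags, 0, 4, 2);	/* e.g. the "failed" state */
		assert(bits_get(flags, 0, 4) == 2);
		return 0;
	}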
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
index c1860d8163fb..c4b3d8d3f414 100644
--- a/fs/bcachefs/seqmutex.h
+++ b/fs/bcachefs/seqmutex.h
@@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock)
static inline void seqmutex_lock(struct seqmutex *lock)
{
mutex_lock(&lock->lock);
-}
-
-static inline void seqmutex_unlock(struct seqmutex *lock)
-{
lock->seq++;
- mutex_unlock(&lock->lock);
}
-static inline u32 seqmutex_seq(struct seqmutex *lock)
+static inline u32 seqmutex_unlock(struct seqmutex *lock)
{
- return lock->seq;
+ u32 seq = lock->seq;
+ mutex_unlock(&lock->lock);
+ return seq;
}
static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
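The reworked seqmutex bumps the sequence number on lock and hands the caller the sequence on unlock, so seqmutex_relock() can later tell whether anyone took the lock in between (in which case a saved cursor into a protected list is stale). A hedged userspace analogue using pthreads — the same protocol, not the kernel implementation:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdint.h>

	struct seqmutex {
		pthread_mutex_t lock;
		uint32_t seq;
	};

	static void seqmutex_lock(struct seqmutex *l)
	{
		pthread_mutex_lock(&l->lock);
		l->seq++;			/* bump on lock, as in the patch */
	}

	static uint32_t seqmutex_unlock(struct seqmutex *l)
	{
		uint32_t seq = l->seq;		/* capture while still holding the lock */
		pthread_mutex_unlock(&l->lock);
		return seq;
	}

	/* Succeeds only if nobody locked the seqmutex since seq was captured. */
	static bool seqmutex_relock(struct seqmutex *l, uint32_t seq)
	{
		if (l->seq != seq || pthread_mutex_trylock(&l->lock))
			return false;
		if (l->seq != seq) {		/* recheck now that we hold the lock */
			pthread_mutex_unlock(&l->lock);
			return false;
		}
		return true;
	}

	int main(void)
	{
		struct seqmutex l = { PTHREAD_MUTEX_INITIALIZER, 0 };
		uint32_t seq;

		seqmutex_lock(&l);
		seq = seqmutex_unlock(&l);

		if (seqmutex_relock(&l, seq))	/* nobody intervened: cursor valid */
			seqmutex_unlock(&l);
		return 0;
	}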
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 629900a5e641..24023d6a9698 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
+ if (unlikely(new_bytes > INT_MAX))
+ return NULL;
+
new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
@@ -1042,6 +1045,25 @@ err:
return ret;
}
+int bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
+ bkey_in_missing_snapshot,
+ "key in missing snapshot %s, delete?",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1351,35 +1373,39 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
-static int snapshot_delete_key(struct btree_trans *trans,
+static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
snapshot_id_list *deleted,
snapshot_id_list *equiv_seen,
struct bpos *last_pos)
{
+ int ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
struct bch_fs *c = trans->c;
u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
+ return 0;
if (!bkey_eq(k.k->p, *last_pos))
equiv_seen->nr = 0;
- *last_pos = k.k->p;
- if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
- snapshot_list_has_id(equiv_seen, equiv)) {
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot))
return bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node);
- } else {
- return snapshot_list_add(c, equiv_seen, equiv);
- }
-}
-static int move_key_to_correct_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ if (!bpos_eq(*last_pos, k.k->p) &&
+ snapshot_list_has_id(equiv_seen, equiv))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
+
+ *last_pos = k.k->p;
+
+ ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
+ if (ret)
+ return ret;
/*
* When we have a linear chain of snapshot nodes, we consider
@@ -1389,21 +1415,20 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans,
*
* If there are multiple keys in different snapshots at the same
* position, we're only going to keep the one in the newest
- * snapshot - the rest have been overwritten and are redundant,
- * and for the key we're going to keep we need to move it to the
- * equivalance class ID if it's not there already.
+ * snapshot (we delete the others above) - the rest have been
+ * overwritten and are redundant, and for the key we're going to keep we
+ * need to move it to the equivalence class ID if it's not there
+ * already.
*/
if (equiv != k.k->p.snapshot) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- struct btree_iter new_iter;
- int ret;
-
- ret = PTR_ERR_OR_ZERO(new);
+ int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
new->k.p.snapshot = equiv;
+ struct btree_iter new_iter;
bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
BTREE_ITER_all_snapshots|
BTREE_ITER_cached|
@@ -1538,19 +1563,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
struct btree_trans *trans;
snapshot_id_list deleted = { 0 };
snapshot_id_list deleted_interior = { 0 };
- u32 id;
int ret = 0;
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- if (!test_bit(BCH_FS_started, &c->flags)) {
- ret = bch2_fs_read_write_early(c);
- bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
- if (ret)
- return ret;
- }
-
trans = bch2_trans_get(c);
/*
@@ -1585,33 +1602,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
if (ret)
goto err;
- for (id = 0; id < BTREE_ID_NR; id++) {
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
struct bpos last_pos = POS_MIN;
snapshot_id_list equiv_seen = { 0 };
struct disk_reservation res = { 0 };
- if (!btree_type_has_snapshots(id))
- continue;
-
- /*
- * deleted inodes btree is maintained by a trigger on the inodes
- * btree - no work for us to do here, and it's not safe to scan
- * it because we'll see out of date keys due to the btree write
- * buffer:
- */
- if (id == BTREE_ID_deleted_inodes)
+ if (!btree_type_has_snapshots(btree))
continue;
ret = for_each_btree_key_commit(trans, iter,
- id, POS_MIN,
+ btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
- snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
- for_each_btree_key_commit(trans, iter,
- id, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc,
- move_key_to_correct_snapshot(trans, &iter, k));
+ delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
+ &equiv_seen, &last_pos));
bch2_disk_reservation_put(c, &res);
darray_exit(&equiv_seen);
@@ -1679,6 +1683,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
+
bch2_delete_dead_snapshots(c);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
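delete_dead_snapshots_process_key() folds the old two passes into one: for each key position it verifies the key has a valid snapshot, deletes keys in deleted snapshots, deletes all but the first (newest) key per snapshot equivalence class, and rewrites the survivor to the equivalence-class ID. A loose, runnable sketch of just the per-position dedup (hypothetical types; the real code walks btree iterators):

	#include <assert.h>
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	struct key { uint64_t pos; uint32_t equiv; };	/* equiv = class of snapshot */

	/* Keys arrive sorted by pos; keep only the first key of each class per pos. */
	static bool keep_key(const struct key *k, uint64_t *last_pos,
			     uint32_t *seen, size_t *nr_seen)
	{
		if (k->pos != *last_pos) {	/* new position: reset the seen set */
			*last_pos = k->pos;
			*nr_seen = 0;
		}

		for (size_t i = 0; i < *nr_seen; i++)
			if (seen[i] == k->equiv)
				return false;	/* overwritten older version: delete */

		seen[(*nr_seen)++] = k->equiv;
		return true;
	}

	int main(void)
	{
		struct key ks[] = { {1, 7}, {1, 7}, {1, 9}, {2, 7} };
		bool expect[] = { true, false, true, true };
		uint64_t last_pos = UINT64_MAX;
		uint32_t seen[8];
		size_t nr_seen = 0;

		for (size_t i = 0; i < 4; i++)
			assert(keep_key(&ks[i], &last_pos, seen, &nr_seen) == expect[i]);
		return 0;
	}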
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index ab13d8f4b41e..31b0ee03e962 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -242,6 +242,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
int bch2_reconstruct_snapshots(struct bch_fs *);
+int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index cbad9b27874f..c8c266cb5797 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -300,7 +300,7 @@ not_found:
if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
} else if (found && (flags & STR_HASH_must_create)) {
- ret = -EEXIST;
+ ret = -BCH_ERR_EEXIST_str_hash_set;
} else {
if (!found && slot.path)
swap(iter, slot);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index f1bee6c5222d..b156fc85b8a3 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -649,9 +649,10 @@ reread:
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) {
- prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
- bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+ u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
+ if (bytes > sb_size) {
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
+ bytes, sb_size);
return -BCH_ERR_invalid_sb_too_big;
}
@@ -1132,18 +1133,12 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
* c->sb will be checked before we write the superblock, so update it as
* well:
*/
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
- }
- if (c->sb.version > bcachefs_metadata_version_current) {
+ if (c->sb.version > bcachefs_metadata_version_current)
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- c->sb.version = bcachefs_metadata_version_current;
- }
- if (c->sb.version_min > bcachefs_metadata_version_current) {
+ if (c->sb.version_min > bcachefs_metadata_version_current)
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
- c->sb.version_min = bcachefs_metadata_version_current;
- }
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
@@ -1316,15 +1311,15 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
- prt_str(out, "Label:\t");
+ prt_printf(out, "Label:\t");
prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);
- prt_str(out, "Version:\t");
+ prt_printf(out, "Version:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);
- prt_str(out, "Version upgrade complete:\t");
+ prt_printf(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);
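For reference, the bound being checked is 512 bytes shifted by the layout's sb_max_size_bits, clamped to BCH_SB_LAYOUT_SIZE_BITS_MAX; computing it once as a u64 also fixes the old %lu/%llu format mismatch in the error message. A small sketch of the arithmetic (the clamp value of 16 is an assumption about the current headers):

	#include <stdint.h>
	#include <stdio.h>

	#define SB_LAYOUT_SIZE_BITS_MAX 16	/* assumption, see above */

	static uint64_t sb_max_bytes(unsigned sb_max_size_bits)
	{
		unsigned bits = sb_max_size_bits < SB_LAYOUT_SIZE_BITS_MAX ?
				sb_max_size_bits : SB_LAYOUT_SIZE_BITS_MAX;

		return 512ULL << bits;	/* 512-byte sectors */
	}

	int main(void)
	{
		printf("%llu\n", (unsigned long long)sb_max_bytes(11)); /* 1 MiB */
		return 0;
	}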
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 2206a8dee693..da735608d47c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -536,7 +536,6 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
- bch2_fs_allocator_background_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@@ -564,8 +563,11 @@ static void __bch2_fs_free(struct bch_fs *c)
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
- EBUG_ON(percpu_u64_get(c->online_reserved));
- free_percpu(c->online_reserved);
+ if (c->online_reserved) {
+ u64 v = percpu_u64_get(c->online_reserved);
+ WARN(v, "online_reserved not 0 at shutdown: %lli", v);
+ free_percpu(c->online_reserved);
+ }
darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
@@ -582,8 +584,10 @@ static void __bch2_fs_free(struct bch_fs *c)
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
- if (c->io_complete_wq)
- destroy_workqueue(c->io_complete_wq);
+ if (c->btree_write_submit_wq)
+ destroy_workqueue(c->btree_write_submit_wq);
+ if (c->btree_read_complete_wq)
+ destroy_workqueue(c->btree_read_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->btree_io_complete_wq)
@@ -878,8 +882,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+ !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -908,9 +914,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
bch2_fs_replicas_init(c) ?:
+ bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
@@ -927,12 +933,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_member_exists(c->disk_sb.sb, i) &&
- bch2_dev_alloc(c, i)) {
- ret = -EEXIST;
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ if (!bch2_member_exists(c->disk_sb.sb, i))
+ continue;
+ ret = bch2_dev_alloc(c, i);
+ if (ret)
goto err;
- }
+ }
bch2_journal_entry_res_resize(&c->journal,
&c->btree_root_journal_res,
@@ -1190,6 +1197,7 @@ static void bch2_dev_free(struct bch_dev *ca)
kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
+ bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
@@ -1312,6 +1320,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
atomic_long_set(&ca->ref, 1);
#endif
+ bch2_dev_allocator_background_init(ca);
+
if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
@@ -1524,6 +1534,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
* The allocator thread itself allocates btree nodes, so stop it first:
*/
bch2_dev_allocator_remove(c, ca);
+ bch2_recalc_capacity(c);
bch2_dev_journal_stop(&c->journal, ca);
}
@@ -1535,6 +1546,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ bch2_dev_do_discards(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1760,7 +1772,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
- ret = bch2_dev_journal_alloc(ca);
+ ret = bch2_dev_journal_alloc(ca, true);
bch_err_msg(c, ret, "allocating journal");
if (ret)
goto err;
@@ -1920,7 +1932,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
if (!ca->journal.nr) {
- ret = bch2_dev_journal_alloc(ca);
+ ret = bch2_dev_journal_alloc(ca, false);
bch_err_msg(ca, ret, "allocating journal");
if (ret)
goto err;
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index de331dec2a99..4ec7e44d6e36 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -252,8 +252,10 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
}
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
+static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
+ bool nonblocking)
{
+ bool locked = false;
const char *p;
if (!lines) {
@@ -261,7 +263,13 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
return;
}
- console_lock();
+ if (!nonblocking) {
+ console_lock();
+ locked = true;
+ } else {
+ locked = console_trylock();
+ }
+
while (1) {
p = strchrnul(lines, '\n');
printk("%s%.*s\n", prefix, (int) (p - lines), lines);
@@ -269,7 +277,18 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
break;
lines = p + 1;
}
- console_unlock();
+ if (locked)
+ console_unlock();
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+ return __bch2_print_string_as_lines(prefix, lines, false);
+}
+
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
+{
+ return __bch2_print_string_as_lines(prefix, lines, true);
}
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
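The nonblocking variant differs only in lock acquisition: if console_trylock() fails, the lines are still emitted through printk() — they just may interleave with other console output, since the lock was never held. That makes it callable from contexts that must not sleep. A hedged usage sketch (the atomic-context caller is an assumption, not from this patch):

	/* From an error path that may run in atomic context: */
	bch2_print_string_as_lines_nonblocking(KERN_ERR "bcachefs: ", buf.buf);

	/* From ordinary process context, where sleeping on console_lock is fine: */
	bch2_print_string_as_lines(KERN_ERR "bcachefs: ", buf.buf);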
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 5d2c470a49ac..5b0533ec4c7e 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -315,6 +315,7 @@ void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
typedef DARRAY(unsigned long) bch_stacktrace;
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d76f406d3b2e..f92f108840f5 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -475,6 +475,7 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
char *link = folio_address(folio);
+ int err = -EIO;
if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
@@ -487,13 +488,10 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
goto fail;
}
link[len - 1] = '\0';
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
+ err = 0;
fail:
- folio_set_error(folio);
- folio_unlock(folio);
- return -EIO;
+ folio_end_read(folio, err == 0);
+ return err;
}
/*
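folio_end_read() folds the old completion sequence — mark uptodate on success, set the (now removed) error flag on failure, unlock — into one call whose boolean argument is the read's success. A hedged kernel-style sketch of the conversion, for illustration rather than compilation:

	/* Before: open-coded read completion */
	if (ok)
		folio_mark_uptodate(folio);
	folio_unlock(folio);
	return ok ? 0 : -EIO;

	/* After: one call; the flag decides whether the folio is uptodate */
	folio_end_read(folio, ok);
	return ok ? 0 : -EIO;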
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a43897b03ce9..5ae8045f4df4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1003,7 +1003,8 @@ out_free_interp:
if (elf_read_implies_exec(*elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ const int snapshot_randomize_va_space = READ_ONCE(randomize_va_space);
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && snapshot_randomize_va_space)
current->flags |= PF_RANDOMIZE;
setup_new_exec(bprm);
@@ -1061,10 +1062,40 @@ out_free_interp:
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
* Program Headers.
+ */
+
+ /*
+ * Calculate the entire size of the ELF mapping
+ * (total_size), used for the initial mapping,
+ * due to load_addr_set which is set to true later
+ * once the initial mapping is performed.
+ *
+ * Note that this is only sensible when the LOAD
+ * segments are contiguous (or overlapping). If
+ * used for LOADs that are far apart, this would
+ * cause the holes between LOADs to be mapped,
+ * running the risk of having the mapping fail,
+ * as it would be larger than the ELF file itself.
*
+ * As a result, only ET_DYN does this, since
+ * some ET_EXEC (e.g. ia64) may have large virtual
+ * memory holes between LOADs.
+ *
+ */
+ total_size = total_mapping_size(elf_phdata,
+ elf_ex->e_phnum);
+ if (!total_size) {
+ retval = -EINVAL;
+ goto out_free_dentry;
+ }
+
+ /* Calculate any requested alignment. */
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+
+ /*
* There are effectively two types of ET_DYN
- * binaries: programs (i.e. PIE: ET_DYN with INTERP)
- * and loaders (ET_DYN without INTERP, since they
+ * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP)
+ * and loaders (ET_DYN without PT_INTERP, since they
* _are_ the ELF interpreter). The loaders must
* be loaded away from programs since the program
* may otherwise collide with the loader (especially
@@ -1084,15 +1115,44 @@ out_free_interp:
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
if (interpreter) {
+ /* On ET_DYN with PT_INTERP, we do the ASLR. */
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
- alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+ /* Adjust alignment as requested. */
if (alignment)
load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED_NOREPLACE;
- } else
- load_bias = 0;
+ } else {
+ /*
+ * For ET_DYN without PT_INTERP, we rely on
+ * the architecture's (potentially ASLR) mmap
+ * base address (via a load_bias of 0).
+ *
+ * When a large alignment is requested, we
+ * must do the allocation at address "0" right
+ * now to discover where things will load so
+ * that we can adjust the resulting alignment.
+ * In this case (load_bias != 0), we can use
+ * MAP_FIXED_NOREPLACE to make sure the mapping
+ * doesn't collide with anything.
+ */
+ if (alignment > ELF_MIN_ALIGN) {
+ load_bias = elf_load(bprm->file, 0, elf_ppnt,
+ elf_prot, elf_flags, total_size);
+ if (BAD_ADDR(load_bias)) {
+ retval = IS_ERR_VALUE(load_bias) ?
+ PTR_ERR((void*)load_bias) : -EINVAL;
+ goto out_free_dentry;
+ }
+ vm_munmap(load_bias, total_size);
+ /* Adjust alignment as requested. */
+ if (alignment)
+ load_bias &= ~(alignment - 1);
+ elf_flags |= MAP_FIXED_NOREPLACE;
+ } else
+ load_bias = 0;
+ }
/*
* Since load_bias is used for all subsequent loading
@@ -1102,31 +1162,6 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
-
- /*
- * Calculate the entire size of the ELF mapping
- * (total_size), used for the initial mapping,
- * due to load_addr_set which is set to true later
- * once the initial mapping is performed.
- *
- * Note that this is only sensible when the LOAD
- * segments are contiguous (or overlapping). If
- * used for LOADs that are far apart, this would
- * cause the holes between LOADs to be mapped,
- * running the risk of having the mapping fail,
- * as it would be larger than the ELF file itself.
- *
- * As a result, only ET_DYN does this, since
- * some ET_EXEC (e.g. ia64) may have large virtual
- * memory holes between LOADs.
- *
- */
- total_size = total_mapping_size(elf_phdata,
- elf_ex->e_phnum);
- if (!total_size) {
- retval = -EINVAL;
- goto out_free_dentry;
- }
}
error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
@@ -1216,7 +1251,6 @@ out_free_interp:
}
reloc_func_desc = interp_load_addr;
- allow_write_access(interpreter);
fput(interpreter);
kfree(interp_elf_ex);
@@ -1251,7 +1285,7 @@ out_free_interp:
mm->end_data = end_data;
mm->start_stack = bprm->p;
- if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+ if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) {
/*
* For architectures with ELF randomization, when executing
* a loader directly (i.e. no interpreter listed in ELF
@@ -1308,7 +1342,6 @@ out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
out_free_file:
- allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
out_free_ph:
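In both ET_DYN branches the randomized base is snapped to the requested power-of-two alignment with load_bias &= ~(alignment - 1), i.e. rounded down so the low log2(alignment) bits are cleared. A small runnable sketch of that arithmetic:

	#include <assert.h>
	#include <stdint.h>

	/* Round addr down to a power-of-two alignment, as the ELF loader
	 * does with load_bias &= ~(alignment - 1). */
	static uint64_t align_down(uint64_t addr, uint64_t alignment)
	{
		assert(alignment && (alignment & (alignment - 1)) == 0);
		return addr & ~(alignment - 1);
	}

	int main(void)
	{
		/* 2 MiB alignment clears the low 21 bits: */
		assert(align_down(0x7f3a12345678, 0x200000) == 0x7f3a12200000);
		return 0;
	}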
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index b799701454a9..28a3439f163a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -394,7 +394,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
goto error;
}
- allow_write_access(interpreter);
fput(interpreter);
interpreter = NULL;
}
@@ -466,10 +465,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = 0;
error:
- if (interpreter) {
- allow_write_access(interpreter);
+ if (interpreter)
fput(interpreter);
- }
kfree(interpreter_name);
kfree(exec_params.phdrs);
kfree(exec_params.loadmap);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 68fa225f89e5..31660d8cc2c6 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -247,13 +247,10 @@ static int load_misc_binary(struct linux_binprm *bprm)
if (retval < 0)
goto ret;
- if (fmt->flags & MISC_FMT_OPEN_FILE) {
+ if (fmt->flags & MISC_FMT_OPEN_FILE)
interp_file = file_clone_open(fmt->interp_file);
- if (!IS_ERR(interp_file))
- deny_write_access(interp_file);
- } else {
+ else
interp_file = open_exec(fmt->interpreter);
- }
retval = PTR_ERR(interp_file);
if (IS_ERR(interp_file))
goto ret;
@@ -1086,4 +1083,5 @@ static void __exit exit_misc_binfmt(void)
core_initcall(init_misc_binfmt);
module_exit(exit_misc_binfmt);
+MODULE_DESCRIPTION("Kernel support for miscellaneous binaries");
MODULE_LICENSE("GPL");
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 1b6625e95958..637daf6e4d45 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -155,4 +155,5 @@ static void __exit exit_script_binfmt(void)
core_initcall(init_script_binfmt);
module_exit(exit_script_binfmt);
+MODULE_DESCRIPTION("Kernel support for scripts starting with #!");
MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 477f350a8bd0..e3a57196b0ee 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -741,7 +741,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
ret = btrfs_bio_csum(bbio);
if (ret)
goto fail_put_bio;
- } else if (use_append) {
+ } else if (use_append ||
+ (btrfs_is_zoned(fs_info) && inode &&
+ inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
if (ret)
goto fail_put_bio;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 1e09aeea69c2..60066822b532 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1785,6 +1785,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
+ LIST_HEAD(retry_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
@@ -1921,8 +1922,20 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
next:
- if (ret)
- btrfs_mark_bg_to_reclaim(bg);
+ if (ret) {
+ /* Refcount held by the reclaim_bgs list after splice. */
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * This block group might have been added to the
+ * unused list during the above process. If it
+ * wasn't, queue it on the retry list so it is
+ * moved back to the reclaim list at the end.
+ */
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, &retry_list);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ }
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -1942,6 +1955,9 @@ next:
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
end:
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
}
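The retry_list used here is a common kernel idiom: while holding the shared lock, take a reference and park the item on a private local list, then splice the whole private list back under the same lock once iteration is done, so the references simply transfer. A hedged structural sketch (item, take_ref and the lock/list names are illustrative):

	LIST_HEAD(retry_list);

	/* per item, under the shared lock: */
	spin_lock(&shared_lock);
	if (list_empty(&item->link)) {		/* not queued anywhere else */
		take_ref(item);			/* ref now owned by retry_list */
		list_add_tail(&item->link, &retry_list);
	}
	spin_unlock(&shared_lock);

	/* once, at the end: */
	spin_lock(&shared_lock);
	list_splice_tail(&retry_list, &shared_list);	/* refs move with entries */
	spin_unlock(&shared_lock);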
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 91c994b569f3..6ed495ca7a31 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -89,6 +89,16 @@ enum {
BTRFS_INODE_FREE_SPACE_INODE,
/* Set when there are no capabilities in XATTs for the inode. */
BTRFS_INODE_NO_CAP_XATTR,
+ /*
+ * Set if an error happened when doing a COW write before submitting a
+ * bio or during writeback. Used for both buffered writes and direct IO
+ * writes. This is to signal a fast fsync that it has to wait for
+ * ordered extents to complete and therefore not log extent maps that
+ * point to unwritten extents (when an ordered extent completes and it
+ * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
+ * range).
+ */
+ BTRFS_INODE_COW_WRITE_ERROR,
};
/* in memory btrfs inode */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1b20b3e390df..cabb558dbdaa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -4538,18 +4540,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
- struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_delayed_ref_node *ref;
- delayed_refs = &trans->delayed_refs;
-
spin_lock(&delayed_refs->lock);
- if (atomic_read(&delayed_refs->num_entries) == 0) {
- spin_unlock(&delayed_refs->lock);
- btrfs_debug(fs_info, "delayed_refs has NO entry");
- return;
- }
-
while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 597387e9f040..958155cc43a8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3553,7 +3553,7 @@ err:
for (int i = 0; i < num_folios; i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
- __folio_put(eb->folios[i]);
+ folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer(
struct folio *folio = page_folio(page);
struct extent_buffer *exists;
+ lockdep_assert_held(&page->mapping->i_private_lock);
+
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
@@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct btrfs_subpage *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio;
+ struct folio *existing_folio = NULL;
int ret;
ASSERT(found_eb_ret);
@@ -3774,12 +3777,14 @@ retry:
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
- return 0;
+ goto finish;
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio))
+ if (IS_ERR(existing_folio)) {
+ existing_folio = NULL;
goto retry;
+ }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -3790,14 +3795,13 @@ retry:
return -EAGAIN;
}
- if (fs_info->nodesize < PAGE_SIZE) {
- /*
- * We're going to reuse the existing page, can drop our page
- * and subpage structure now.
- */
+finish:
+ spin_lock(&mapping->i_private_lock);
+ if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ /* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
- } else {
+ } else if (existing_folio) {
struct extent_buffer *existing_eb;
existing_eb = grab_extent_buffer(fs_info,
@@ -3805,6 +3809,7 @@ retry:
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
+ spin_unlock(&mapping->i_private_lock);
folio_unlock(existing_folio);
folio_put(existing_folio);
return 1;
@@ -3813,6 +3818,22 @@ retry:
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
}
+ eb->folio_size = folio_size(eb->folios[i]);
+ eb->folio_shift = folio_shift(eb->folios[i]);
+ /* Should not fail, as we have preallocated the memory. */
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
+ ASSERT(!ret);
+ /*
+ * Inform that we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the folio private when the
+ * eb hasn't been inserted into the radix tree yet.
+ *
+ * The ref will be decreased when the eb releases the page, in
+ * detach_extent_buffer_page(). Thus needs no special handling in the
+ * error path.
+ */
+ btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
@@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
@@ -3890,7 +3910,7 @@ reallocate:
for (int i = 0; i < num_folios; i++) {
struct folio *folio;
- ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
if (ret > 0) {
ASSERT(existing_eb);
goto out;
@@ -3927,24 +3947,6 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- eb->folio_size = folio_size(folio);
- eb->folio_shift = folio_shift(folio);
- spin_lock(&mapping->i_private_lock);
- /* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_folio(eb, folio, prealloc);
- ASSERT(!ret);
- /*
- * To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the folio private
- * when the eb hasn't yet been inserted into radix tree.
- *
- * The ref will be decreased when the eb released the page, in
- * detach_extent_buffer_page().
- * Thus needs no special handling in error path.
- */
- btrfs_folio_inc_eb_refs(fs_info, folio);
- spin_unlock(&mapping->i_private_lock);
-
WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
/*
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 744e8952abb0..b4c9a6aa118c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1028,7 +1028,14 @@ out_free_pre:
return ret;
}
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+ long nr_to_scan;
+ long scanned;
+ u64 last_ino;
+ u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
@@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- write_lock(&tree->lock);
+ /*
+ * We want to be fast because we can be called from any path trying to
+ * allocate memory, so if the lock is busy we don't want to spend time
+ * waiting for it - either some task is about to do IO for the inode or
+ * we may have another task shrinking extent maps, here in this code, so
+ * skip this inode.
+ */
+ if (!write_trylock(&tree->lock)) {
+ up_read(&inode->i_mmap_lock);
+ return 0;
+ }
+
node = rb_first_cached(&tree->map);
while (node) {
struct extent_map *em;
em = rb_entry(node, struct extent_map, rb_node);
node = rb_next(node);
- (*scanned)++;
+ ctx->scanned++;
if (em->flags & EXTENT_FLAG_PINNED)
goto next;
@@ -1085,16 +1103,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
free_extent_map(em);
nr_dropped++;
next:
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
- * Restart if we had to reschedule, and any extent maps that were
- * pinned before may have become unpinned after we released the
- * lock and took it again.
+ * Stop if we need to reschedule or there's contention on the
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock and because the shrinker might be called during a memory
+ * allocation path and we want to avoid taking a very long time
+ * and slowing down all sorts of tasks.
*/
- if (cond_resched_rwlock_write(&tree->lock))
- node = rb_first_cached(&tree->map);
+ if (need_resched() || rwlock_needbreak(&tree->lock))
+ break;
}
write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
@@ -1102,25 +1122,30 @@ next:
return nr_dropped;
}
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
- u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+ u64 min_ino = ctx->last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
- fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
- iput(&inode->vfs_inode);
+ ctx->last_ino = btrfs_ino(inode);
+ btrfs_add_delayed_iput(inode);
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * We may be called from memory allocation paths, so we don't
+ * want to take too much time and slow down tasks.
+ */
+ if (need_resched())
break;
- cond_resched();
inode = btrfs_find_first_inode(root, min_ino);
}
@@ -1132,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+ ctx->last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
- fs_info->extent_map_shrinker_last_ino = 0;
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+ ctx->last_ino = 0;
+ ctx->last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
@@ -1147,19 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
- const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
- u64 next_root_id = start_root_id;
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
- long scanned = 0;
+
+ ctx.scanned = 0;
+ ctx.nr_to_scan = nr_to_scan;
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+ * one start from the next inode in case it starts before we finish.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+ fs_info->extent_map_shrinker_last_ino++;
+ ctx.last_root = fs_info->extent_map_shrinker_last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ start_root_id = ctx.last_root;
+ next_root_id = ctx.last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
- while (scanned < nr_to_scan) {
+ /*
+ * We may be called from memory allocation paths, so we don't want to
+ * take too much time and slow down tasks, so stop if we need to reschedule.
+ */
+ while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
struct btrfs_root *root;
unsigned long count;
@@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
- fs_info->extent_map_shrinker_last_root = 0;
- fs_info->extent_map_shrinker_last_ino = 0;
+ ctx.last_root = 0;
+ ctx.last_ino = 0;
cycled = true;
continue;
}
@@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
continue;
if (is_fstree(btrfs_root_id(root)))
- nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
}
+ /*
+ * In case multiple tasks run this extent map shrinking code, this
+ * isn't perfect, but it's simple and silences things like KCSAN. It's
+ * not possible to know which task made more progress because we can
+ * cycle back to the first root and first inode if it's not the first
+ * time the shrinker ran, see the above logic. Also a task that started
+ * later may finish earlier than another task while having made less
+ * progress. So keep this simple and update to the progress of the last
+ * task that finished, with the occasional possibility of two
+ * consecutive runs of the shrinker processing the same inodes.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+ fs_info->extent_map_shrinker_last_root = ctx.last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
return nr_dropped;
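The shrinker now snapshots the shared cursor into a per-call context under extent_map_shrinker_lock, bumps last_ino so a concurrent caller starts one inode further, and writes its own final position back when done (last finisher wins). A hedged kernel-style sketch of that protocol (struct and function names are illustrative):

	struct shrink_pos { spinlock_t lock; u64 last_root, last_ino; };
	struct shrink_ctx { u64 last_root, last_ino; };

	static void shrink_begin(struct shrink_pos *pos, struct shrink_ctx *ctx)
	{
		spin_lock(&pos->lock);
		ctx->last_root = pos->last_root;
		ctx->last_ino  = pos->last_ino;
		pos->last_ino++;	/* a concurrent run starts at the next inode */
		spin_unlock(&pos->lock);
	}

	static void shrink_end(struct shrink_pos *pos, const struct shrink_ctx *ctx)
	{
		spin_lock(&pos->lock);
		pos->last_root = ctx->last_root;	/* last finisher wins; */
		pos->last_ino  = ctx->last_ino;		/* good enough, see above */
		spin_unlock(&pos->lock);
	}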
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e764ac3f22e2..d90138683a0a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
+ clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
@@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
&ctx.ordered_extents);
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ if (ret)
+ goto out_release_extents;
+
+ /*
+ * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
+ * starting and waiting for writeback, because for buffered IO
+ * it may have been set during the end IO callback
+ * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
+ * case an error happened and we need to wait for ordered
+ * extents to complete so that any extent maps that point to
+ * unwritten locations are dropped and we don't log them.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = btrfs_wait_ordered_range(inode, start, len);
}
if (ret)
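The flag protocol here is: the IO-completion side sets BTRFS_INODE_COW_WRITE_ERROR when a COW write fails, and fsync consumes it atomically with test_and_clear_bit() after waiting for writeback, falling back to a full wait for ordered extents. A runnable userspace sketch of the same atomic set/consume handshake (names are illustrative):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Userspace sketch of the set_bit / test_and_clear_bit protocol used
	 * for BTRFS_INODE_COW_WRITE_ERROR (names here are illustrative). */
	static atomic_ulong runtime_flags;
	#define COW_WRITE_ERROR (1UL << 0)

	static void io_completion_saw_error(void)
	{
		atomic_fetch_or(&runtime_flags, COW_WRITE_ERROR);
	}

	static bool fsync_must_wait_for_ordered_extents(void)
	{
		/* Atomically consume the flag: only one fsync pays the cost. */
		return atomic_fetch_and(&runtime_flags, ~COW_WRITE_ERROR) &
		       COW_WRITE_ERROR;
	}

	int main(void)
	{
		io_completion_saw_error();
		printf("%d %d\n", fsync_must_wait_for_ordered_extents(),
		       fsync_must_wait_for_ordered_extents()); /* 1 0 */
		return 0;
	}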
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3ab8dea5036b..dabc3d0793cf 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2697,7 +2697,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
int bg_reclaim_threshold = 0;
- bool initial = (size == block_group->length);
+ bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 89f0650631cd..833dc3fe0a38 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -630,6 +630,7 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
+ spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 753db965f7c0..d62c96f00ff8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -10385,7 +10385,7 @@ out_unlock:
out_folios:
for (i = 0; i < nr_folios; i++) {
if (folios[i])
- __folio_put(folios[i]);
+ folio_put(folios[i]);
}
kvfree(folios);
out:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index efd5d6e9589e..6ad524b894fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_iov;
init_sync_kiocb(&kiocb, file);
- ret = kiocb_set_rw_flags(&kiocb, 0);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
if (ret)
goto out_iov;
kiocb.ki_pos = pos;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c5bdd674f55c..35a413ce935d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ /*
+ * If this is a COW write it means we created new extent maps for the
+ * range and they point to unwritten locations if we got an error either
+ * before submitting a bio or during IO.
+ *
+ * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
+ * are queuing its completion below. During completion, at
+ * btrfs_finish_one_ordered(), we will drop the extent maps for the
+ * unwritten extents.
+ *
+ * However because completion runs in a work queue we can end up having
+ * a fast fsync running before that. In the case of direct IO, we queue
+ * the completion before unlocking the inode, so the fsync might start
+ * as soon as the inode is unlocked. In the case of buffered IO, when writeback
+ * finishes (end_bbio_data_write()) we queue the completion, so if the
+ * writeback was triggered by a fast fsync, the fsync might start
+ * logging before ordered extent completion runs in the work queue.
+ *
+ * The fast fsync will log file extent items based on the extent maps it
+ * finds, so if by the time it collects extent maps the ordered extent
+ * completion didn't happen yet, it will log file extent items that
+ * point to unwritten extents, resulting in a corruption if a crash
+ * happens and the log tree is replayed. Note that a fast fsync does not
+ * wait for completion of ordered extents in order to reduce latency.
+ *
+ * Set a flag in the inode so that the next fast fsync will wait for
+ * ordered extents to complete before starting to log.
+ */
+ if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
+
if (ret)
btrfs_queue_ordered_fn(ordered);
return ret;
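
The comment above covers the producer side of the flag. As a hedged sketch, the consuming side of BTRFS_INODE_COW_WRITE_ERROR would look roughly like the following; the placement inside btrfs_sync_file() and the exact wait-helper signature are assumptions, not part of this hunk:

	/* Sketch: before a fast fsync starts logging, test-and-clear the
	 * flag and fall back to waiting for ordered extents, so no extent
	 * maps pointing at unwritten locations get logged. */
	if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
			       &BTRFS_I(inode)->runtime_flags))
		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
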
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fc2a7ea26354..39a15cca58ca 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1351,7 +1351,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info)
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *quota_root;
+ struct btrfs_root *quota_root = NULL;
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
@@ -1449,9 +1449,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
quota_root->node, 0, 1);
- btrfs_put_root(quota_root);
out:
+ btrfs_put_root(quota_root);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
@@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_inherit *inherit,
size_t size)
{
- if (!btrfs_qgroup_enabled(fs_info))
- return 0;
if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
return -EOPNOTSUPP;
if (size < sizeof(*inherit) || size > PAGE_SIZE)
@@ -3085,6 +3083,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
return -EINVAL;
/*
+ * Skip the inherit source qgroups check if qgroups are not enabled.
+ * Qgroups can still be enabled later, causing problems, but in that
+ * case btrfs_qgroup_inherit() would just ignore those invalid ones.
+ */
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
+ /*
* Now check all the remaining qgroups, they should all:
*
* - Exist
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index cf531255ab76..9522a8b79d22 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
- int type, ret;
+ int type;
+ int ret = 0;
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
@@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
key->objectid, key->offset);
break;
case BTRFS_EXTENT_OWNER_REF_KEY:
- WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) {
+ btrfs_err(fs_info,
+ "found extent owner ref without simple quotas enabled");
+ ret = -EINVAL;
+ }
break;
default:
btrfs_err(fs_info, "invalid key type in iref");
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index afd6932f5e89..d7caa3732f07 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1688,20 +1688,24 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
(i << fs_info->sectorsize_bits);
int err;
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
-
io_stripe.is_scrub = true;
+ stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
+ /*
+ * For RST cases, we need to manually split the bbio to
+ * follow the RST boundary.
+ */
err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &stripe_len, &bioc, &io_stripe,
- &mirror);
+ &stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err) {
- btrfs_bio_end_io(bbio,
- errno_to_blk_status(err));
- return;
+ if (err < 0) {
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ continue;
}
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
}
__bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d620323d08ea..ae8c56442549 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
* "optimal" chunk size based on the fs size. However when we actually
* allocate the chunk we will strip this down further, making it no more
* than 10% of the disk or 1G, whichever is smaller.
+ *
+ * In zoned mode, we need to use the zone size (=
+ * data_sinfo->chunk_size) as-is.
*/
data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- data_chunk_size = min(data_sinfo->chunk_size,
- mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
- data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ if (!btrfs_is_zoned(fs_info)) {
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ } else {
+ data_chunk_size = data_sinfo->chunk_size;
+ }
/*
* Since data allocations immediately use block groups as part of the
@@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+
+ /*
+ * In zoned mode, we always allocate one zone as one chunk.
+ * Returning bytes that are not aligned to the zone size here
+ * results in less pressure for the async metadata reclaim
+ * process, which then over-commits too much, leading to ENOSPC.
+ * Align down to the zone size to avoid that.
+ */
+ if (btrfs_is_zoned(fs_info))
+ avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
return avail;
}
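
ALIGN_DOWN() rounds to a power-of-two boundary; a minimal sketch of the effect with illustrative numbers (zoned_avail() is a made-up name for this example):

	#include <linux/align.h>
	#include <linux/sizes.h>

	/* Round available bytes down to whole zones so reclaim never
	 * counts a partial, unallocatable chunk. */
	static u64 zoned_avail(u64 avail, u64 zone_size)
	{
		return ALIGN_DOWN(avail, zone_size); /* zone_size is a power of two */
	}

	/* zoned_avail(SZ_1G * 3 + SZ_128M, SZ_256M) == SZ_1G * 3 (12 whole zones) */
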
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5146387b416b..0bce1d45e252 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -138,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
+static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+{
+ unsigned int nofs_flag;
+ struct inode *inode;
+
+ /*
+ * We're holding a transaction handle whether we are logging or
+ * replaying a log tree, so we must make sure NOFS semantics apply
+ * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
+ * to allocate an inode, which can recurse back into the filesystem and
+ * attempt a transaction commit, resulting in a deadlock.
+ */
+ nofs_flag = memalloc_nofs_save();
+ inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ memalloc_nofs_restore(nofs_flag);
+
+ return inode;
+}
+
/*
* start a sub transaction and setup the log tree
* this increments the log tree writer count to make the people
@@ -600,7 +619,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
{
struct inode *inode;
- inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ inode = btrfs_iget_logging(objectid, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
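
btrfs_iget_logging() above is an instance of the scoped-NOFS pattern from <linux/sched/mm.h>; a generic sketch of that pattern (function name illustrative):

	#include <linux/sched/mm.h>
	#include <linux/slab.h>

	/* Inside the save/restore window, GFP_KERNEL allocations are
	 * implicitly degraded to GFP_NOFS, so direct reclaim cannot
	 * re-enter the filesystem while a transaction is held. */
	static void *alloc_under_transaction(size_t size)
	{
		unsigned int nofs_flag = memalloc_nofs_save();
		void *p = kmalloc(size, GFP_KERNEL); /* acts as GFP_NOFS here */

		memalloc_nofs_restore(nofs_flag);
		return p;
	}
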
@@ -4860,18 +4879,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
- if (!dropped_extents) {
- /*
- * Avoid logging extent items logged in past fsync calls
- * and leading to duplicate keys in the log tree.
- */
+ /*
+ * Avoid overlapping items in the log tree. The first time we
+ * get here, get rid of everything from a past fsync. After
+ * that, if the current extent starts before the end of the last
+ * extent we copied, truncate the last one. This can happen if
+ * an ordered extent completion modifies the subvolume tree
+ * while btrfs_next_leaf() has the tree unlocked.
+ */
+ if (!dropped_extents || key.offset < truncate_offset) {
ret = truncate_inode_items(trans, root->log_root, inode,
- truncate_offset,
+ min(key.offset, truncate_offset),
BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
}
+ truncate_offset = btrfs_file_extent_end(path);
if (ins_nr == 0)
start_slot = slot;
ins_nr++;
@@ -5433,7 +5457,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = start_inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
@@ -5494,7 +5517,7 @@ again:
continue;
btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ di_inode = btrfs_iget_logging(di_key.objectid, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto out;
@@ -5554,7 +5577,7 @@ again:
btrfs_add_delayed_iput(curr_inode);
curr_inode = NULL;
- vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+ vfs_inode = btrfs_iget_logging(ino, root);
if (IS_ERR(vfs_inode)) {
ret = PTR_ERR(vfs_inode);
break;
@@ -5649,7 +5672,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
return BTRFS_LOG_FORCE_COMMIT;
- inode = btrfs_iget(root->fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was deleted in
* the current transaction then we either:
@@ -5750,7 +5773,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
/*
@@ -5781,7 +5803,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
list_del(&curr->list);
kfree(curr);
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
@@ -5792,7 +5814,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (ret != -ENOENT)
break;
- inode = btrfs_iget(fs_info->sb, parent, root);
+ inode = btrfs_iget_logging(parent, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
break;
@@ -6314,7 +6336,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
const bool orig_log_new_dentries = ctx->log_new_dentries;
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_item *item;
int ret = 0;
@@ -6340,7 +6361,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
if (key.type == BTRFS_ROOT_ITEM_KEY)
continue;
- di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ di_inode = btrfs_iget_logging(key.objectid, inode->root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
break;
@@ -6724,7 +6745,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -6789,8 +6809,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
- root);
+ dir_inode = btrfs_iget_logging(inode_key.objectid, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -6852,7 +6871,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
while (true) {
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
@@ -6867,7 +6885,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
diff --git a/fs/buffer.c b/fs/buffer.c
index 8c19e705b9c3..dbe8f411ce52 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -258,7 +258,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
} else {
clear_buffer_uptodate(bh);
buffer_io_error(bh, ", async page read");
- folio_set_error(folio);
}
/*
@@ -391,7 +390,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
buffer_io_error(bh, ", lost async page write");
mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
- folio_set_error(folio);
}
first = folio_buffers(folio);
@@ -1960,7 +1958,6 @@ recover:
clear_buffer_dirty(bh);
}
} while ((bh = bh->b_this_page) != head);
- folio_set_error(folio);
BUG_ON(folio_test_writeback(folio));
mapping_set_error(folio->mapping, err);
folio_start_writeback(folio);
@@ -2405,10 +2402,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (iblock < lblock) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, iblock, bh, 0);
- if (err) {
- folio_set_error(folio);
+ if (err)
page_error = true;
- }
}
if (!buffer_mapped(bh)) {
folio_zero_range(folio, i * blocksize,
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index f449f7340aad..9fb06dc16520 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/namei.h>
+#include <trace/events/fscache.h>
#include "internal.h"
/*
@@ -312,19 +313,59 @@ static void cachefiles_withdraw_objects(struct cachefiles_cache *cache)
}
/*
- * Withdraw volumes.
+ * Withdraw fscache volumes.
+ */
+static void cachefiles_withdraw_fscache_volumes(struct cachefiles_cache *cache)
+{
+ struct list_head *cur;
+ struct cachefiles_volume *volume;
+ struct fscache_volume *vcookie;
+
+ _enter("");
+retry:
+ spin_lock(&cache->object_list_lock);
+ list_for_each(cur, &cache->volumes) {
+ volume = list_entry(cur, struct cachefiles_volume, cache_link);
+
+ if (atomic_read(&volume->vcookie->n_accesses) == 0)
+ continue;
+
+ vcookie = fscache_try_get_volume(volume->vcookie,
+ fscache_volume_get_withdraw);
+ if (vcookie) {
+ spin_unlock(&cache->object_list_lock);
+ fscache_withdraw_volume(vcookie);
+ fscache_put_volume(vcookie, fscache_volume_put_withdraw);
+ goto retry;
+ }
+ }
+ spin_unlock(&cache->object_list_lock);
+
+ _leave("");
+}
+
+/*
+ * Withdraw cachefiles volumes.
*/
static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache)
{
_enter("");
for (;;) {
+ struct fscache_volume *vcookie = NULL;
struct cachefiles_volume *volume = NULL;
spin_lock(&cache->object_list_lock);
if (!list_empty(&cache->volumes)) {
volume = list_first_entry(&cache->volumes,
struct cachefiles_volume, cache_link);
+ vcookie = fscache_try_get_volume(volume->vcookie,
+ fscache_volume_get_withdraw);
+ if (!vcookie) {
+ spin_unlock(&cache->object_list_lock);
+ cpu_relax();
+ continue;
+ }
list_del_init(&volume->cache_link);
}
spin_unlock(&cache->object_list_lock);
@@ -332,6 +373,7 @@ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache)
break;
cachefiles_withdraw_volume(volume);
+ fscache_put_volume(vcookie, fscache_volume_put_withdraw);
}
_leave("");
@@ -371,6 +413,7 @@ void cachefiles_withdraw_cache(struct cachefiles_cache *cache)
pr_info("File cache on %s unregistering\n", fscache->name);
fscache_withdraw_cache(fscache);
+ cachefiles_withdraw_fscache_volumes(cache);
/* we now have to destroy all the active objects pertaining to this
* cache - which we do by passing them off to thread pool to be
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 6465e2574230..89b11336a836 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -133,7 +133,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
return 0;
}
-static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
+void cachefiles_flush_reqs(struct cachefiles_cache *cache)
{
struct xarray *xa = &cache->reqs;
struct cachefiles_req *req;
@@ -159,6 +159,7 @@ static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
xa_for_each(xa, index, req) {
req->error = -EIO;
complete(&req->done);
+ __xa_erase(xa, index);
}
xa_unlock(xa);
@@ -365,14 +366,14 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
if (cachefiles_in_ondemand_mode(cache)) {
if (!xa_empty(&cache->reqs)) {
- rcu_read_lock();
+ xas_lock(&xas);
xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
if (!cachefiles_ondemand_is_reopening_read(req)) {
mask |= EPOLLIN;
break;
}
}
- rcu_read_unlock();
+ xas_unlock(&xas);
}
} else {
if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index d33169f0018b..7b99bd98de75 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -48,6 +48,7 @@ enum cachefiles_object_state {
CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */
CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */
CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */
+ CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, /* Object is being dropped. */
};
struct cachefiles_ondemand_info {
@@ -55,6 +56,7 @@ struct cachefiles_ondemand_info {
int ondemand_id;
enum cachefiles_object_state state;
struct cachefiles_object *object;
+ spinlock_t lock;
};
/*
@@ -127,6 +129,7 @@ struct cachefiles_cache {
unsigned long req_id_next;
struct xarray ondemand_ids; /* xarray for ondemand_id allocation */
u32 ondemand_id_next;
+ u32 msg_id_next;
};
static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
@@ -138,6 +141,7 @@ static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
struct cachefiles_req {
struct cachefiles_object *object;
struct completion done;
+ refcount_t ref;
int error;
struct cachefiles_msg msg;
};
@@ -186,6 +190,7 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
* daemon.c
*/
extern const struct file_operations cachefiles_daemon_fops;
+extern void cachefiles_flush_reqs(struct cachefiles_cache *cache);
extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache);
extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache);
@@ -332,6 +337,7 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \
CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN);
CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE);
CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING);
+CACHEFILES_OBJECT_STATE_FUNCS(dropping, DROPPING);
static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req)
{
@@ -424,6 +430,8 @@ do { \
pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
fscache_io_error((___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
+ if (cachefiles_in_ondemand_mode(___cache)) \
+ cachefiles_flush_reqs(___cache); \
} while (0)
#define cachefiles_io_error_obj(object, FMT, ...) \
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 4ba42f1fa3b4..470c96658385 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -1,22 +1,42 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/fdtable.h>
#include <linux/anon_inodes.h>
#include <linux/uio.h>
#include "internal.h"
+struct ondemand_anon_file {
+ struct file *file;
+ int fd;
+};
+
+static inline void cachefiles_req_put(struct cachefiles_req *req)
+{
+ if (refcount_dec_and_test(&req->ref))
+ kfree(req);
+}
+
static int cachefiles_ondemand_fd_release(struct inode *inode,
struct file *file)
{
struct cachefiles_object *object = file->private_data;
- struct cachefiles_cache *cache = object->volume->cache;
- struct cachefiles_ondemand_info *info = object->ondemand;
- int object_id = info->ondemand_id;
+ struct cachefiles_cache *cache;
+ struct cachefiles_ondemand_info *info;
+ int object_id;
struct cachefiles_req *req;
- XA_STATE(xas, &cache->reqs, 0);
+ XA_STATE(xas, NULL, 0);
+
+ if (!object)
+ return 0;
+
+ info = object->ondemand;
+ cache = object->volume->cache;
+ xas.xa = &cache->reqs;
xa_lock(&cache->reqs);
+ spin_lock(&info->lock);
+ object_id = info->ondemand_id;
info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
cachefiles_ondemand_set_object_close(object);
+ spin_unlock(&info->lock);
/* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */
xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
@@ -76,12 +96,12 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
}
static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
- unsigned long arg)
+ unsigned long id)
{
struct cachefiles_object *object = filp->private_data;
struct cachefiles_cache *cache = object->volume->cache;
struct cachefiles_req *req;
- unsigned long id;
+ XA_STATE(xas, &cache->reqs, id);
if (ioctl != CACHEFILES_IOC_READ_COMPLETE)
return -EINVAL;
@@ -89,10 +109,15 @@ static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return -EOPNOTSUPP;
- id = arg;
- req = xa_erase(&cache->reqs, id);
- if (!req)
+ xa_lock(&cache->reqs);
+ req = xas_load(&xas);
+ if (!req || req->msg.opcode != CACHEFILES_OP_READ ||
+ req->object != object) {
+ xa_unlock(&cache->reqs);
return -EINVAL;
+ }
+ xas_store(&xas, NULL);
+ xa_unlock(&cache->reqs);
trace_cachefiles_ondemand_cread(object, id);
complete(&req->done);
@@ -116,10 +141,12 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
{
struct cachefiles_req *req;
struct fscache_cookie *cookie;
+ struct cachefiles_ondemand_info *info;
char *pid, *psize;
unsigned long id;
long size;
int ret;
+ XA_STATE(xas, &cache->reqs, 0);
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return -EOPNOTSUPP;
@@ -143,10 +170,18 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
if (ret)
return ret;
- req = xa_erase(&cache->reqs, id);
- if (!req)
+ xa_lock(&cache->reqs);
+ xas.xa_index = id;
+ req = xas_load(&xas);
+ if (!req || req->msg.opcode != CACHEFILES_OP_OPEN ||
+ !req->object->ondemand->ondemand_id) {
+ xa_unlock(&cache->reqs);
return -EINVAL;
+ }
+ xas_store(&xas, NULL);
+ xa_unlock(&cache->reqs);
+ info = req->object->ondemand;
/* fail OPEN request if copen format is invalid */
ret = kstrtol(psize, 0, &size);
if (ret) {
@@ -166,6 +201,32 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
goto out;
}
+ spin_lock(&info->lock);
+ /*
+ * The anonymous fd was closed before copen? Fail the request.
+ *
+ * t1 | t2
+ * ---------------------------------------------------------
+ * cachefiles_ondemand_copen
+ * req = xa_erase(&cache->reqs, id)
+ * // Anon fd is maliciously closed.
+ * cachefiles_ondemand_fd_release
+ * xa_lock(&cache->reqs)
+ * cachefiles_ondemand_set_object_close(object)
+ * xa_unlock(&cache->reqs)
+ * cachefiles_ondemand_set_object_open
+ * // No one will ever close it again.
+ * cachefiles_ondemand_daemon_read
+ * cachefiles_ondemand_select_req
+ *
+ * The daemon gets a read req whose fd is already closed. It can't
+ * issue a cread ioctl with a closed fd, so it hangs.
+ */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) {
+ spin_unlock(&info->lock);
+ req->error = -EBADFD;
+ goto out;
+ }
cookie = req->object->cookie;
cookie->object_size = size;
if (size)
@@ -175,9 +236,15 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
trace_cachefiles_ondemand_copen(req->object, id, size);
cachefiles_ondemand_set_object_open(req->object);
+ spin_unlock(&info->lock);
wake_up_all(&cache->daemon_pollwq);
out:
+ spin_lock(&info->lock);
+ /* Need to set the object to close so it does not linger in reopening state */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED)
+ cachefiles_ondemand_set_object_close(req->object);
+ spin_unlock(&info->lock);
complete(&req->done);
return ret;
}
@@ -205,14 +272,14 @@ int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args)
return 0;
}
-static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
+static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
+ struct ondemand_anon_file *anon_file)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
struct cachefiles_open *load;
- struct file *file;
u32 object_id;
- int ret, fd;
+ int ret;
object = cachefiles_grab_object(req->object,
cachefiles_obj_get_ondemand_fd);
@@ -224,35 +291,53 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
if (ret < 0)
goto err;
- fd = get_unused_fd_flags(O_WRONLY);
- if (fd < 0) {
- ret = fd;
+ anon_file->fd = get_unused_fd_flags(O_WRONLY);
+ if (anon_file->fd < 0) {
+ ret = anon_file->fd;
goto err_free_id;
}
- file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops,
- object, O_WRONLY);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
+ anon_file->file = anon_inode_getfile("[cachefiles]",
+ &cachefiles_ondemand_fd_fops, object, O_WRONLY);
+ if (IS_ERR(anon_file->file)) {
+ ret = PTR_ERR(anon_file->file);
goto err_put_fd;
}
- file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
- fd_install(fd, file);
+ spin_lock(&object->ondemand->lock);
+ if (object->ondemand->ondemand_id > 0) {
+ spin_unlock(&object->ondemand->lock);
+ /* Pair with check in cachefiles_ondemand_fd_release(). */
+ anon_file->file->private_data = NULL;
+ ret = -EEXIST;
+ goto err_put_file;
+ }
+
+ anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
load = (void *)req->msg.data;
- load->fd = fd;
+ load->fd = anon_file->fd;
object->ondemand->ondemand_id = object_id;
+ spin_unlock(&object->ondemand->lock);
cachefiles_get_unbind_pincount(cache);
trace_cachefiles_ondemand_open(object, &req->msg, load);
return 0;
+err_put_file:
+ fput(anon_file->file);
+ anon_file->file = NULL;
err_put_fd:
- put_unused_fd(fd);
+ put_unused_fd(anon_file->fd);
+ anon_file->fd = ret;
err_free_id:
xa_erase(&cache->ondemand_ids, object_id);
err:
+ spin_lock(&object->ondemand->lock);
+ /* Avoid marking an opened object as closed. */
+ if (object->ondemand->ondemand_id <= 0)
+ cachefiles_ondemand_set_object_close(object);
+ spin_unlock(&object->ondemand->lock);
cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
return ret;
}
@@ -294,14 +379,28 @@ static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xa
return NULL;
}
+static inline bool cachefiles_ondemand_finish_req(struct cachefiles_req *req,
+ struct xa_state *xas, int err)
+{
+ if (unlikely(!xas || !req))
+ return false;
+
+ if (xa_cmpxchg(xas->xa, xas->xa_index, req, NULL, 0) != req)
+ return false;
+
+ req->error = err;
+ complete(&req->done);
+ return true;
+}
+
ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
char __user *_buffer, size_t buflen)
{
struct cachefiles_req *req;
struct cachefiles_msg *msg;
- unsigned long id = 0;
size_t n;
int ret = 0;
+ struct ondemand_anon_file anon_file;
XA_STATE(xas, &cache->reqs, cache->req_id_next);
xa_lock(&cache->reqs);
@@ -330,42 +429,37 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
xas_clear_mark(&xas, CACHEFILES_REQ_NEW);
cache->req_id_next = xas.xa_index + 1;
+ refcount_inc(&req->ref);
+ cachefiles_grab_object(req->object, cachefiles_obj_get_read_req);
xa_unlock(&cache->reqs);
- id = xas.xa_index;
-
if (msg->opcode == CACHEFILES_OP_OPEN) {
- ret = cachefiles_ondemand_get_fd(req);
- if (ret) {
- cachefiles_ondemand_set_object_close(req->object);
- goto error;
- }
+ ret = cachefiles_ondemand_get_fd(req, &anon_file);
+ if (ret)
+ goto out;
}
- msg->msg_id = id;
+ msg->msg_id = xas.xa_index;
msg->object_id = req->object->ondemand->ondemand_id;
- if (copy_to_user(_buffer, msg, n) != 0) {
+ if (copy_to_user(_buffer, msg, n) != 0)
ret = -EFAULT;
- goto err_put_fd;
- }
- /* CLOSE request has no reply */
- if (msg->opcode == CACHEFILES_OP_CLOSE) {
- xa_erase(&cache->reqs, id);
- complete(&req->done);
+ if (msg->opcode == CACHEFILES_OP_OPEN) {
+ if (ret < 0) {
+ fput(anon_file.file);
+ put_unused_fd(anon_file.fd);
+ goto out;
+ }
+ fd_install(anon_file.fd, anon_file.file);
}
-
- return n;
-
-err_put_fd:
- if (msg->opcode == CACHEFILES_OP_OPEN)
- close_fd(((struct cachefiles_open *)msg->data)->fd);
-error:
- xa_erase(&cache->reqs, id);
- req->error = ret;
- complete(&req->done);
- return ret;
+out:
+ cachefiles_put_object(req->object, cachefiles_obj_put_read_req);
+ /* Finish requests that errored, and CLOSE requests, which have no reply */
+ if (ret || msg->opcode == CACHEFILES_OP_CLOSE)
+ cachefiles_ondemand_finish_req(req, &xas, ret);
+ cachefiles_req_put(req);
+ return ret ? ret : n;
}
typedef int (*init_req_fn)(struct cachefiles_req *req, void *private);
@@ -395,6 +489,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
goto out;
}
+ refcount_set(&req->ref, 1);
req->object = object;
init_completion(&req->done);
req->msg.opcode = opcode;
@@ -422,7 +517,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
*/
xas_lock(&xas);
- if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ if (test_bit(CACHEFILES_DEAD, &cache->flags) ||
+ cachefiles_ondemand_object_is_dropping(object)) {
xas_unlock(&xas);
ret = -EIO;
goto out;
@@ -432,20 +528,32 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
smp_mb();
if (opcode == CACHEFILES_OP_CLOSE &&
- !cachefiles_ondemand_object_is_open(object)) {
+ !cachefiles_ondemand_object_is_open(object)) {
WARN_ON_ONCE(object->ondemand->ondemand_id == 0);
xas_unlock(&xas);
ret = -EIO;
goto out;
}
- xas.xa_index = 0;
+ /*
+ * Cyclically find a free index to avoid msg_id reuse that would
+ * cause the daemon to successfully copen a stale msg_id.
+ */
+ xas.xa_index = cache->msg_id_next;
xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART) {
+ xas.xa_index = 0;
+ xas_find_marked(&xas, cache->msg_id_next - 1, XA_FREE_MARK);
+ }
if (xas.xa_node == XAS_RESTART)
xas_set_err(&xas, -EBUSY);
+
xas_store(&xas, req);
- xas_clear_mark(&xas, XA_FREE_MARK);
- xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ if (xas_valid(&xas)) {
+ cache->msg_id_next = xas.xa_index + 1;
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ }
xas_unlock(&xas);
} while (xas_nomem(&xas, GFP_KERNEL));
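
The cyclic scan is open-coded here because the request must be stored and marked CACHEFILES_REQ_NEW under the same xa_lock. For plain ID allocation, the stock xarray helper performs the same cyclic search; a sketch (names illustrative):

	#include <linux/xarray.h>

	static DEFINE_XARRAY_ALLOC(ids);
	static u32 next_id;

	/* xa_alloc_cyclic() resumes searching at *next, so recently
	 * freed low IDs are not handed out again immediately. */
	static int assign_id(void *entry, u32 *out_id)
	{
		return xa_alloc_cyclic(&ids, out_id, entry, xa_limit_32b,
				       &next_id, GFP_KERNEL);
	}
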
@@ -454,16 +562,27 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
goto out;
wake_up_all(&cache->daemon_pollwq);
- wait_for_completion(&req->done);
- ret = req->error;
- kfree(req);
+wait:
+ ret = wait_for_completion_killable(&req->done);
+ if (!ret) {
+ ret = req->error;
+ } else {
+ ret = -EINTR;
+ if (!cachefiles_ondemand_finish_req(req, &xas, ret)) {
+ /* Someone will complete it soon. */
+ cpu_relax();
+ goto wait;
+ }
+ }
+ cachefiles_req_put(req);
return ret;
out:
/* Reset the object to close state in error handling path.
* If error occurs after creating the anonymous fd,
* cachefiles_ondemand_fd_release() will set object to close.
*/
- if (opcode == CACHEFILES_OP_OPEN)
+ if (opcode == CACHEFILES_OP_OPEN &&
+ !cachefiles_ondemand_object_is_dropping(object))
cachefiles_ondemand_set_object_close(object);
kfree(req);
return ret;
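
Distilled, the ownership rule implemented by the wait loop above is: whoever removes the request from the xarray (the xa_cmpxchg() inside cachefiles_ondemand_finish_req()) owns its completion. A restatement of the same logic as a loop, for clarity only:

	for (;;) {
		if (!wait_for_completion_killable(&req->done))
			return req->error;	/* completed by the daemon */
		if (cachefiles_ondemand_finish_req(req, &xas, -EINTR))
			return -EINTR;		/* we won the race: we cancelled it */
		/* Lost the race: the winner is completing it right now. */
		cpu_relax();
	}
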
@@ -562,8 +681,34 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
{
+ unsigned long index;
+ struct cachefiles_req *req;
+ struct cachefiles_cache *cache;
+
+ if (!object->ondemand)
+ return;
+
cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0,
cachefiles_ondemand_init_close_req, NULL);
+
+ if (!object->ondemand->ondemand_id)
+ return;
+
+ /* Cancel all requests for the object that is being dropped. */
+ cache = object->volume->cache;
+ xa_lock(&cache->reqs);
+ cachefiles_ondemand_set_object_dropping(object);
+ xa_for_each(&cache->reqs, index, req) {
+ if (req->object == object) {
+ req->error = -EIO;
+ complete(&req->done);
+ __xa_erase(&cache->reqs, index);
+ }
+ }
+ xa_unlock(&cache->reqs);
+
+ /* Wait for ondemand_object_worker() to finish to avoid UAF. */
+ cancel_work_sync(&object->ondemand->ondemand_work);
}
int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
@@ -578,6 +723,7 @@ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
return -ENOMEM;
object->ondemand->object = object;
+ spin_lock_init(&object->ondemand->lock);
INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker);
return 0;
}
diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c
index 89df0ba8ba5e..781aac4ef274 100644
--- a/fs/cachefiles/volume.c
+++ b/fs/cachefiles/volume.c
@@ -133,7 +133,6 @@ void cachefiles_free_volume(struct fscache_volume *vcookie)
void cachefiles_withdraw_volume(struct cachefiles_volume *volume)
{
- fscache_withdraw_volume(volume->vcookie);
cachefiles_set_volume_xattr(volume);
__cachefiles_free_volume(volume);
}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index bcb6173943ee..4dd8a993c60a 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -110,9 +110,11 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file
if (xlen == 0)
xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen);
if (xlen != tlen) {
- if (xlen < 0)
+ if (xlen < 0) {
+ ret = xlen;
trace_cachefiles_vfs_error(object, file_inode(file), xlen,
cachefiles_trace_getxattr_error);
+ }
if (xlen == -EIO)
cachefiles_io_error_obj(
object,
@@ -252,6 +254,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len);
if (xlen != len) {
if (xlen < 0) {
+ ret = xlen;
trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen,
cachefiles_trace_getxattr_error);
if (xlen == -EIO)
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index ccdbec388091..40f84d014524 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -31,15 +31,7 @@ static int coda_symlink_filler(struct file *file, struct folio *folio)
cii = ITOC(inode);
error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
- if (error)
- goto fail;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
-
-fail:
- folio_set_error(folio);
- folio_unlock(folio);
+ folio_end_read(folio, error == 0);
return error;
}
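
folio_end_read() collapses the old mark-uptodate/unlock and set-error/unlock pairs into one call; a before/after sketch of the conversion pattern used throughout this series:

	/* Before: open-coded success and failure paths. */
	if (!error)
		folio_mark_uptodate(folio);
	folio_unlock(folio);

	/* After: 'true' marks the folio uptodate, then it is unlocked. */
	folio_end_read(folio, error == 0);
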
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 18677cd4e62f..43d6bde1adcc 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -580,6 +580,7 @@ static void detach_attrs(struct config_item * item)
static int populate_attrs(struct config_item *item)
{
const struct config_item_type *t = item->ci_type;
+ struct configfs_group_operations *ops;
struct configfs_attribute *attr;
struct configfs_bin_attribute *bin_attr;
int error = 0;
@@ -587,14 +588,23 @@ static int populate_attrs(struct config_item *item)
if (!t)
return -EINVAL;
+
+ ops = t->ct_group_ops;
+
if (t->ct_attrs) {
for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+ if (ops && ops->is_visible && !ops->is_visible(item, attr, i))
+ continue;
+
if ((error = configfs_create_file(item, attr)))
break;
}
}
if (t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
+ if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
+ continue;
+
error = configfs_create_bin_file(item, bin_attr);
if (error)
break;
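
From the call sites above, the new hooks take (item, attribute, index) and return whether the file should be created. A hedged sketch of a provider using them; the return type and all names here are inferred or illustrative:

	static bool my_group_is_visible(struct config_item *item,
					struct configfs_attribute *attr, int n)
	{
		/* Hide the "debug" attribute unless the (made-up) feature is on. */
		if (!my_feature_enabled && !strcmp(attr->ca_name, "debug"))
			return false;
		return true;
	}

	static struct configfs_group_operations my_group_ops = {
		.is_visible	= my_group_is_visible,
	};
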
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 460690ca0174..b84d1747a020 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -811,19 +811,19 @@ out:
static int cramfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
u32 maxblock;
int bytes_filled;
void *pgdata;
+ bool success = false;
maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
bytes_filled = 0;
- pgdata = kmap_local_page(page);
+ pgdata = kmap_local_folio(folio, 0);
- if (page->index < maxblock) {
+ if (folio->index < maxblock) {
struct super_block *sb = inode->i_sb;
- u32 blkptr_offset = OFFSET(inode) + page->index * 4;
+ u32 blkptr_offset = OFFSET(inode) + folio->index * 4;
u32 block_ptr, block_start, block_len;
bool uncompressed, direct;
@@ -844,7 +844,7 @@ static int cramfs_read_folio(struct file *file, struct folio *folio)
if (uncompressed) {
block_len = PAGE_SIZE;
/* if last block: cap to file length */
- if (page->index == maxblock - 1)
+ if (folio->index == maxblock - 1)
block_len =
offset_in_page(inode->i_size);
} else {
@@ -861,7 +861,7 @@ static int cramfs_read_folio(struct file *file, struct folio *folio)
* from the previous block's pointer.
*/
block_start = OFFSET(inode) + maxblock * 4;
- if (page->index)
+ if (folio->index)
block_start = *(u32 *)
cramfs_read(sb, blkptr_offset - 4, 4);
/* Beware... previous ptr might be a direct ptr */
@@ -906,17 +906,12 @@ static int cramfs_read_folio(struct file *file, struct folio *folio)
}
memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
- flush_dcache_page(page);
- kunmap_local(pgdata);
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
+ flush_dcache_folio(folio);
+ success = true;
err:
kunmap_local(pgdata);
- ClearPageUptodate(page);
- SetPageError(page);
- unlock_page(page);
+ folio_end_read(folio, success);
return 0;
}
@@ -1003,4 +998,5 @@ static void __exit exit_cramfs_fs(void)
module_init(init_cramfs_fs)
module_exit(exit_cramfs_fs)
+MODULE_DESCRIPTION("Compressed ROM file system support");
MODULE_LICENSE("GPL");
diff --git a/fs/dcache.c b/fs/dcache.c
index 1ee6404b430b..8bdc278a0205 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,6 +35,8 @@
#include "internal.h"
#include "mount.h"
+#include <asm/runtime-const.h>
+
/*
* Usage:
* dcache->d_inode->i_lock protects:
@@ -100,9 +102,10 @@ static unsigned int d_hash_shift __ro_after_init;
static struct hlist_bl_head *dentry_hashtable __ro_after_init;
-static inline struct hlist_bl_head *d_hash(unsigned int hash)
+static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
{
- return dentry_hashtable + (hash >> d_hash_shift);
+ return runtime_const_ptr(dentry_hashtable) +
+ runtime_const_shift_right_32(hashlen, d_hash_shift);
}
#define IN_LOOKUP_SHIFT 10
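
runtime_const_ptr()/runtime_const_shift_right_32() let the hash table base and shift be patched directly into the instruction stream at boot (via runtime_const_init()) instead of being loaded from memory on every d_hash() call. On architectures without support, the generic fallback is expected to reduce to plain accesses, roughly:

	/* Sketch of the asm-generic fallback: without arch support the
	 * "constants" are ordinary reads and the init is a no-op, so
	 * d_hash() behaves exactly as before. */
	#define runtime_const_ptr(sym)			(sym)
	#define runtime_const_shift_right_32(val, sym)	((u32)(val) >> (sym))
	#define runtime_const_init(type, sym)		do { } while (0)
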
@@ -355,7 +358,11 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
flags &= ~DCACHE_ENTRY_TYPE;
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
- if (flags & DCACHE_LRU_LIST)
+ /*
+ * The negative counter only tracks dentries on the LRU. Don't inc if
+ * d_lru is on another list.
+ */
+ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
this_cpu_inc(nr_dentry_negative);
}
@@ -1548,7 +1555,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
{
struct dentry *dentry;
- WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
+ rwsem_assert_held_write(&sb->s_umount);
dentry = sb->s_root;
sb->s_root = NULL;
@@ -1844,9 +1851,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
spin_lock(&dentry->d_lock);
/*
- * Decrement negative dentry count if it was in the LRU list.
+ * The negative counter only tracks dentries on the LRU. Don't dec if
+ * d_lru is on another list.
*/
- if (dentry->d_flags & DCACHE_LRU_LIST)
+ if ((dentry->d_flags &
+ (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
this_cpu_dec(nr_dentry_negative);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
@@ -2104,7 +2113,7 @@ static noinline struct dentry *__d_lookup_rcu_op_compare(
unsigned *seqp)
{
u64 hashlen = name->hash_len;
- struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+ struct hlist_bl_head *b = d_hash(hashlen);
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -2171,7 +2180,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
{
u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+ struct hlist_bl_head *b = d_hash(hashlen);
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -2360,17 +2369,19 @@ EXPORT_SYMBOL(d_hash_and_lookup);
* - unhash this dentry and free it.
*
* Usually, we want to just turn this into
- * a negative dentry, but certain workloads can
- * generate a large number of negative dentries.
- * Therefore, it would be better to simply
- * unhash it.
+ * a negative dentry, but if anybody else is
+ * currently using the dentry or the inode
+ * we can't do that and we fall back on removing
+ * it from the hash queues and waiting for
+ * it to be deleted later when it has no users.
*/
/**
* d_delete - delete a dentry
* @dentry: The dentry to delete
*
- * Remove the dentry from the hash queues so it can be deleted later.
+ * Turn the dentry into a negative dentry if possible, otherwise
+ * remove it from the hash queues so it can be deleted later
*/
void d_delete(struct dentry * dentry)
@@ -2379,8 +2390,6 @@ void d_delete(struct dentry * dentry)
spin_lock(&inode->i_lock);
spin_lock(&dentry->d_lock);
- __d_drop(dentry);
-
/*
* Are we the only user?
*/
@@ -2388,6 +2397,7 @@ void d_delete(struct dentry * dentry)
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
} else {
+ __d_drop(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
}
@@ -3028,28 +3038,25 @@ EXPORT_SYMBOL(d_splice_alias);
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
- bool result;
+ bool subdir;
unsigned seq;
if (new_dentry == old_dentry)
return true;
- do {
- /* for restarting inner loop in case of seq retry */
- seq = read_seqbegin(&rename_lock);
- /*
- * Need rcu_readlock to protect against the d_parent trashing
- * due to d_move
- */
- rcu_read_lock();
- if (d_ancestor(old_dentry, new_dentry))
- result = true;
- else
- result = false;
- rcu_read_unlock();
- } while (read_seqretry(&rename_lock, seq));
-
- return result;
+ /* Access d_parent under rcu as d_move() may change it. */
+ rcu_read_lock();
+ seq = read_seqbegin(&rename_lock);
+ subdir = d_ancestor(old_dentry, new_dentry);
+ /* Try lockless once... */
+ if (read_seqretry(&rename_lock, seq)) {
+ /* ...else acquire lock for progress even on deep chains. */
+ read_seqlock_excl(&rename_lock);
+ subdir = d_ancestor(old_dentry, new_dentry);
+ read_sequnlock_excl(&rename_lock);
+ }
+ rcu_read_unlock();
+ return subdir;
}
EXPORT_SYMBOL(is_subdir);
@@ -3099,6 +3106,34 @@ void d_tmpfile(struct file *file, struct inode *inode)
}
EXPORT_SYMBOL(d_tmpfile);
+/*
+ * Obtain inode number of the parent dentry.
+ */
+ino_t d_parent_ino(struct dentry *dentry)
+{
+ struct dentry *parent;
+ struct inode *iparent;
+ unsigned seq;
+ ino_t ret;
+
+ scoped_guard(rcu) {
+ seq = raw_seqcount_begin(&dentry->d_seq);
+ parent = READ_ONCE(dentry->d_parent);
+ iparent = d_inode_rcu(parent);
+ if (likely(iparent)) {
+ ret = iparent->i_ino;
+ if (!read_seqcount_retry(&dentry->d_seq, seq))
+ return ret;
+ }
+ }
+
+ spin_lock(&dentry->d_lock);
+ ret = dentry->d_parent->d_inode->i_ino;
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
+EXPORT_SYMBOL(d_parent_ino);
+
static __initdata unsigned long dhash_entries;
static int __init set_dhash_entries(char *str)
{
@@ -3128,6 +3163,9 @@ static void __init dcache_init_early(void)
0,
0);
d_hash_shift = 32 - d_hash_shift;
+
+ runtime_const_init(shift, d_hash_shift);
+ runtime_const_init(ptr, dentry_hashtable);
}
static void __init dcache_init(void)
@@ -3156,6 +3194,9 @@ static void __init dcache_init(void)
0,
0);
d_hash_shift = 32 - d_hash_shift;
+
+ runtime_const_init(shift, d_hash_shift);
+ runtime_const_init(ptr, dentry_hashtable);
}
/* SLAB cache for __getname() consumers */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index dc51df0b118d..91521576f500 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -92,9 +92,9 @@ enum {
};
static const struct fs_parameter_spec debugfs_param_specs[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_u32oct ("mode", Opt_mode),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -102,26 +102,26 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param
{
struct debugfs_fs_info *opts = fc->s_fs_info;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, debugfs_param_specs, param, &result);
- if (opt < 0)
+ if (opt < 0) {
+ /*
+ * We might like to report bad mount options here; but
+ * traditionally debugfs has ignored all mount options
+ */
+ if (opt == -ENOPARAM)
+ return 0;
+
return opt;
+ }
switch (opt) {
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return invalf(fc, "Unknown uid");
- opts->uid = uid;
+ opts->uid = result.uid;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return invalf(fc, "Unknown gid");
- opts->gid = gid;
+ opts->gid = result.gid;
break;
case Opt_mode:
opts->mode = result.uint_32 & S_IALLUGO;
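
fsparam_uid()/fsparam_gid() move the make_kuid()/make_kgid() mapping and validity check into fs_parse() itself, which fills result.uid/result.gid with already-validated IDs and rejects unmappable ones before the filesystem sees them. A sketch of the resulting pattern, with illustrative names:

	#include <linux/fs_parser.h>

	static const struct fs_parameter_spec myfs_param_specs[] = {
		fsparam_uid("uid", Opt_uid),
		fsparam_gid("gid", Opt_gid),
		{}
	};

	static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
	{
		struct myfs_opts *opts = fc->s_fs_info; /* illustrative type */
		struct fs_parse_result result;
		int opt = fs_parse(fc, myfs_param_specs, param, &result);

		if (opt < 0)
			return opt;
		switch (opt) {
		case Opt_uid:
			opts->uid = result.uid; /* validated by fs_parse() */
			break;
		case Opt_gid:
			opts->gid = result.gid;
			break;
		}
		return 0;
	}
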
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index bb14462f6d99..a929f1b613be 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -275,8 +275,8 @@ enum {
};
static const struct fs_parameter_spec efivarfs_parameters[] = {
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("gid", Opt_gid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_gid("gid", Opt_gid),
{},
};
@@ -293,14 +293,10 @@ static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *para
switch (opt) {
case Opt_uid:
- opts->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(opts->uid))
- return -EINVAL;
+ opts->uid = result.uid;
break;
case Opt_gid:
- opts->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(opts->gid))
- return -EINVAL;
+ opts->gid = result.gid;
break;
default:
return -EINVAL;
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 7844ab24b813..462619e59766 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -311,4 +311,5 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
return 0;
}
+MODULE_DESCRIPTION("Extent File System (efs)");
MODULE_LICENSE("GPL");
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 3b03a573cb1a..7749feded722 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -14,10 +14,9 @@
static int efs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- char *link = page_address(page);
- struct buffer_head * bh;
- struct inode * inode = page->mapping->host;
+ char *link = folio_address(folio);
+ struct buffer_head *bh;
+ struct inode *inode = folio->mapping->host;
efs_block_t size = inode->i_size;
int err;
@@ -40,12 +39,9 @@ static int efs_symlink_read_folio(struct file *file, struct folio *folio)
brelse(bh);
}
link[size] = '\0';
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
+ err = 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_end_read(folio, err == 0);
return err;
}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index c93bd24d2771..1b91d9513013 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -343,7 +343,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->build_time = le64_to_cpu(dsb->build_time);
sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
- memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+ super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
ret = strscpy(sbi->volume_name, dsb->volume_name,
sizeof(dsb->volume_name));
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9b248ee5fef2..74d3d7bffcf3 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -711,6 +711,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
err = z_erofs_do_map_blocks(inode, map, flags);
out:
+ if (err)
+ map->m_llen = 0;
trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
return err;
}
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 036024bce9f7..b80f612867c2 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -148,7 +148,7 @@ int __init z_erofs_gbuf_init(void)
void z_erofs_gbuf_exit(void)
{
- int i;
+ int i, j;
for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) {
struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
@@ -161,9 +161,9 @@ void z_erofs_gbuf_exit(void)
if (!gbuf->pages)
continue;
- for (i = 0; i < gbuf->nrpages; ++i)
- if (gbuf->pages[i])
- put_page(gbuf->pages[i]);
+ for (j = 0; j < gbuf->nrpages; ++j)
+ if (gbuf->pages[j])
+ put_page(gbuf->pages[j]);
kfree(gbuf->pages);
gbuf->pages = NULL;
}
diff --git a/fs/exec.c b/fs/exec.c
index 40073142288f..a47d0e4c54f6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -486,6 +486,35 @@ static int count_strings_kernel(const char *const *argv)
return i;
}
+static inline int bprm_set_stack_limit(struct linux_binprm *bprm,
+ unsigned long limit)
+{
+#ifdef CONFIG_MMU
+ /* Avoid a pathological bprm->p. */
+ if (bprm->p < limit)
+ return -E2BIG;
+ bprm->argmin = bprm->p - limit;
+#endif
+ return 0;
+}
+static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm)
+{
+#ifdef CONFIG_MMU
+ return bprm->p < bprm->argmin;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Calculate bprm->argmin from:
+ * - _STK_LIM
+ * - ARG_MAX
+ * - bprm->rlim_stack.rlim_cur
+ * - bprm->argc
+ * - bprm->envc
+ * - bprm->p
+ */
static int bprm_stack_limits(struct linux_binprm *bprm)
{
unsigned long limit, ptr_size;
@@ -505,6 +534,9 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* of argument strings even with small stacks
*/
limit = max_t(unsigned long, limit, ARG_MAX);
+ /* Reject totally pathological counts. */
+ if (bprm->argc < 0 || bprm->envc < 0)
+ return -E2BIG;
/*
* We must account for the size of all the argv and envp pointers to
* the argv and envp strings, since they will also take up space in
@@ -518,13 +550,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* argc can never be 0, to keep them from walking envp by accident.
* See do_execveat_common().
*/
- ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
+ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) ||
+ check_mul_overflow(ptr_size, sizeof(void *), &ptr_size))
+ return -E2BIG;
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
- bprm->argmin = bprm->p - limit;
- return 0;
+ return bprm_set_stack_limit(bprm, limit);
}
/*
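
check_add_overflow()/check_mul_overflow() from <linux/overflow.h> return true on wraparound and store the result through the last argument, so the sequence above fails closed with -E2BIG instead of silently truncating the pointer budget. A standalone sketch (helper name illustrative):

	#include <linux/minmax.h>
	#include <linux/overflow.h>

	/* Compute (max(argc, 1) + envc) * sizeof(void *), refusing any
	 * intermediate overflow rather than wrapping. */
	static int ptr_table_size(int argc, int envc, unsigned long *out)
	{
		unsigned long n;

		if (check_add_overflow((unsigned long)max(argc, 1),
				       (unsigned long)envc, &n) ||
		    check_mul_overflow(n, sizeof(void *), &n))
			return -E2BIG;
		*out = n;
		return 0;
	}
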
@@ -562,10 +595,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
pos = bprm->p;
str += len;
bprm->p -= len;
-#ifdef CONFIG_MMU
- if (bprm->p < bprm->argmin)
+ if (bprm_hit_stack_limit(bprm))
goto out;
-#endif
while (len > 0) {
int offset, bytes_to_copy;
@@ -640,7 +671,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
/* We're going to work our way backwards. */
arg += len;
bprm->p -= len;
- if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
+ if (bprm_hit_stack_limit(bprm))
return -E2BIG;
while (len > 0) {
@@ -952,10 +983,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
path_noexec(&file->f_path)))
goto exit;
- err = deny_write_access(file);
- if (err)
- goto exit;
-
out:
return file;
@@ -971,8 +998,7 @@ exit:
*
* Returns ERR_PTR on failure or allocated struct file on success.
*
- * As this is a wrapper for the internal do_open_execat(), callers
- * must call allow_write_access() before fput() on release. Also see
+ * This is a wrapper for the internal do_open_execat(). Also see
* do_close_execat().
*/
struct file *open_exec(const char *name)
@@ -1524,10 +1550,8 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
/* Matches do_open_execat() */
static void do_close_execat(struct file *file)
{
- if (!file)
- return;
- allow_write_access(file);
- fput(file);
+ if (file)
+ fput(file);
}
static void free_bprm(struct linux_binprm *bprm)
@@ -1846,7 +1870,6 @@ static int exec_binprm(struct linux_binprm *bprm)
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
- allow_write_access(exec);
if (unlikely(bprm->have_execfd)) {
if (bprm->executable) {
fput(exec);
@@ -2211,3 +2234,7 @@ static int __init init_fs_exec_sysctls(void)
fs_initcall(init_fs_exec_sysctls);
#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_EXEC_KUNIT_TEST
+#include "exec_test.c"
+#endif
diff --git a/fs/exec_test.c b/fs/exec_test.c
new file mode 100644
index 000000000000..7c77d039680b
--- /dev/null
+++ b/fs/exec_test.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+
+struct bprm_stack_limits_result {
+ struct linux_binprm bprm;
+ int expected_rc;
+ unsigned long expected_argmin;
+};
+
+static const struct bprm_stack_limits_result bprm_stack_limits_results[] = {
+ /* Negative argc/envc counts produce -E2BIG */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = INT_MIN, .envc = INT_MIN }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 5, .envc = -1 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = -1, .envc = 10 }, .expected_rc = -E2BIG },
+ /* The max value of argc or envc is MAX_ARG_STRINGS. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = INT_MAX, .envc = INT_MAX }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = MAX_ARG_STRINGS, .envc = MAX_ARG_STRINGS }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 0, .envc = MAX_ARG_STRINGS }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = MAX_ARG_STRINGS, .envc = 0 }, .expected_rc = -E2BIG },
+ /*
+ * On 32-bit system these argc and envc counts, while likely impossible
+ * to represent within the associated TASK_SIZE, could overflow the
+ * limit calculation, and bypass the ptr_size <= limit check.
+ */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 0x20000001, .envc = 0x20000001 }, .expected_rc = -E2BIG },
+#ifdef CONFIG_MMU
+ /* Make sure a pathological bprm->p doesn't cause an overflow. */
+ { { .p = sizeof(void *), .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 10, .envc = 10 }, .expected_rc = -E2BIG },
+#endif
+ /*
+ * 0 rlim_stack will get raised to ARG_MAX. With 1 string pointer,
+ * we should see p - ARG_MAX + sizeof(void *).
+ */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 1, .envc = 0 }, .expected_argmin = ULONG_MAX - ARG_MAX + sizeof(void *)},
+ /* Validate that argc is always raised to a minimum of 1. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = 0 }, .expected_argmin = ULONG_MAX - ARG_MAX + sizeof(void *)},
+ /*
+ * 0 rlim_stack will get raised to ARG_MAX. With pointers filling ARG_MAX,
+ * we should see -E2BIG. (Note argc is always raised to at least 1.)
+ */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = ARG_MAX / sizeof(void *), .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) - 1 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = ARG_MAX / sizeof(void *) + 1, .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) }, .expected_rc = -E2BIG },
+ /* And with one less, we see space for exactly 1 pointer. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = (ARG_MAX / sizeof(void *)) - 1, .envc = 0 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = (ARG_MAX / sizeof(void *)) - 2, },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ /* If we raise rlim_stack / 4 to exactly ARG_MAX, nothing changes. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = ARG_MAX / sizeof(void *), .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) - 1 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = ARG_MAX / sizeof(void *) + 1, .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = (ARG_MAX / sizeof(void *)) - 1, .envc = 0 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = 0, .envc = (ARG_MAX / sizeof(void *)) - 2, },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ /* But raising it another pointer * 4 will provide space for 1 more pointer. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = (ARG_MAX + sizeof(void *)) * 4,
+ .argc = ARG_MAX / sizeof(void *), .envc = 0 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = (ARG_MAX + sizeof(void *)) * 4,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) - 1 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ /* Raising rlim_stack / 4 to _STK_LIM / 4 * 3 will see more space. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * 3),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * 3),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ /* But raising it any further will see no increase. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * 3 + sizeof(void *)),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * 3 + sizeof(void *)),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * _STK_LIM,
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * _STK_LIM,
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+};
+
+static void exec_test_bprm_stack_limits(struct kunit *test)
+{
+ /* Double-check the constants. */
+ KUNIT_EXPECT_EQ(test, _STK_LIM, SZ_8M);
+ KUNIT_EXPECT_EQ(test, ARG_MAX, 32 * SZ_4K);
+ KUNIT_EXPECT_EQ(test, MAX_ARG_STRINGS, 0x7FFFFFFF);
+
+ for (int i = 0; i < ARRAY_SIZE(bprm_stack_limits_results); i++) {
+ const struct bprm_stack_limits_result *result = &bprm_stack_limits_results[i];
+ struct linux_binprm bprm = result->bprm;
+ int rc;
+
+ rc = bprm_stack_limits(&bprm);
+ KUNIT_EXPECT_EQ_MSG(test, rc, result->expected_rc, "on loop %d", i);
+#ifdef CONFIG_MMU
+ KUNIT_EXPECT_EQ_MSG(test, bprm.argmin, result->expected_argmin, "on loop %d", i);
+#endif
+ }
+}
+
+static struct kunit_case exec_test_cases[] = {
+ KUNIT_CASE(exec_test_bprm_stack_limits),
+ {},
+};
+
+static struct kunit_suite exec_test_suite = {
+ .name = "exec",
+ .test_cases = exec_test_cases,
+};
+
+kunit_test_suite(exec_test_suite);
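
The expected values in the table above encode the arithmetic of bprm_stack_limits() in fs/exec.c. Roughly, as a simplified sketch of the logic under test (the MAX_ARG_STRINGS/INT_MAX overflow checks are omitted here):

    static int bprm_stack_limits(struct linux_binprm *bprm)
    {
            unsigned long limit, ptr_size;

            /* Cap argv/envp strings at 3/4 of _STK_LIM or rlim_stack/4 ... */
            limit = _STK_LIM / 4 * 3;
            limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
            /* ... but never below the historical ARG_MAX floor. */
            limit = max_t(unsigned long, limit, ARG_MAX);
            /* argc counts as at least 1, hence the "raised to 1" cases. */
            ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
            if (limit <= ptr_size)
                    return -E2BIG;
            limit -= ptr_size;

            bprm->argmin = bprm->p - limit;
            return 0;
    }

For example, with p = ULONG_MAX, rlim_cur = 0, argc = 1: limit becomes ARG_MAX - sizeof(void *), so argmin = ULONG_MAX - ARG_MAX + sizeof(void *), matching the first expectation above.
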
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 3d5ea2cfad66..a3c7173ef693 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -225,8 +225,8 @@ static const struct constant_table exfat_param_enums[] = {
};
static const struct fs_parameter_spec exfat_parameters[] = {
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("gid", Opt_gid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_gid("gid", Opt_gid),
fsparam_u32oct("umask", Opt_umask),
fsparam_u32oct("dmask", Opt_dmask),
fsparam_u32oct("fmask", Opt_fmask),
@@ -262,10 +262,10 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (opt) {
case Opt_uid:
- opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
+ opts->fs_uid = result.uid;
break;
case Opt_gid:
- opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
+ opts->fs_gid = result.gid;
break;
case Opt_umask:
opts->fs_fmask = result.uint_32;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 07ea3d62b298..4f2dd4ab4486 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -427,7 +427,7 @@ EXPORT_SYMBOL_GPL(exportfs_encode_fh);
struct dentry *
exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
- int fileid_type,
+ int fileid_type, unsigned int flags,
int (*acceptable)(void *, struct dentry *),
void *context)
{
@@ -445,6 +445,11 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
if (IS_ERR_OR_NULL(result))
return result;
+ if ((flags & EXPORT_FH_DIR_ONLY) && !d_is_dir(result)) {
+ err = -ENOTDIR;
+ goto err_result;
+ }
+
/*
 * If no acceptance criteria were specified by the caller, a disconnected
 * dentry is also acceptable. Callers may use this mode to query if
@@ -581,7 +586,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
{
struct dentry *ret;
- ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type,
+ ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 0,
acceptable, context);
if (IS_ERR_OR_NULL(ret)) {
if (ret == ERR_PTR(-ENOMEM))
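
The new flags argument (used via EXPORT_FH_DIR_ONLY above) lets a caller insist that the handle resolve to a directory. A hypothetical in-kernel caller might look like the sketch below; only exportfs_decode_fh_raw() and EXPORT_FH_DIR_ONLY are real, my_acceptable/my_ctx are illustrative:

    struct dentry *dentry;

    dentry = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type,
                                    EXPORT_FH_DIR_ONLY,
                                    my_acceptable, my_ctx);
    if (IS_ERR_OR_NULL(dentry))
            /* ERR_PTR(-ENOTDIR) when the handle named a non-directory */
            return dentry ? PTR_ERR(dentry) : -ESTALE;
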
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 7ae0b61258a7..0a056d97e640 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -31,11 +31,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
ext4_fname_from_fscrypt_name(fname, &name);
-#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
if (err)
ext4_fname_free_filename(fname);
-#endif
+
return err;
}
@@ -51,11 +50,9 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
ext4_fname_from_fscrypt_name(fname, &name);
-#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
if (err)
ext4_fname_free_filename(fname);
-#endif
return err;
}
@@ -70,10 +67,7 @@ void ext4_fname_free_filename(struct ext4_filename *fname)
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ ext4_fname_free_ci_filename(fname);
}
static bool uuid_is_zero(__u8 u[16])
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 983dad8c07ec..8007abd4972d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2511,7 +2511,7 @@ struct ext4_filename {
struct fscrypt_str crypto_buf;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
- struct fscrypt_str cf_name;
+ struct qstr cf_name;
#endif
};
@@ -2745,8 +2745,25 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
#if IS_ENABLED(CONFIG_UNICODE)
extern int ext4_fname_setup_ci_filename(struct inode *dir,
- const struct qstr *iname,
- struct ext4_filename *fname);
+ const struct qstr *iname,
+ struct ext4_filename *fname);
+
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
+{
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+}
+#else
+static inline int ext4_fname_setup_ci_filename(struct inode *dir,
+ const struct qstr *iname,
+ struct ext4_filename *fname)
+{
+ return 0;
+}
+
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
+{
+}
#endif
/* ext4 encryption related stuff goes here crypto.c */
@@ -2769,16 +2786,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
int lookup,
struct ext4_filename *fname)
{
- int err = 0;
fname->usr_fname = iname;
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
-#if IS_ENABLED(CONFIG_UNICODE)
- err = ext4_fname_setup_ci_filename(dir, iname, fname);
-#endif
-
- return err;
+ return ext4_fname_setup_ci_filename(dir, iname, fname);
}
static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2790,10 +2802,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ ext4_fname_free_ci_filename(fname);
}
static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
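
The !CONFIG_UNICODE stubs added above are what make the later hunks possible: once ext4_fname_setup_ci_filename() and ext4_fname_free_ci_filename() exist (as no-ops) in every configuration, call sites can drop their #if IS_ENABLED(CONFIG_UNICODE) guards or convert them to plain if (IS_ENABLED(CONFIG_UNICODE) && ...) tests, which the compiler eliminates as dead code when unicode support is off.
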
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a630b27a4cc6..e6769b97a970 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1390,62 +1390,11 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
}
#if IS_ENABLED(CONFIG_UNICODE)
-/*
- * Test whether a case-insensitive directory entry matches the filename
- * being searched for. If quick is set, assume the name being looked up
- * is already in the casefolded form.
- *
- * Returns: 0 if the directory entry matches, more than 0 if it
- * doesn't match or less than zero on error.
- */
-static int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
- u8 *de_name, size_t de_name_len, bool quick)
-{
- const struct super_block *sb = parent->i_sb;
- const struct unicode_map *um = sb->s_encoding;
- struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
- struct qstr entry = QSTR_INIT(de_name, de_name_len);
- int ret;
-
- if (IS_ENCRYPTED(parent)) {
- const struct fscrypt_str encrypted_name =
- FSTR_INIT(de_name, de_name_len);
-
- decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
- if (!decrypted_name.name)
- return -ENOMEM;
- ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
- &decrypted_name);
- if (ret < 0)
- goto out;
- entry.name = decrypted_name.name;
- entry.len = decrypted_name.len;
- }
-
- if (quick)
- ret = utf8_strncasecmp_folded(um, name, &entry);
- else
- ret = utf8_strncasecmp(um, name, &entry);
- if (ret < 0) {
- /* Handle invalid character sequence as either an error
- * or as an opaque byte sequence.
- */
- if (sb_has_strict_encoding(sb))
- ret = -EINVAL;
- else if (name->len != entry.len)
- ret = 1;
- else
- ret = !!memcmp(name->name, entry.name, entry.len);
- }
-out:
- kfree(decrypted_name.name);
- return ret;
-}
-
int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
struct ext4_filename *name)
{
- struct fscrypt_str *cf_name = &name->cf_name;
+ struct qstr *cf_name = &name->cf_name;
+ unsigned char *buf;
struct dx_hash_info *hinfo = &name->hinfo;
int len;
@@ -1455,18 +1404,18 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
return 0;
}
- cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
- if (!cf_name->name)
+ buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+ if (!buf)
return -ENOMEM;
- len = utf8_casefold(dir->i_sb->s_encoding,
- iname, cf_name->name,
- EXT4_NAME_LEN);
+ len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN);
if (len <= 0) {
- kfree(cf_name->name);
- cf_name->name = NULL;
+ kfree(buf);
+ buf = NULL;
}
+ cf_name->name = buf;
cf_name->len = (unsigned) len;
+
if (!IS_ENCRYPTED(dir))
return 0;
@@ -1502,22 +1451,29 @@ static bool ext4_match(struct inode *parent,
#if IS_ENABLED(CONFIG_UNICODE)
if (IS_CASEFOLDED(parent) &&
(!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
- if (fname->cf_name.name) {
- struct qstr cf = {.name = fname->cf_name.name,
- .len = fname->cf_name.len};
- if (IS_ENCRYPTED(parent)) {
- if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
- fname->hinfo.minor_hash !=
- EXT4_DIRENT_MINOR_HASH(de)) {
-
- return false;
- }
- }
- return !ext4_ci_compare(parent, &cf, de->name,
- de->name_len, true);
- }
- return !ext4_ci_compare(parent, fname->usr_fname, de->name,
- de->name_len, false);
+ /*
+ * Just checking IS_ENCRYPTED(parent) below is not
+ * sufficient to decide whether one can use the hash for
+ * skipping the string comparison, because the key might
+ * have been added right after
+ * ext4_fname_setup_ci_filename(). In this case, a hash
+ * mismatch will be a false negative. Therefore, make
+ * sure cf_name was properly initialized before
+ * considering the calculated hash.
+ */
+ if (IS_ENCRYPTED(parent) && fname->cf_name.name &&
+ (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
+ fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
+ return false;
+ /*
+ * Treat comparison errors as not a match. The
+ * only cases where this happens are disk
+ * corruption or ENOMEM.
+ */
+
+ return generic_ci_match(parent, fname->usr_fname,
+ &fname->cf_name, de->name,
+ de->name_len) > 0;
}
#endif
@@ -1869,8 +1825,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
}
-#if IS_ENABLED(CONFIG_UNICODE)
- if (!inode && IS_CASEFOLDED(dir)) {
+ if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
* well. For now, prevent the negative dentry
@@ -1878,7 +1833,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
*/
return NULL;
}
-#endif
+
return d_splice_alias(inode, dentry);
}
@@ -3208,16 +3163,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
-#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
 * Case-insensitiveness. Eventually we'll want to avoid
 * invalidating the dentries here, along with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
end_rmdir:
brelse(bh);
@@ -3319,16 +3272,15 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
goto out_trace;
retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
-#if IS_ENABLED(CONFIG_UNICODE)
+
/* VFS negative dentries are incompatible with Encoding and
 * Case-insensitiveness. Eventually we'll want to avoid
 * invalidating the dentries here, along with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
out_trace:
trace_ext4_unlink_exit(dentry, retval);
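
Both this ext4 conversion and the f2fs one below converge on the new libfs helper, which absorbs the duplicated ext4_ci_compare()/f2fs_match_ci_name() logic. Its contract, as relied on above (signature per the libfs side of this series):

    /*
     * Handles fscrypt decryption of de_name internally; returns > 0 on a
     * match, 0 on a mismatch, and -errno (e.g. -ENOMEM, or -EINVAL for
     * invalid sequences under strict encoding) on error.
     */
    int generic_ci_match(const struct inode *parent,
                         const struct qstr *name,
                         const struct qstr *folded_name,
                         const u8 *de_name, u32 de_name_len);
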
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c682fb927b64..eb899628e121 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1721,8 +1721,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("bsdgroups", Opt_grpid),
fsparam_flag ("nogrpid", Opt_nogrpid),
fsparam_flag ("sysvgroups", Opt_nogrpid),
- fsparam_u32 ("resgid", Opt_resgid),
- fsparam_u32 ("resuid", Opt_resuid),
+ fsparam_gid ("resgid", Opt_resgid),
+ fsparam_uid ("resuid", Opt_resuid),
fsparam_u32 ("sb", Opt_sb),
fsparam_enum ("errors", Opt_errors, ext4_param_errors),
fsparam_flag ("nouid32", Opt_nouid32),
@@ -2127,8 +2127,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fs_parse_result result;
const struct mount_opts *m;
int is_remount;
- kuid_t uid;
- kgid_t gid;
int token;
token = fs_parse(fc, ext4_param_specs, param, &result);
@@ -2270,23 +2268,11 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->spec |= EXT4_SPEC_s_stripe;
return 0;
case Opt_resuid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid)) {
- ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
- result.uint_32);
- return -EINVAL;
- }
- ctx->s_resuid = uid;
+ ctx->s_resuid = result.uid;
ctx->spec |= EXT4_SPEC_s_resuid;
return 0;
case Opt_resgid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid)) {
- ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
- result.uint_32);
- return -EINVAL;
- }
- ctx->s_resgid = gid;
+ ctx->s_resgid = result.gid;
ctx->spec |= EXT4_SPEC_s_resgid;
return 0;
case Opt_journal_dev:
@@ -3586,14 +3572,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#if !IS_ENABLED(CONFIG_UNICODE)
- if (ext4_has_feature_casefold(sb)) {
+ if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
"mounted without CONFIG_UNICODE");
return 0;
}
-#endif
if (readonly)
return 1;
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index ec2aeccb69a3..8bffdeccdbc3 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -219,8 +219,7 @@ static int f2fs_acl_update_mode(struct mnt_idmap *idmap,
return error;
if (error == 0)
*acl = NULL;
- if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) &&
- !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
+ if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode)))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
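
in_group_or_capable() is the VFS helper that both converted call sites now share; roughly, per fs/inode.c, it is equivalent to the open-coded test it replaces:

    bool in_group_or_capable(struct mnt_idmap *idmap,
                             const struct inode *inode, vfsgid_t vfsgid)
    {
            if (vfsgid_in_group_p(vfsgid))
                    return true;
            if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
                    return true;
            return false;
    }
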
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 02c9355176d3..cbd7a5e96a37 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -42,35 +42,49 @@ static unsigned int bucket_blocks(unsigned int level)
return 4;
}
+#if IS_ENABLED(CONFIG_UNICODE)
/* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */
int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname)
{
-#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = dir->i_sb;
+ unsigned char *buf;
+ int len;
if (IS_CASEFOLDED(dir) &&
!is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) {
- fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
- GFP_NOFS, false, F2FS_SB(sb));
- if (!fname->cf_name.name)
+ buf = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
+ GFP_NOFS, false, F2FS_SB(sb));
+ if (!buf)
return -ENOMEM;
- fname->cf_name.len = utf8_casefold(sb->s_encoding,
- fname->usr_fname,
- fname->cf_name.name,
- F2FS_NAME_LEN);
- if ((int)fname->cf_name.len <= 0) {
- kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
- fname->cf_name.name = NULL;
+
+ len = utf8_casefold(sb->s_encoding, fname->usr_fname,
+ buf, F2FS_NAME_LEN);
+ if (len <= 0) {
+ kmem_cache_free(f2fs_cf_name_slab, buf);
if (sb_has_strict_encoding(sb))
return -EINVAL;
/* fall back to treating name as opaque byte sequence */
+ return 0;
}
+ fname->cf_name.name = buf;
+ fname->cf_name.len = len;
}
-#endif
+
return 0;
}
+void f2fs_free_casefolded_name(struct f2fs_filename *fname)
+{
+ unsigned char *buf = (unsigned char *)fname->cf_name.name;
+
+ if (buf) {
+ kmem_cache_free(f2fs_cf_name_slab, buf);
+ fname->cf_name.name = NULL;
+ }
+}
+#endif /* CONFIG_UNICODE */
+
static int __f2fs_setup_filename(const struct inode *dir,
const struct fscrypt_name *crypt_name,
struct f2fs_filename *fname)
@@ -142,12 +156,7 @@ void f2fs_free_filename(struct f2fs_filename *fname)
kfree(fname->crypto_buf.name);
fname->crypto_buf.name = NULL;
#endif
-#if IS_ENABLED(CONFIG_UNICODE)
- if (fname->cf_name.name) {
- kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
- fname->cf_name.name = NULL;
- }
-#endif
+ f2fs_free_casefolded_name(fname);
}
static unsigned long dir_block_index(unsigned int level,
@@ -176,58 +185,6 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
return f2fs_find_target_dentry(&d, fname, max_slots);
}
-#if IS_ENABLED(CONFIG_UNICODE)
-/*
- * Test whether a case-insensitive directory entry matches the filename
- * being searched for.
- *
- * Returns 1 for a match, 0 for no match, and -errno on an error.
- */
-static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
- const u8 *de_name, u32 de_name_len)
-{
- const struct super_block *sb = dir->i_sb;
- const struct unicode_map *um = sb->s_encoding;
- struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
- struct qstr entry = QSTR_INIT(de_name, de_name_len);
- int res;
-
- if (IS_ENCRYPTED(dir)) {
- const struct fscrypt_str encrypted_name =
- FSTR_INIT((u8 *)de_name, de_name_len);
-
- if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir)))
- return -EINVAL;
-
- decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
- if (!decrypted_name.name)
- return -ENOMEM;
- res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name,
- &decrypted_name);
- if (res < 0)
- goto out;
- entry.name = decrypted_name.name;
- entry.len = decrypted_name.len;
- }
-
- res = utf8_strncasecmp_folded(um, name, &entry);
- /*
- * In strict mode, ignore invalid names. In non-strict mode,
- * fall back to treating them as opaque byte sequences.
- */
- if (res < 0 && !sb_has_strict_encoding(sb)) {
- res = name->len == entry.len &&
- memcmp(name->name, entry.name, name->len) == 0;
- } else {
- /* utf8_strncasecmp_folded returns 0 on match */
- res = (res == 0);
- }
-out:
- kfree(decrypted_name.name);
- return res;
-}
-#endif /* CONFIG_UNICODE */
-
static inline int f2fs_match_name(const struct inode *dir,
const struct f2fs_filename *fname,
const u8 *de_name, u32 de_name_len)
@@ -235,11 +192,11 @@ static inline int f2fs_match_name(const struct inode *dir,
struct fscrypt_name f;
#if IS_ENABLED(CONFIG_UNICODE)
- if (fname->cf_name.name) {
- struct qstr cf = FSTR_TO_QSTR(&fname->cf_name);
+ if (fname->cf_name.name)
+ return generic_ci_match(dir, fname->usr_fname,
+ &fname->cf_name,
+ de_name, de_name_len);
- return f2fs_match_ci_name(dir, &cf, de_name, de_name_len);
- }
#endif
f.usr_fname = fname->usr_fname;
f.disk_name = fname->disk_name;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1974b6aff397..8a9d910aa552 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -531,7 +531,7 @@ struct f2fs_filename {
* internal operation where usr_fname is also NULL. In all these cases
* we fall back to treating the name as an opaque byte sequence.
*/
- struct fscrypt_str cf_name;
+ struct qstr cf_name;
#endif
};
@@ -3533,8 +3533,22 @@ int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
/*
* dir.c
*/
+#if IS_ENABLED(CONFIG_UNICODE)
int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname);
+void f2fs_free_casefolded_name(struct f2fs_filename *fname);
+#else
+static inline int f2fs_init_casefolded_name(const struct inode *dir,
+ struct f2fs_filename *fname)
+{
+ return 0;
+}
+
+static inline void f2fs_free_casefolded_name(struct f2fs_filename *fname)
+{
+}
+#endif /* CONFIG_UNICODE */
+
int f2fs_setup_filename(struct inode *dir, const struct qstr *iname,
int lookup, struct f2fs_filename *fname);
int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5c0b281a70f3..c1ad9b278c47 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -185,7 +185,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- *pino = parent_ino(dentry);
+ *pino = d_parent_ino(dentry);
dput(dentry);
return 1;
}
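
d_parent_ino() is a dcache helper that returns the parent's inode number in one call, avoiding the reference/lock dance of the open-coded parent_ino() pattern it replaces; per fs/dcache.c it reads the field under RCU with a d_seq retry, falling back to d_lock only when the sequence check fails.
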
@@ -923,10 +923,8 @@ static void __setattr_copy(struct mnt_idmap *idmap,
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
+ if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode)))
mode &= ~S_ISGID;
set_acl_inode(inode, mode);
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e54f8c08bda8..1ecde2b45e99 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -576,8 +576,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out_iput;
}
out_splice:
-#if IS_ENABLED(CONFIG_UNICODE)
- if (!inode && IS_CASEFOLDED(dir)) {
+ if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
* well. For now, prevent the negative dentry
@@ -586,7 +585,7 @@ out_splice:
trace_f2fs_lookup_end(dir, dentry, ino, err);
return NULL;
}
-#endif
+
new = d_splice_alias(inode, dentry);
trace_f2fs_lookup_end(dir, !IS_ERR_OR_NULL(new) ? new : dentry,
ino, IS_ERR(new) ? PTR_ERR(new) : err);
@@ -639,16 +638,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_delete_entry(de, page, dir, inode);
f2fs_unlock_op(sbi);
-#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
 * Case-insensitiveness. Eventually we'll want to avoid
 * invalidating the dentries here, along with returning the
* negative dentries at f2fs_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
+
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
fail:
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 496aee53c38a..8712e264071f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -46,10 +46,6 @@
static struct kmem_cache *fsync_entry_slab;
-#if IS_ENABLED(CONFIG_UNICODE)
-extern struct kmem_cache *f2fs_cf_name_slab;
-#endif
-
bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
{
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
@@ -153,11 +149,8 @@ static int init_recovered_filename(const struct inode *dir,
if (err)
return err;
f2fs_hash_filename(dir, fname);
-#if IS_ENABLED(CONFIG_UNICODE)
/* Case-sensitive match is fine for recovery */
- kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ f2fs_free_casefolded_name(fname);
} else {
f2fs_hash_filename(dir, fname);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1f1b3647a998..df4cf31f93df 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -321,7 +321,7 @@ struct kmem_cache *f2fs_cf_name_slab;
static int __init f2fs_create_casefold_cache(void)
{
f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name",
- F2FS_NAME_LEN);
+ F2FS_NAME_LEN);
return f2fs_cf_name_slab ? 0 : -ENOMEM;
}
@@ -1326,13 +1326,13 @@ default_check:
return -EINVAL;
}
#endif
-#if !IS_ENABLED(CONFIG_UNICODE)
- if (f2fs_sb_has_casefold(sbi)) {
+
+ if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) {
f2fs_err(sbi,
"Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE");
return -EINVAL;
}
-#endif
+
/*
* The BLKZONED feature indicates that the drive was formatted with
* zone alignment optimization. This is optional for host-aware
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 66cf4778cf3b..d3e426de5f01 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -7,6 +7,8 @@
#include <linux/hash.h>
#include <linux/ratelimit.h>
#include <linux/msdos_fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
/*
* vfat shortname flags
@@ -51,7 +53,8 @@ struct fat_mount_options {
tz_set:1, /* Filesystem timestamps' offset set */
rodir:1, /* allow ATTR_RO for directory */
discard:1, /* Issue discard requests on deletions */
- dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */
+ dos1xfloppy:1, /* Assume default BPB for DOS 1.x floppies */
+ debug:1; /* Not currently used */
};
#define FAT_HASH_BITS 8
@@ -415,12 +418,21 @@ extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
extern struct inode *fat_build_inode(struct super_block *sb,
struct msdos_dir_entry *de, loff_t i_pos);
extern int fat_sync_inode(struct inode *inode);
-extern int fat_fill_super(struct super_block *sb, void *data, int silent,
- int isvfat, void (*setup)(struct super_block *));
+extern int fat_fill_super(struct super_block *sb, struct fs_context *fc,
+ void (*setup)(struct super_block *));
extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de);
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
+
+extern const struct fs_parameter_spec fat_param_spec[];
+int fat_init_fs_context(struct fs_context *fc, bool is_vfat);
+void fat_free_fc(struct fs_context *fc);
+
+int fat_parse_param(struct fs_context *fc, struct fs_parameter *param,
+ bool is_vfat);
+int fat_reconfigure(struct fs_context *fc);
+
static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
diff --git a/fs/fat/fat_test.c b/fs/fat/fat_test.c
index 2dab4ca1d0d8..1f0062659067 100644
--- a/fs/fat/fat_test.c
+++ b/fs/fat/fat_test.c
@@ -193,4 +193,5 @@ static struct kunit_suite fat_test_suite = {
kunit_test_suites(&fat_test_suite);
+MODULE_DESCRIPTION("KUnit tests for FAT filesystems");
MODULE_LICENSE("GPL v2");
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d9e6fbb6f246..19115fd2d2a4 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -16,7 +16,6 @@
#include <linux/mpage.h>
#include <linux/vfs.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -804,16 +803,17 @@ static void __exit fat_destroy_inodecache(void)
kmem_cache_destroy(fat_inode_cachep);
}
-static int fat_remount(struct super_block *sb, int *flags, char *data)
+int fat_reconfigure(struct fs_context *fc)
{
bool new_rdonly;
+ struct super_block *sb = fc->root->d_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
+ fc->sb_flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
sync_filesystem(sb);
/* make sure we update state on remount. */
- new_rdonly = *flags & SB_RDONLY;
+ new_rdonly = fc->sb_flags & SB_RDONLY;
if (new_rdonly != sb_rdonly(sb)) {
if (new_rdonly)
fat_set_state(sb, 0, 0);
@@ -822,6 +822,7 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
}
return 0;
}
+EXPORT_SYMBOL_GPL(fat_reconfigure);
static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
{
@@ -939,8 +940,6 @@ static const struct super_operations fat_sops = {
.evict_inode = fat_evict_inode,
.put_super = fat_put_super,
.statfs = fat_statfs,
- .remount_fs = fat_remount,
-
.show_options = fat_show_options,
};
@@ -1037,355 +1036,282 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
}
enum {
- Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid,
- Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage,
- Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug,
- Opt_immutable, Opt_dots, Opt_nodots,
- Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
- Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
- Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
- Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
- Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
- Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
+ Opt_check, Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask,
+ Opt_allow_utime, Opt_codepage, Opt_usefree, Opt_nocase, Opt_quiet,
+ Opt_showexec, Opt_debug, Opt_immutable, Opt_dots, Opt_dotsOK,
+ Opt_charset, Opt_shortname, Opt_utf8, Opt_utf8_bool,
+ Opt_uni_xl, Opt_uni_xl_bool, Opt_nonumtail, Opt_nonumtail_bool,
+ Opt_obsolete, Opt_flush, Opt_tz, Opt_rodir, Opt_errors, Opt_discard,
+ Opt_nfs, Opt_nfs_enum, Opt_time_offset, Opt_dos1xfloppy,
};
-static const match_table_t fat_tokens = {
- {Opt_check_r, "check=relaxed"},
- {Opt_check_s, "check=strict"},
- {Opt_check_n, "check=normal"},
- {Opt_check_r, "check=r"},
- {Opt_check_s, "check=s"},
- {Opt_check_n, "check=n"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_dmask, "dmask=%o"},
- {Opt_fmask, "fmask=%o"},
- {Opt_allow_utime, "allow_utime=%o"},
- {Opt_codepage, "codepage=%u"},
- {Opt_usefree, "usefree"},
- {Opt_nocase, "nocase"},
- {Opt_quiet, "quiet"},
- {Opt_showexec, "showexec"},
- {Opt_debug, "debug"},
- {Opt_immutable, "sys_immutable"},
- {Opt_flush, "flush"},
- {Opt_tz_utc, "tz=UTC"},
- {Opt_time_offset, "time_offset=%d"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_discard, "discard"},
- {Opt_nfs_stale_rw, "nfs"},
- {Opt_nfs_stale_rw, "nfs=stale_rw"},
- {Opt_nfs_nostale_ro, "nfs=nostale_ro"},
- {Opt_dos1xfloppy, "dos1xfloppy"},
- {Opt_obsolete, "conv=binary"},
- {Opt_obsolete, "conv=text"},
- {Opt_obsolete, "conv=auto"},
- {Opt_obsolete, "conv=b"},
- {Opt_obsolete, "conv=t"},
- {Opt_obsolete, "conv=a"},
- {Opt_obsolete, "fat=%u"},
- {Opt_obsolete, "blocksize=%u"},
- {Opt_obsolete, "cvf_format=%20s"},
- {Opt_obsolete, "cvf_options=%100s"},
- {Opt_obsolete, "posix"},
- {Opt_err, NULL},
-};
-static const match_table_t msdos_tokens = {
- {Opt_nodots, "nodots"},
- {Opt_nodots, "dotsOK=no"},
- {Opt_dots, "dots"},
- {Opt_dots, "dotsOK=yes"},
- {Opt_err, NULL}
+static const struct constant_table fat_param_check[] = {
+ {"relaxed", 'r'},
+ {"r", 'r'},
+ {"strict", 's'},
+ {"s", 's'},
+ {"normal", 'n'},
+ {"n", 'n'},
+ {}
};
-static const match_table_t vfat_tokens = {
- {Opt_charset, "iocharset=%s"},
- {Opt_shortname_lower, "shortname=lower"},
- {Opt_shortname_win95, "shortname=win95"},
- {Opt_shortname_winnt, "shortname=winnt"},
- {Opt_shortname_mixed, "shortname=mixed"},
- {Opt_utf8_no, "utf8=0"}, /* 0 or no or false */
- {Opt_utf8_no, "utf8=no"},
- {Opt_utf8_no, "utf8=false"},
- {Opt_utf8_yes, "utf8=1"}, /* empty or 1 or yes or true */
- {Opt_utf8_yes, "utf8=yes"},
- {Opt_utf8_yes, "utf8=true"},
- {Opt_utf8_yes, "utf8"},
- {Opt_uni_xl_no, "uni_xlate=0"}, /* 0 or no or false */
- {Opt_uni_xl_no, "uni_xlate=no"},
- {Opt_uni_xl_no, "uni_xlate=false"},
- {Opt_uni_xl_yes, "uni_xlate=1"}, /* empty or 1 or yes or true */
- {Opt_uni_xl_yes, "uni_xlate=yes"},
- {Opt_uni_xl_yes, "uni_xlate=true"},
- {Opt_uni_xl_yes, "uni_xlate"},
- {Opt_nonumtail_no, "nonumtail=0"}, /* 0 or no or false */
- {Opt_nonumtail_no, "nonumtail=no"},
- {Opt_nonumtail_no, "nonumtail=false"},
- {Opt_nonumtail_yes, "nonumtail=1"}, /* empty or 1 or yes or true */
- {Opt_nonumtail_yes, "nonumtail=yes"},
- {Opt_nonumtail_yes, "nonumtail=true"},
- {Opt_nonumtail_yes, "nonumtail"},
- {Opt_rodir, "rodir"},
- {Opt_err, NULL}
+
+static const struct constant_table fat_param_tz[] = {
+ {"UTC", 0},
+ {}
};
-static int parse_options(struct super_block *sb, char *options, int is_vfat,
- int silent, int *debug, struct fat_mount_options *opts)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- char *iocharset;
+static const struct constant_table fat_param_errors[] = {
+ {"continue", FAT_ERRORS_CONT},
+ {"panic", FAT_ERRORS_PANIC},
+ {"remount-ro", FAT_ERRORS_RO},
+ {}
+};
- opts->isvfat = is_vfat;
- opts->fs_uid = current_uid();
- opts->fs_gid = current_gid();
- opts->fs_fmask = opts->fs_dmask = current_umask();
- opts->allow_utime = -1;
- opts->codepage = fat_default_codepage;
- fat_reset_iocharset(opts);
- if (is_vfat) {
- opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
- opts->rodir = 0;
- } else {
- opts->shortname = 0;
- opts->rodir = 1;
- }
- opts->name_check = 'n';
- opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
- opts->unicode_xlate = 0;
- opts->numtail = 1;
- opts->usefree = opts->nocase = 0;
- opts->tz_set = 0;
- opts->nfs = 0;
- opts->errors = FAT_ERRORS_RO;
- *debug = 0;
+static const struct constant_table fat_param_nfs[] = {
+ {"stale_rw", FAT_NFS_STALE_RW},
+ {"nostale_ro", FAT_NFS_NOSTALE_RO},
+ {}
+};
- opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
+/*
+ * These are all obsolete but we still reject invalid options.
+ * The corresponding values are therefore meaningless.
+ */
+static const struct constant_table fat_param_conv[] = {
+ {"binary", 0},
+ {"text", 0},
+ {"auto", 0},
+ {"b", 0},
+ {"t", 0},
+ {"a", 0},
+ {}
+};
- if (!options)
- goto out;
+/* Core options. See below for vfat and msdos extras */
+const struct fs_parameter_spec fat_param_spec[] = {
+ fsparam_enum ("check", Opt_check, fat_param_check),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_u32oct ("dmask", Opt_dmask),
+ fsparam_u32oct ("fmask", Opt_fmask),
+ fsparam_u32oct ("allow_utime", Opt_allow_utime),
+ fsparam_u32 ("codepage", Opt_codepage),
+ fsparam_flag ("usefree", Opt_usefree),
+ fsparam_flag ("nocase", Opt_nocase),
+ fsparam_flag ("quiet", Opt_quiet),
+ fsparam_flag ("showexec", Opt_showexec),
+ fsparam_flag ("debug", Opt_debug),
+ fsparam_flag ("sys_immutable", Opt_immutable),
+ fsparam_flag ("flush", Opt_flush),
+ fsparam_enum ("tz", Opt_tz, fat_param_tz),
+ fsparam_s32 ("time_offset", Opt_time_offset),
+ fsparam_enum ("errors", Opt_errors, fat_param_errors),
+ fsparam_flag ("discard", Opt_discard),
+ fsparam_flag ("nfs", Opt_nfs),
+ fsparam_enum ("nfs", Opt_nfs_enum, fat_param_nfs),
+ fsparam_flag ("dos1xfloppy", Opt_dos1xfloppy),
+ __fsparam(fs_param_is_enum, "conv",
+ Opt_obsolete, fs_param_deprecated, fat_param_conv),
+ __fsparam(fs_param_is_u32, "fat",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(fs_param_is_u32, "blocksize",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(fs_param_is_string, "cvf_format",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(fs_param_is_string, "cvf_options",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(NULL, "posix",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ {}
+};
+EXPORT_SYMBOL_GPL(fat_param_spec);
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+static const struct fs_parameter_spec msdos_param_spec[] = {
+ fsparam_flag_no ("dots", Opt_dots),
+ fsparam_bool ("dotsOK", Opt_dotsOK),
+ {}
+};
- token = match_token(p, fat_tokens, args);
- if (token == Opt_err) {
- if (is_vfat)
- token = match_token(p, vfat_tokens, args);
- else
- token = match_token(p, msdos_tokens, args);
- }
- switch (token) {
- case Opt_check_s:
- opts->name_check = 's';
- break;
- case Opt_check_r:
- opts->name_check = 'r';
- break;
- case Opt_check_n:
- opts->name_check = 'n';
- break;
- case Opt_usefree:
- opts->usefree = 1;
- break;
- case Opt_nocase:
- if (!is_vfat)
- opts->nocase = 1;
- else {
- /* for backward compatibility */
- opts->shortname = VFAT_SFN_DISPLAY_WIN95
- | VFAT_SFN_CREATE_WIN95;
- }
- break;
- case Opt_quiet:
- opts->quiet = 1;
- break;
- case Opt_showexec:
- opts->showexec = 1;
- break;
- case Opt_debug:
- *debug = 1;
- break;
- case Opt_immutable:
- opts->sys_immutable = 1;
- break;
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(opts->fs_uid))
- return -EINVAL;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(opts->fs_gid))
- return -EINVAL;
- break;
- case Opt_umask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask = opts->fs_dmask = option;
- break;
- case Opt_dmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_dmask = option;
- break;
- case Opt_fmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask = option;
- break;
- case Opt_allow_utime:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->allow_utime = option & (S_IWGRP | S_IWOTH);
- break;
- case Opt_codepage:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->codepage = option;
- break;
- case Opt_flush:
- opts->flush = 1;
- break;
- case Opt_time_offset:
- if (match_int(&args[0], &option))
- return -EINVAL;
- /*
- * GMT+-12 zones may have DST corrections so at least
- * 13 hours difference is needed. Make the limit 24
- * just in case someone invents something unusual.
- */
- if (option < -24 * 60 || option > 24 * 60)
- return -EINVAL;
- opts->tz_set = 1;
- opts->time_offset = option;
- break;
- case Opt_tz_utc:
- opts->tz_set = 1;
- opts->time_offset = 0;
- break;
- case Opt_err_cont:
- opts->errors = FAT_ERRORS_CONT;
- break;
- case Opt_err_panic:
- opts->errors = FAT_ERRORS_PANIC;
- break;
- case Opt_err_ro:
- opts->errors = FAT_ERRORS_RO;
- break;
- case Opt_nfs_stale_rw:
- opts->nfs = FAT_NFS_STALE_RW;
- break;
- case Opt_nfs_nostale_ro:
- opts->nfs = FAT_NFS_NOSTALE_RO;
- break;
- case Opt_dos1xfloppy:
- opts->dos1xfloppy = 1;
- break;
+static const struct constant_table fat_param_shortname[] = {
+ {"lower", VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95},
+ {"win95", VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95},
+ {"winnt", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT},
+ {"mixed", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95},
+ {}
+};
- /* msdos specific */
- case Opt_dots:
- opts->dotsOK = 1;
- break;
- case Opt_nodots:
- opts->dotsOK = 0;
- break;
+static const struct fs_parameter_spec vfat_param_spec[] = {
+ fsparam_string ("iocharset", Opt_charset),
+ fsparam_enum ("shortname", Opt_shortname, fat_param_shortname),
+ fsparam_flag ("utf8", Opt_utf8),
+ fsparam_bool ("utf8", Opt_utf8_bool),
+ fsparam_flag ("uni_xlate", Opt_uni_xl),
+ fsparam_bool ("uni_xlate", Opt_uni_xl_bool),
+ fsparam_flag ("nonumtail", Opt_nonumtail),
+ fsparam_bool ("nonumtail", Opt_nonumtail_bool),
+ fsparam_flag ("rodir", Opt_rodir),
+ {}
+};
- /* vfat specific */
- case Opt_charset:
- fat_reset_iocharset(opts);
- iocharset = match_strdup(&args[0]);
- if (!iocharset)
- return -ENOMEM;
- opts->iocharset = iocharset;
- break;
- case Opt_shortname_lower:
- opts->shortname = VFAT_SFN_DISPLAY_LOWER
- | VFAT_SFN_CREATE_WIN95;
- break;
- case Opt_shortname_win95:
- opts->shortname = VFAT_SFN_DISPLAY_WIN95
- | VFAT_SFN_CREATE_WIN95;
- break;
- case Opt_shortname_winnt:
- opts->shortname = VFAT_SFN_DISPLAY_WINNT
- | VFAT_SFN_CREATE_WINNT;
- break;
- case Opt_shortname_mixed:
- opts->shortname = VFAT_SFN_DISPLAY_WINNT
- | VFAT_SFN_CREATE_WIN95;
- break;
- case Opt_utf8_no: /* 0 or no or false */
- opts->utf8 = 0;
- break;
- case Opt_utf8_yes: /* empty or 1 or yes or true */
- opts->utf8 = 1;
- break;
- case Opt_uni_xl_no: /* 0 or no or false */
- opts->unicode_xlate = 0;
- break;
- case Opt_uni_xl_yes: /* empty or 1 or yes or true */
- opts->unicode_xlate = 1;
- break;
- case Opt_nonumtail_no: /* 0 or no or false */
- opts->numtail = 1; /* negated option */
- break;
- case Opt_nonumtail_yes: /* empty or 1 or yes or true */
- opts->numtail = 0; /* negated option */
- break;
- case Opt_rodir:
- opts->rodir = 1;
- break;
- case Opt_discard:
- opts->discard = 1;
- break;
+int fat_parse_param(struct fs_context *fc, struct fs_parameter *param,
+ bool is_vfat)
+{
+ struct fat_mount_options *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- /* obsolete mount options */
- case Opt_obsolete:
- fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
- "not supported now", p);
- break;
- /* unknown option */
- default:
- if (!silent) {
- fat_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" "
- "or missing value", p);
- }
- return -EINVAL;
- }
- }
+ /* remount options have traditionally been ignored */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
-out:
- /* UTF-8 doesn't provide FAT semantics */
- if (!strcmp(opts->iocharset, "utf8")) {
- fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
- " for FAT filesystems, filesystem will be "
- "case sensitive!");
+ opt = fs_parse(fc, fat_param_spec, param, &result);
+ /* If option not found in fat_param_spec, try vfat/msdos options */
+ if (opt == -ENOPARAM) {
+ if (is_vfat)
+ opt = fs_parse(fc, vfat_param_spec, param, &result);
+ else
+ opt = fs_parse(fc, msdos_param_spec, param, &result);
}
- /* If user doesn't specify allow_utime, it's initialized from dmask. */
- if (opts->allow_utime == (unsigned short)-1)
- opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
- if (opts->unicode_xlate)
- opts->utf8 = 0;
- if (opts->nfs == FAT_NFS_NOSTALE_RO) {
- sb->s_flags |= SB_RDONLY;
- sb->s_export_op = &fat_export_ops_nostale;
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_check:
+ opts->name_check = result.uint_32;
+ break;
+ case Opt_usefree:
+ opts->usefree = 1;
+ break;
+ case Opt_nocase:
+ if (!is_vfat)
+ opts->nocase = 1;
+ else {
+ /* for backward compatibility */
+ opts->shortname = VFAT_SFN_DISPLAY_WIN95
+ | VFAT_SFN_CREATE_WIN95;
+ }
+ break;
+ case Opt_quiet:
+ opts->quiet = 1;
+ break;
+ case Opt_showexec:
+ opts->showexec = 1;
+ break;
+ case Opt_debug:
+ opts->debug = 1;
+ break;
+ case Opt_immutable:
+ opts->sys_immutable = 1;
+ break;
+ case Opt_uid:
+ opts->fs_uid = result.uid;
+ break;
+ case Opt_gid:
+ opts->fs_gid = result.gid;
+ break;
+ case Opt_umask:
+ opts->fs_fmask = opts->fs_dmask = result.uint_32;
+ break;
+ case Opt_dmask:
+ opts->fs_dmask = result.uint_32;
+ break;
+ case Opt_fmask:
+ opts->fs_fmask = result.uint_32;
+ break;
+ case Opt_allow_utime:
+ opts->allow_utime = result.uint_32 & (S_IWGRP | S_IWOTH);
+ break;
+ case Opt_codepage:
+ opts->codepage = result.uint_32;
+ break;
+ case Opt_flush:
+ opts->flush = 1;
+ break;
+ case Opt_time_offset:
+ /*
+ * GMT+-12 zones may have DST corrections so at least
+ * 13 hours difference is needed. Make the limit 24
+ * just in case someone invents something unusual.
+ */
+ if (result.int_32 < -24 * 60 || result.int_32 > 24 * 60)
+ return -EINVAL;
+ opts->tz_set = 1;
+ opts->time_offset = result.int_32;
+ break;
+ case Opt_tz:
+ opts->tz_set = 1;
+ opts->time_offset = result.uint_32;
+ break;
+ case Opt_errors:
+ opts->errors = result.uint_32;
+ break;
+ case Opt_nfs:
+ opts->nfs = FAT_NFS_STALE_RW;
+ break;
+ case Opt_nfs_enum:
+ opts->nfs = result.uint_32;
+ break;
+ case Opt_dos1xfloppy:
+ opts->dos1xfloppy = 1;
+ break;
+
+ /* msdos specific */
+ case Opt_dots: /* dots / nodots */
+ opts->dotsOK = !result.negated;
+ break;
+ case Opt_dotsOK: /* dotsOK = yes/no */
+ opts->dotsOK = result.boolean;
+ break;
+
+ /* vfat specific */
+ case Opt_charset:
+ fat_reset_iocharset(opts);
+ opts->iocharset = param->string;
+ param->string = NULL; /* Steal string */
+ break;
+ case Opt_shortname:
+ opts->shortname = result.uint_32;
+ break;
+ case Opt_utf8:
+ opts->utf8 = 1;
+ break;
+ case Opt_utf8_bool:
+ opts->utf8 = result.boolean;
+ break;
+ case Opt_uni_xl:
+ opts->unicode_xlate = 1;
+ break;
+ case Opt_uni_xl_bool:
+ opts->unicode_xlate = result.boolean;
+ break;
+ case Opt_nonumtail:
+ opts->numtail = 0; /* negated option */
+ break;
+ case Opt_nonumtail_bool:
+ opts->numtail = !result.boolean; /* negated option */
+ break;
+ case Opt_rodir:
+ opts->rodir = 1;
+ break;
+ case Opt_discard:
+ opts->discard = 1;
+ break;
+
+ /* obsolete mount options */
+ case Opt_obsolete:
+ printk(KERN_INFO "FAT-fs: \"%s\" option is obsolete, "
+ "not supported now", param->key);
+ break;
+ default:
+ return -EINVAL;
}
return 0;
}
+EXPORT_SYMBOL_GPL(fat_parse_param);
static int fat_read_root(struct inode *inode)
{
@@ -1604,9 +1530,11 @@ out:
/*
* Read the super block of an MS-DOS FS.
*/
-int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
+int fat_fill_super(struct super_block *sb, struct fs_context *fc,
void (*setup)(struct super_block *))
{
+ struct fat_mount_options *opts = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct inode *root_inode = NULL, *fat_inode = NULL;
struct inode *fsinfo_inode = NULL;
struct buffer_head *bh;
@@ -1614,7 +1542,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
struct msdos_sb_info *sbi;
u16 logical_sector_size;
u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
- int debug;
long error;
char buf[50];
struct timespec64 ts;
@@ -1643,9 +1570,27 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
- if (error)
- goto out_fail;
+ /* UTF-8 doesn't provide FAT semantics */
+ if (!strcmp(opts->iocharset, "utf8")) {
+ fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
+ " for FAT filesystems, filesystem will be"
+ " case sensitive!");
+ }
+
+ /* If user doesn't specify allow_utime, it's initialized from dmask. */
+ if (opts->allow_utime == (unsigned short)-1)
+ opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
+ if (opts->unicode_xlate)
+ opts->utf8 = 0;
+ if (opts->nfs == FAT_NFS_NOSTALE_RO) {
+ sb->s_flags |= SB_RDONLY;
+ sb->s_export_op = &fat_export_ops_nostale;
+ }
+
+ /* Apply parsed options to sbi (structure copy) */
+ sbi->options = *opts;
+ /* Transfer ownership of iocharset to sbi->options */
+ opts->iocharset = NULL;
setup(sb); /* flavour-specific stuff that needs options */
@@ -1950,6 +1895,57 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
}
EXPORT_SYMBOL_GPL(fat_flush_inodes);
+int fat_init_fs_context(struct fs_context *fc, bool is_vfat)
+{
+ struct fat_mount_options *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ opts->isvfat = is_vfat;
+ opts->fs_uid = current_uid();
+ opts->fs_gid = current_gid();
+ opts->fs_fmask = opts->fs_dmask = current_umask();
+ opts->allow_utime = -1;
+ opts->codepage = fat_default_codepage;
+ fat_reset_iocharset(opts);
+ if (is_vfat) {
+ opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
+ opts->rodir = 0;
+ } else {
+ opts->shortname = 0;
+ opts->rodir = 1;
+ }
+ opts->name_check = 'n';
+ opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
+ opts->unicode_xlate = 0;
+ opts->numtail = 1;
+ opts->usefree = opts->nocase = 0;
+ opts->tz_set = 0;
+ opts->nfs = 0;
+ opts->errors = FAT_ERRORS_RO;
+ opts->debug = 0;
+
+ opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
+
+ fc->fs_private = opts;
+ /* fc->ops assigned by caller */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fat_init_fs_context);
+
+void fat_free_fc(struct fs_context *fc)
+{
+ struct fat_mount_options *opts = fc->fs_private;
+
+ if (opts->iocharset != fat_default_iocharset)
+ kfree(opts->iocharset);
+ kfree(fc->fs_private);
+}
+EXPORT_SYMBOL_GPL(fat_free_fc);
+
static int __init init_fat_fs(void)
{
int err;
@@ -1978,4 +1974,5 @@ static void __exit exit_fat_fs(void)
module_init(init_fat_fs)
module_exit(exit_fat_fs)
+MODULE_DESCRIPTION("Core FAT filesystem support");
MODULE_LICENSE("GPL");
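
With FAT on the new mount API, options arrive one at a time through fsconfig() rather than as a single comma-separated string. A user-space sketch of exercising the converted driver (device and mountpoint paths are illustrative; the syscall wrappers need a recent libc, e.g. glibc >= 2.36):

    #define _GNU_SOURCE
    #include <sys/mount.h>
    #include <fcntl.h>
    #include <err.h>

    int main(void)
    {
            int fsfd, mfd;

            fsfd = fsopen("vfat", FSOPEN_CLOEXEC);
            if (fsfd < 0)
                    err(1, "fsopen");
            /* Each option lands in vfat_parse_param() -> fat_parse_param() */
            fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sdb1", 0);
            fsconfig(fsfd, FSCONFIG_SET_STRING, "uid", "1000", 0);
            fsconfig(fsfd, FSCONFIG_SET_STRING, "shortname", "mixed", 0);
            if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
                    err(1, "FSCONFIG_CMD_CREATE");
            mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
            if (mfd < 0)
                    err(1, "fsmount");
            if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0)
                    err(1, "move_mount");
            return 0;
    }
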
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 2116c486843b..f06f6ba643cc 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -650,24 +650,48 @@ static void setup(struct super_block *sb)
sb->s_flags |= SB_NOATIME;
}
-static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+static int msdos_fill_super(struct super_block *sb, struct fs_context *fc)
{
- return fat_fill_super(sb, data, silent, 0, setup);
+ return fat_fill_super(sb, fc, setup);
}
-static struct dentry *msdos_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+static int msdos_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+ return get_tree_bdev(fc, msdos_fill_super);
+}
+
+static int msdos_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ return fat_parse_param(fc, param, false);
+}
+
+static const struct fs_context_operations msdos_context_ops = {
+ .parse_param = msdos_parse_param,
+ .get_tree = msdos_get_tree,
+ .reconfigure = fat_reconfigure,
+ .free = fat_free_fc,
+};
+
+static int msdos_init_fs_context(struct fs_context *fc)
+{
+ int err;
+
+ /* Initialize with is_vfat == false */
+ err = fat_init_fs_context(fc, false);
+ if (err)
+ return err;
+
+ fc->ops = &msdos_context_ops;
+ return 0;
}
static struct file_system_type msdos_fs_type = {
.owner = THIS_MODULE,
.name = "msdos",
- .mount = msdos_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .init_fs_context = msdos_init_fs_context,
+ .parameters = fat_param_spec,
};
MODULE_ALIAS_FS("msdos");
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c4d00999a433..6423e1dedf14 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1195,24 +1195,48 @@ static void setup(struct super_block *sb)
sb->s_d_op = &vfat_dentry_ops;
}
-static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+static int vfat_fill_super(struct super_block *sb, struct fs_context *fc)
{
- return fat_fill_super(sb, data, silent, 1, setup);
+ return fat_fill_super(sb, fc, setup);
}
-static struct dentry *vfat_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+static int vfat_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+ return get_tree_bdev(fc, vfat_fill_super);
+}
+
+static int vfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ return fat_parse_param(fc, param, true);
+}
+
+static const struct fs_context_operations vfat_context_ops = {
+ .parse_param = vfat_parse_param,
+ .get_tree = vfat_get_tree,
+ .reconfigure = fat_reconfigure,
+ .free = fat_free_fc,
+};
+
+static int vfat_init_fs_context(struct fs_context *fc)
+{
+ int err;
+
+ /* Initialize with is_vfat == true */
+ err = fat_init_fs_context(fc, true);
+ if (err)
+ return err;
+
+ fc->ops = &vfat_context_ops;
+ return 0;
}
static struct file_system_type vfat_fs_type = {
.owner = THIS_MODULE,
.name = "vfat",
- .mount = vfat_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .init_fs_context = vfat_init_fs_context,
+ .parameters = fat_param_spec,
};
MODULE_ALIAS_FS("vfat");
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 8a7f86c2139a..6e8cea16790e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -115,88 +115,188 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
return err;
}
-static struct vfsmount *get_vfsmount_from_fd(int fd)
+static int get_path_from_fd(int fd, struct path *root)
{
- struct vfsmount *mnt;
-
if (fd == AT_FDCWD) {
struct fs_struct *fs = current->fs;
spin_lock(&fs->lock);
- mnt = mntget(fs->pwd.mnt);
+ *root = fs->pwd;
+ path_get(root);
spin_unlock(&fs->lock);
} else {
struct fd f = fdget(fd);
if (!f.file)
- return ERR_PTR(-EBADF);
- mnt = mntget(f.file->f_path.mnt);
+ return -EBADF;
+ *root = f.file->f_path;
+ path_get(root);
fdput(f);
}
- return mnt;
+
+ return 0;
}
+enum handle_to_path_flags {
+ HANDLE_CHECK_PERMS = (1 << 0),
+ HANDLE_CHECK_SUBTREE = (1 << 1),
+};
+
+struct handle_to_path_ctx {
+ struct path root;
+ enum handle_to_path_flags flags;
+ unsigned int fh_flags;
+};
+
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
{
- return 1;
+ struct handle_to_path_ctx *ctx = context;
+ struct user_namespace *user_ns = current_user_ns();
+ struct dentry *d, *root = ctx->root.dentry;
+ struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt);
+ int retval = 0;
+
+ if (!root)
+ return 1;
+
+ /* Old permission model with global CAP_DAC_READ_SEARCH. */
+ if (!ctx->flags)
+ return 1;
+
+ /*
+ * This is racy since we're not taking rename_lock, but we can
+ * ignore permissions here and only need an approximation of
+ * whether we were able to follow a path to the file.
+ *
+ * It's also potentially expensive on some filesystems,
+ * especially if the path is deep.
+ */
+ d = dget(dentry);
+ while (d != root && !IS_ROOT(d)) {
+ struct dentry *parent = dget_parent(d);
+
+ /*
+ * We know that we have the ability to override DAC permissions
+ * as we've verified this earlier via CAP_DAC_READ_SEARCH. But
+ * we also need to make sure that there aren't any unmapped
+ * inodes in the path that would prevent us from reaching the
+ * file.
+ */
+ if (!privileged_wrt_inode_uidgid(user_ns, idmap,
+ d_inode(parent))) {
+ dput(d);
+ dput(parent);
+ return retval;
+ }
+
+ dput(d);
+ d = parent;
+ }
+
+ if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root)
+ retval = 1;
+ WARN_ON_ONCE(d != root && d != root->d_sb->s_root);
+ dput(d);
+ return retval;
}
-static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
- struct path *path)
+static int do_handle_to_path(struct file_handle *handle, struct path *path,
+ struct handle_to_path_ctx *ctx)
{
- int retval = 0;
int handle_dwords;
+ struct vfsmount *mnt = ctx->root.mnt;
- path->mnt = get_vfsmount_from_fd(mountdirfd);
- if (IS_ERR(path->mnt)) {
- retval = PTR_ERR(path->mnt);
- goto out_err;
- }
/* change the handle size to multiple of sizeof(u32) */
handle_dwords = handle->handle_bytes >> 2;
- path->dentry = exportfs_decode_fh(path->mnt,
+ path->dentry = exportfs_decode_fh_raw(mnt,
(struct fid *)handle->f_handle,
handle_dwords, handle->handle_type,
- vfs_dentry_acceptable, NULL);
- if (IS_ERR(path->dentry)) {
- retval = PTR_ERR(path->dentry);
- goto out_mnt;
+ ctx->fh_flags,
+ vfs_dentry_acceptable, ctx);
+ if (IS_ERR_OR_NULL(path->dentry)) {
+ if (path->dentry == ERR_PTR(-ENOMEM))
+ return -ENOMEM;
+ return -ESTALE;
}
+ path->mnt = mntget(mnt);
return 0;
-out_mnt:
- mntput(path->mnt);
-out_err:
- return retval;
+}
+
+/*
+ * Allow relaxed permissions of file handles if the caller has the
+ * ability to mount the filesystem or create a bind-mount of the
+ * provided @mountdirfd.
+ *
+ * In both cases the caller may be able to get an unobstructed path to
+ * the file encoded in the handle. If the caller is only able to create
+ * a bind-mount, we need to verify that there are no locked mounts on
+ * top of it that could prevent us from getting to the encoded file.
+ *
+ * In principle, locked mounts can prevent the caller from mounting the
+ * filesystem, but that only applies to procfs and sysfs, neither of
+ * which supports decoding file handles.
+ */
+static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
+ unsigned int o_flags)
+{
+ struct path *root = &ctx->root;
+
+ /*
+ * Restrict to O_DIRECTORY to provide a deterministic API that avoids
+ * confusion in the face of disconnected non-directory dentries.
+ *
+ * There's only one dentry for each directory inode (VFS rule)...
+ */
+ if (!(o_flags & O_DIRECTORY))
+ return false;
+
+ if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+ ctx->flags = HANDLE_CHECK_PERMS;
+ else if (is_mounted(root->mnt) &&
+ ns_capable(real_mount(root->mnt)->mnt_ns->user_ns,
+ CAP_SYS_ADMIN) &&
+ !has_locked_children(real_mount(root->mnt), root->dentry))
+ ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
+ else
+ return false;
+
+ /* Are we able to override DAC permissions? */
+ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
+ return false;
+
+ ctx->fh_flags = EXPORT_FH_DIR_ONLY;
+ return true;
}
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
- struct path *path)
+ struct path *path, unsigned int o_flags)
{
int retval = 0;
struct file_handle f_handle;
struct file_handle *handle = NULL;
+ struct handle_to_path_ctx ctx = {};
- /*
- * With handle we don't look at the execute bit on the
- * directory. Ideally we would like CAP_DAC_SEARCH.
- * But we don't have that
- */
- if (!capable(CAP_DAC_READ_SEARCH)) {
- retval = -EPERM;
+ retval = get_path_from_fd(mountdirfd, &ctx.root);
+ if (retval)
goto out_err;
+
+ if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
+ retval = -EPERM;
+ goto out_path;
}
+
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
retval = -EFAULT;
- goto out_err;
+ goto out_path;
}
if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
(f_handle.handle_bytes == 0)) {
retval = -EINVAL;
- goto out_err;
+ goto out_path;
}
handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle) {
retval = -ENOMEM;
- goto out_err;
+ goto out_path;
}
/* copy the full handle */
*handle = f_handle;
@@ -207,10 +307,12 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
goto out_handle;
}
- retval = do_handle_to_path(mountdirfd, handle, path);
+ retval = do_handle_to_path(handle, path, &ctx);
out_handle:
kfree(handle);
+out_path:
+ path_put(&ctx.root);
out_err:
return retval;
}
@@ -223,7 +325,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
struct file *file;
int fd;
- retval = handle_to_path(mountdirfd, ufh, &path);
+ retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
if (retval)
return retval;
diff --git a/fs/file.c b/fs/file.c
index 8076aef9c210..a3b72aa64f11 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -486,12 +486,12 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
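
A worked example of the boundary this fixes: with max_fds == 256 and BITS_PER_LONG == 64, maxbit is 4. If every word of full_fds_bits is set, find_next_zero_bit() returns 4 and bitbit becomes 256, equal to maxfd. The old `bitbit > maxfd` test let that case fall through to a pointless scan of open_fds before the caller noticed the table was full; `>=` returns maxfd immediately.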
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index a4d6ca0b8971..24727ec34e5a 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -308,6 +308,40 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
}
EXPORT_SYMBOL(fs_param_is_fd);
+int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p,
+ struct fs_parameter *param, struct fs_parse_result *result)
+{
+ kuid_t uid;
+
+ if (fs_param_is_u32(log, p, param, result) != 0)
+ return fs_param_bad_value(log, param);
+
+ uid = make_kuid(current_user_ns(), result->uint_32);
+ if (!uid_valid(uid))
+ return inval_plog(log, "Invalid uid '%s'", param->string);
+
+ result->uid = uid;
+ return 0;
+}
+EXPORT_SYMBOL(fs_param_is_uid);
+
+int fs_param_is_gid(struct p_log *log, const struct fs_parameter_spec *p,
+ struct fs_parameter *param, struct fs_parse_result *result)
+{
+ kgid_t gid;
+
+ if (fs_param_is_u32(log, p, param, result) != 0)
+ return fs_param_bad_value(log, param);
+
+ gid = make_kgid(current_user_ns(), result->uint_32);
+ if (!gid_valid(gid))
+ return inval_plog(log, "Invalid gid '%s'", param->string);
+
+ result->gid = gid;
+ return 0;
+}
+EXPORT_SYMBOL(fs_param_is_gid);
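
A sketch of how a filesystem consumes the new typed parameters. The fsparam_uid()/fsparam_gid() specs and the result.uid/result.gid fields are what this patch adds; examplefs, its context structure, and the option tokens are hypothetical:

enum { Opt_uid, Opt_gid };	/* hypothetical option tokens */

static const struct fs_parameter_spec examplefs_param_spec[] = {
	fsparam_uid("uid", Opt_uid),
	fsparam_gid("gid", Opt_gid),
	{}
};

static int examplefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct examplefs_ctx *ctx = fc->fs_private;	/* hypothetical */
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, examplefs_param_spec, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_uid:
		/* Already mapped and validated by fs_param_is_uid(). */
		ctx->uid = result.uid;
		break;
	case Opt_gid:
		ctx->gid = result.gid;
		break;
	}
	return 0;
}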
+
int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 6593ae518115..ed2dd000622e 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -220,10 +220,6 @@ static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
if (!mount_capable(fc))
return -EPERM;
- /* require the new mount api */
- if (exclusive && fc->ops == &legacy_fs_context_ops)
- return -EOPNOTSUPP;
-
fc->phase = FS_CONTEXT_CREATING;
fc->exclusive = exclusive;
@@ -411,6 +407,7 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_PATH:
case FSCONFIG_SET_PATH_EMPTY:
case FSCONFIG_SET_FD:
+ case FSCONFIG_CMD_CREATE_EXCL:
ret = -EOPNOTSUPP;
goto out_f;
}
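
From userspace, exclusive creation looks roughly like the sketch below. The mount API syscalls have no glibc wrappers, so raw syscall(2) stubs are assumed, and FSCONFIG_CMD_CREATE_EXCL is the new command this series adds to <linux/mount.h> (in the original series it fails with EBUSY when a matching superblock already exists):

#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mount.h>

static int sys_fsopen(const char *fstype, unsigned int flags)
{
	return syscall(SYS_fsopen, fstype, flags);
}

static int sys_fsconfig(int fd, unsigned int cmd, const char *key,
			const void *value, int aux)
{
	return syscall(SYS_fsconfig, fd, cmd, key, value, aux);
}

static int create_exclusive(void)
{
	int fsfd = sys_fsopen("ext4", FSOPEN_CLOEXEC);

	if (fsfd < 0)
		return -1;
	sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
	/* Refuses to reuse an existing superblock instead of silently
	 * attaching to it. */
	return sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE_EXCL, NULL, NULL, 0);
}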
@@ -451,7 +448,7 @@ SYSCALL_DEFINE5(fsconfig,
fallthrough;
case FSCONFIG_SET_PATH:
param.type = fs_value_is_filename;
- param.name = getname_flags(_value, lookup_flags, NULL);
+ param.name = getname_flags(_value, lookup_flags);
if (IS_ERR(param.name)) {
ret = PTR_ERR(param.name);
goto out_key;
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 3d192b80a561..04cfd8fee992 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -146,8 +146,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
* be stripped.
*/
if (fc->posix_acl &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) &&
- !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID))
+ !in_group_or_capable(&nop_mnt_idmap, inode,
+ i_gid_into_vfsgid(&nop_mnt_idmap, inode)))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 99e44ea7d875..d8ab4e93916f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -740,8 +740,8 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = {
fsparam_string ("source", OPT_SOURCE),
fsparam_u32 ("fd", OPT_FD),
fsparam_u32oct ("rootmode", OPT_ROOTMODE),
- fsparam_u32 ("user_id", OPT_USER_ID),
- fsparam_u32 ("group_id", OPT_GROUP_ID),
+ fsparam_uid ("user_id", OPT_USER_ID),
+ fsparam_gid ("group_id", OPT_GROUP_ID),
fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS),
fsparam_flag ("allow_other", OPT_ALLOW_OTHER),
fsparam_u32 ("max_read", OPT_MAX_READ),
@@ -755,6 +755,8 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
struct fs_parse_result result;
struct fuse_fs_context *ctx = fsc->fs_private;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
/*
@@ -799,16 +801,26 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
break;
case OPT_USER_ID:
- ctx->user_id = make_kuid(fsc->user_ns, result.uint_32);
- if (!uid_valid(ctx->user_id))
+ kuid = result.uid;
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fsc->user_ns, kuid))
return invalfc(fsc, "Invalid user_id");
+ ctx->user_id = kuid;
ctx->user_id_present = true;
break;
case OPT_GROUP_ID:
- ctx->group_id = make_kgid(fsc->user_ns, result.uint_32);
- if (!gid_valid(ctx->group_id))
+ kgid = result.gid;
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fsc->user_ns, kgid))
return invalfc(fsc, "Invalid group_id");
+ ctx->group_id = kgid;
ctx->group_id_present = true;
break;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 8c34798a0715..744e10b46904 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,6 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
+ HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;
if (S_ISDIR(mode)) {
inode->i_size = 2;
HFS_SB(sb)->folder_count++;
@@ -275,6 +276,8 @@ void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
for (count = 0, i = 0; i < 3; i++)
count += be16_to_cpu(ext[i].count);
HFS_I(inode)->first_blocks = count;
+ HFS_I(inode)->cached_start = 0;
+ HFS_I(inode)->cached_blocks = 0;
inode->i_size = HFS_I(inode)->phys_size = log_size;
HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 6764afa98a6f..eeac99765f0d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -28,6 +28,7 @@
static struct kmem_cache *hfs_inode_cachep;
+MODULE_DESCRIPTION("Apple Macintosh file system support");
MODULE_LICENSE("GPL");
static int hfs_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index ca2ba8c9f82e..901e83d65d20 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -25,19 +25,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->key = ptr + tree->max_key_len + 2;
hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
tree->cnid, __builtin_return_address(0));
- switch (tree->cnid) {
- case HFSPLUS_CAT_CNID:
- mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
- break;
- case HFSPLUS_EXT_CNID:
- mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
- break;
- case HFSPLUS_ATTR_CNID:
- mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
- break;
- default:
- BUG();
- }
+ mutex_lock_nested(&tree->tree_lock,
+ hfsplus_btree_lock_class(tree));
return 0;
}
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 3c572e44f2ad..9c51867dddc5 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -430,7 +430,8 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid,
hfsplus_free_extents(sb, ext_entry, total_blocks - start,
total_blocks);
total_blocks = start;
- mutex_lock(&fd.tree->tree_lock);
+ mutex_lock_nested(&fd.tree->tree_lock,
+ hfsplus_btree_lock_class(fd.tree));
} while (total_blocks > blocks);
hfs_find_exit(&fd);
@@ -592,7 +593,8 @@ void hfsplus_file_truncate(struct inode *inode)
alloc_cnt, alloc_cnt - blk_cnt);
hfsplus_dump_extent(hip->first_extents);
hip->first_blocks = blk_cnt;
- mutex_lock(&fd.tree->tree_lock);
+ mutex_lock_nested(&fd.tree->tree_lock,
+ hfsplus_btree_lock_class(fd.tree));
break;
}
res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
@@ -606,7 +608,8 @@ void hfsplus_file_truncate(struct inode *inode)
hfsplus_free_extents(sb, hip->cached_extents,
alloc_cnt - start, alloc_cnt - blk_cnt);
hfsplus_dump_extent(hip->cached_extents);
- mutex_lock(&fd.tree->tree_lock);
+ mutex_lock_nested(&fd.tree->tree_lock,
+ hfsplus_btree_lock_class(fd.tree));
if (blk_cnt > start) {
hip->extent_state |= HFSPLUS_EXT_DIRTY;
break;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 012a3d003fbe..9e78f181c24f 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -553,6 +553,27 @@ static inline __be32 __hfsp_ut2mt(time64_t ut)
return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET);
}
+static inline enum hfsplus_btree_mutex_classes
+hfsplus_btree_lock_class(struct hfs_btree *tree)
+{
+ enum hfsplus_btree_mutex_classes class;
+
+ switch (tree->cnid) {
+ case HFSPLUS_CAT_CNID:
+ class = CATALOG_BTREE_MUTEX;
+ break;
+ case HFSPLUS_EXT_CNID:
+ class = EXTENTS_BTREE_MUTEX;
+ break;
+ case HFSPLUS_ATTR_CNID:
+ class = ATTR_BTREE_MUTEX;
+ break;
+ default:
+ BUG();
+ }
+ return class;
+}
+
/* compatibility */
#define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) }
#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5661a2e24d03..40d04dba13ac 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -40,7 +40,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
/* Directory containing the bootable system */
vh->finder_info[0] = bvh->finder_info[0] =
- cpu_to_be32(parent_ino(dentry));
+ cpu_to_be32(d_parent_ino(dentry));
/*
* Bootloader. Just using the inode here breaks in the case of
@@ -51,7 +51,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
/* Per spec, the OS X system folder - same as finder_info[0] here */
vh->finder_info[5] = bvh->finder_info[5] =
- cpu_to_be32(parent_ino(dentry));
+ cpu_to_be32(d_parent_ino(dentry));
mutex_unlock(&sbi->vh_mutex);
return 0;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 5a400259ae74..9a1a93e3888b 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -696,7 +696,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
return err;
}
- strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN +
+ strbuf = kzalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN +
XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
if (!strbuf) {
res = -ENOMEM;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index a73d27c4dd58..3eb747d26924 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -16,11 +16,16 @@
#include <linux/seq_file.h>
#include <linux/writeback.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include <linux/namei.h>
#include "hostfs.h"
#include <init.h>
#include <kern.h>
+struct hostfs_fs_info {
+ char *host_root_path;
+};
+
struct hostfs_inode_info {
int fd;
fmode_t mode;
@@ -90,8 +95,10 @@ static char *__dentry_name(struct dentry *dentry, char *name)
char *p = dentry_path_raw(dentry, name, PATH_MAX);
char *root;
size_t len;
+ struct hostfs_fs_info *fsi;
- root = dentry->d_sb->s_fs_info;
+ fsi = dentry->d_sb->s_fs_info;
+ root = fsi->host_root_path;
len = strlen(root);
if (IS_ERR(p)) {
__putname(name);
@@ -196,8 +203,10 @@ static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
long long f_bavail;
long long f_files;
long long f_ffree;
+ struct hostfs_fs_info *fsi;
- err = do_statfs(dentry->d_sb->s_fs_info,
+ fsi = dentry->d_sb->s_fs_info;
+ err = do_statfs(fsi->host_root_path,
&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
&sf->f_namelen);
@@ -245,7 +254,11 @@ static void hostfs_free_inode(struct inode *inode)
static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
{
- const char *root_path = root->d_sb->s_fs_info;
+ struct hostfs_fs_info *fsi;
+ const char *root_path;
+
+ fsi = root->d_sb->s_fs_info;
+ root_path = fsi->host_root_path;
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
@@ -432,31 +445,20 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
static int hostfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
char *buffer;
- loff_t start = page_offset(page);
+ loff_t start = folio_pos(folio);
int bytes_read, ret = 0;
- buffer = kmap_local_page(page);
+ buffer = kmap_local_folio(folio, 0);
bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
PAGE_SIZE);
- if (bytes_read < 0) {
- ClearPageUptodate(page);
- SetPageError(page);
+ if (bytes_read < 0)
ret = bytes_read;
- goto out;
- }
-
- memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read);
-
- ClearPageError(page);
- SetPageUptodate(page);
-
- out:
- flush_dcache_page(page);
+ else
+ buffer = folio_zero_tail(folio, bytes_read, buffer);
kunmap_local(buffer);
- unlock_page(page);
+ folio_end_read(folio, ret == 0);
return ret;
}
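
The conversion above follows a pattern that generalizes to other simple filesystems. A minimal sketch, with examplefs_read() standing in for whatever backend read the filesystem uses:

static int examplefs_read_folio(struct file *file, struct folio *folio)
{
	loff_t pos = folio_pos(folio);
	char *buf = kmap_local_folio(folio, 0);
	int ret = 0;
	ssize_t n;

	n = examplefs_read(file, pos, buf, PAGE_SIZE);	/* hypothetical */
	if (n < 0)
		ret = n;
	else
		/* Zero the rest of the folio past the @n bytes read. */
		buf = folio_zero_tail(folio, n, buf);
	kunmap_local(buf);
	/* Unlocks the folio and marks it uptodate on success. */
	folio_end_read(folio, ret == 0);
	return ret;
}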
@@ -922,10 +924,11 @@ static const struct inode_operations hostfs_link_iops = {
.get_link = hostfs_get_link,
};
-static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
+static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct hostfs_fs_info *fsi = sb->s_fs_info;
+ const char *host_root = fc->source;
struct inode *root_inode;
- char *host_root_path, *req_root = d;
int err;
sb->s_blocksize = 1024;
@@ -939,15 +942,15 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
return err;
/* NULL is printed as '(null)' by printf(): avoid that. */
- if (req_root == NULL)
- req_root = "";
+ if (fc->source == NULL)
+ host_root = "";
- sb->s_fs_info = host_root_path =
- kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root);
- if (host_root_path == NULL)
+ fsi->host_root_path =
+ kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
+ if (fsi->host_root_path == NULL)
return -ENOMEM;
- root_inode = hostfs_iget(sb, host_root_path);
+ root_inode = hostfs_iget(sb, fsi->host_root_path);
if (IS_ERR(root_inode))
return PTR_ERR(root_inode);
@@ -955,7 +958,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
char *name;
iput(root_inode);
- name = follow_link(host_root_path);
+ name = follow_link(fsi->host_root_path);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -972,11 +975,38 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
return 0;
}
-static struct dentry *hostfs_read_sb(struct file_system_type *type,
- int flags, const char *dev_name,
- void *data)
+static int hostfs_fc_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, hostfs_fill_super);
+}
+
+static void hostfs_fc_free(struct fs_context *fc)
{
- return mount_nodev(type, flags, data, hostfs_fill_sb_common);
+ struct hostfs_fs_info *fsi = fc->s_fs_info;
+
+ if (!fsi)
+ return;
+
+ kfree(fsi->host_root_path);
+ kfree(fsi);
+}
+
+static const struct fs_context_operations hostfs_context_ops = {
+ .get_tree = hostfs_fc_get_tree,
+ .free = hostfs_fc_free,
+};
+
+static int hostfs_init_fs_context(struct fs_context *fc)
+{
+ struct hostfs_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ fc->s_fs_info = fsi;
+ fc->ops = &hostfs_context_ops;
+ return 0;
}
static void hostfs_kill_sb(struct super_block *s)
@@ -986,11 +1016,11 @@ static void hostfs_kill_sb(struct super_block *s)
}
static struct file_system_type hostfs_type = {
- .owner = THIS_MODULE,
- .name = "hostfs",
- .mount = hostfs_read_sb,
- .kill_sb = hostfs_kill_sb,
- .fs_flags = 0,
+ .owner = THIS_MODULE,
+ .name = "hostfs",
+ .init_fs_context = hostfs_init_fs_context,
+ .kill_sb = hostfs_kill_sb,
+ .fs_flags = 0,
};
MODULE_ALIAS_FS("hostfs");
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 9184b4584b01..d0edf9ed33b6 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -472,9 +472,8 @@ out:
static int hpfs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- char *link = page_address(page);
- struct inode *i = page->mapping->host;
+ char *link = folio_address(folio);
+ struct inode *i = folio->mapping->host;
struct fnode *fnode;
struct buffer_head *bh;
int err;
@@ -485,17 +484,9 @@ static int hpfs_symlink_read_folio(struct file *file, struct folio *folio)
goto fail;
err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
brelse(bh);
- if (err)
- goto fail;
- hpfs_unlock(i->i_sb);
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
-
fail:
hpfs_unlock(i->i_sb);
- SetPageError(page);
- unlock_page(page);
+ folio_end_read(folio, err == 0);
return err;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 314834a078e9..e73717daa5f9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -793,4 +793,5 @@ static void __exit exit_hpfs_fs(void)
module_init(init_hpfs_fs)
module_exit(exit_hpfs_fs)
+MODULE_DESCRIPTION("OS/2 HPFS file system support");
MODULE_LICENSE("GPL");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 412f295acebe..81dab95f67ed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -73,13 +73,13 @@ enum hugetlb_param {
};
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_string("min_size", Opt_min_size),
fsparam_u32oct("mode", Opt_mode),
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("pagesize", Opt_pagesize),
fsparam_string("size", Opt_size),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -1376,15 +1376,11 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
switch (opt) {
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
- goto bad_val;
+ ctx->uid = result.uid;
return 0;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
- goto bad_val;
+ ctx->gid = result.gid;
return 0;
case Opt_mode:
diff --git a/fs/inode.c b/fs/inode.c
index 3a41f83a4ba5..f356fe2ec2b6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -162,6 +162,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
+ inode->i_state = 0;
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
@@ -231,6 +232,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (unlikely(security_inode_alloc(inode)))
return -ENOMEM;
+
this_cpu_inc(nr_inodes);
return 0;
@@ -886,36 +888,45 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
return freed;
}
-static void __wait_on_freeing_inode(struct inode *inode);
+static void __wait_on_freeing_inode(struct inode *inode, bool locked);
/*
* Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
- void *data)
+ void *data, bool locked)
{
struct inode *inode = NULL;
+ if (locked)
+ lockdep_assert_held(&inode_hash_lock);
+ else
+ lockdep_assert_not_held(&inode_hash_lock);
+
+ rcu_read_lock();
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_sb != sb)
continue;
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return inode;
}
+ rcu_read_unlock();
return NULL;
}
@@ -924,29 +935,39 @@ repeat:
* iget_locked for details.
*/
static struct inode *find_inode_fast(struct super_block *sb,
- struct hlist_head *head, unsigned long ino)
+ struct hlist_head *head, unsigned long ino,
+ bool locked)
{
struct inode *inode = NULL;
+ if (locked)
+ lockdep_assert_held(&inode_hash_lock);
+ else
+ lockdep_assert_not_held(&inode_hash_lock);
+
+ rcu_read_lock();
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_ino != ino)
continue;
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return inode;
}
+ rcu_read_unlock();
return NULL;
}
@@ -1004,14 +1025,7 @@ EXPORT_SYMBOL(get_next_ino);
*/
struct inode *new_inode_pseudo(struct super_block *sb)
{
- struct inode *inode = alloc_inode(sb);
-
- if (inode) {
- spin_lock(&inode->i_lock);
- inode->i_state = 0;
- spin_unlock(&inode->i_lock);
- }
- return inode;
+ return alloc_inode(sb);
}
/**
@@ -1161,7 +1175,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
again:
spin_lock(&inode_hash_lock);
- old = find_inode(inode->i_sb, head, test, data);
+ old = find_inode(inode->i_sb, head, test, data, true);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
@@ -1235,7 +1249,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
struct inode *new = alloc_inode(sb);
if (new) {
- new->i_state = 0;
inode = inode_insert5(new, hashval, test, set, data);
if (unlikely(inode != new))
destroy_inode(new);
@@ -1246,6 +1259,47 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
EXPORT_SYMBOL(iget5_locked);
/**
+ * iget5_locked_rcu - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is equivalent to iget5_locked, except the @test callback must
+ * tolerate the inode not being stable, including being mid-teardown.
+ */
+struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode, *new;
+
+again:
+ inode = find_inode(sb, head, test, data, false);
+ if (inode) {
+ if (IS_ERR(inode))
+ return NULL;
+ wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
+ return inode;
+ }
+
+ new = alloc_inode(sb);
+ if (new) {
+ inode = inode_insert5(new, hashval, test, set, data);
+ if (unlikely(inode != new))
+ destroy_inode(new);
+ }
+ return inode;
+}
+EXPORT_SYMBOL_GPL(iget5_locked_rcu);
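
A sketch of a caller, with all examplefs names hypothetical. The point is that the test callback may now run under RCU against inodes that are concurrently being torn down, so it must only compare fields that remain readable in that window and must never block:

static int examplefs_test(struct inode *inode, void *data)
{
	/* May run on an unstable inode: compare only, never sleep. */
	return EXAMPLEFS_I(inode)->objectid == *(u64 *)data;
}

static int examplefs_set(struct inode *inode, void *data)
{
	EXAMPLEFS_I(inode)->objectid = *(u64 *)data;
	return 0;
}

struct inode *examplefs_iget(struct super_block *sb, u64 objectid)
{
	return iget5_locked_rcu(sb, objectid, examplefs_test,
				examplefs_set, &objectid);
}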
+
+/**
* iget_locked - obtain an inode from a mounted file system
* @sb: super block of file system
* @ino: inode number to get
@@ -1263,9 +1317,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
again:
- spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
- spin_unlock(&inode_hash_lock);
+ inode = find_inode_fast(sb, head, ino, false);
if (inode) {
if (IS_ERR(inode))
return NULL;
@@ -1283,7 +1335,7 @@ again:
spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
- old = find_inode_fast(sb, head, ino);
+ old = find_inode_fast(sb, head, ino, true);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
@@ -1419,7 +1471,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
struct inode *inode;
spin_lock(&inode_hash_lock);
- inode = find_inode(sb, head, test, data);
+ inode = find_inode(sb, head, test, data, true);
spin_unlock(&inode_hash_lock);
return IS_ERR(inode) ? NULL : inode;
@@ -1474,7 +1526,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
+ inode = find_inode_fast(sb, head, ino, true);
spin_unlock(&inode_hash_lock);
if (inode) {
@@ -2235,17 +2287,21 @@ EXPORT_SYMBOL(inode_needs_sync);
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
* will DTRT.
*/
-static void __wait_on_freeing_inode(struct inode *inode)
+static void __wait_on_freeing_inode(struct inode *inode, bool locked)
{
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
+ rcu_read_unlock();
+ if (locked)
+ spin_unlock(&inode_hash_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
- spin_lock(&inode_hash_lock);
+ if (locked)
+ spin_lock(&inode_hash_lock);
+ rcu_read_lock();
}
static __initdata unsigned long ihash_entries;
@@ -2538,6 +2594,7 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
return true;
return false;
}
+EXPORT_SYMBOL(in_group_or_capable);
/**
* mode_strip_sgid - handle the sgid bit for non-directories
diff --git a/fs/internal.h b/fs/internal.h
index ab2225136f60..cdd73209eecb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -17,6 +17,7 @@ struct fs_context;
struct pipe_inode_info;
struct iov_iter;
struct mnt_idmap;
+struct ns_common;
/*
* block/bdev.c
@@ -239,6 +240,7 @@ extern void mnt_pin_kill(struct mount *m);
* fs/nsfs.c
*/
extern const struct dentry_operations ns_dentry_operations;
+int open_namespace(struct ns_common *ns);
/*
* fs/stat.c:
@@ -247,6 +249,8 @@ extern const struct dentry_operations ns_dentry_operations;
int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
+int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
+ struct statx __user *buffer);
/*
* fs/splice.c:
@@ -321,3 +325,15 @@ struct stashed_operations {
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
+/**
+ * path_mounted - check whether path is mounted
+ * @path: path to check
+ *
+ * Determine whether @path refers to the root of a mount.
+ *
+ * Return: true if @path is the root of a mount, false if not.
+ */
+static inline bool path_mounted(const struct path *path)
+{
+ return path->mnt->mnt_root == path->dentry;
+}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 41c8f0c68ef5..f420c53d86ac 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -241,6 +241,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
unsigned block_size = (1 << block_bits);
size_t poff = offset_in_folio(folio, *pos);
size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
+ size_t orig_plen = plen;
unsigned first = poff >> block_bits;
unsigned last = (poff + plen - 1) >> block_bits;
@@ -277,7 +278,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* handle both halves separately so that we properly zero data in the
* page cache for blocks that are entirely outside of i_size.
*/
- if (orig_pos <= isize && orig_pos + length > isize) {
+ if (orig_pos <= isize && orig_pos + orig_plen > isize) {
unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
if (first <= end && last > end)
@@ -306,8 +307,6 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off,
spin_unlock_irqrestore(&ifs->state_lock, flags);
}
- if (error)
- folio_set_error(folio);
if (finished)
folio_end_read(folio, uptodate);
}
@@ -443,6 +442,24 @@ done:
return pos - orig_pos + plen;
}
+static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
+{
+ struct folio *folio = ctx->cur_folio;
+ size_t offset = offset_in_folio(folio, iter->pos);
+ loff_t length = min_t(loff_t, folio_size(folio) - offset,
+ iomap_length(iter));
+ loff_t done, ret;
+
+ for (done = 0; done < length; done += ret) {
+ ret = iomap_readpage_iter(iter, ctx, done);
+ if (ret <= 0)
+ return ret;
+ }
+
+ return done;
+}
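
The helper exists because a large folio can span more than one extent: each iomap_iter() pass maps a single extent, and iomap_read_folio_iter() keeps feeding iomap_readpage_iter() until the portion of the folio covered by that mapping is fully consumed, so no sub-range of the folio is silently skipped.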
+
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
{
struct iomap_iter iter = {
@@ -458,10 +475,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
-
- if (ret < 0)
- folio_set_error(folio);
+ iter.processed = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -697,7 +711,6 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (folio_test_uptodate(folio))
return 0;
- folio_clear_error(folio);
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
@@ -898,11 +911,11 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
loff_t length = iomap_length(iter);
- size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
loff_t pos = iter->pos;
ssize_t total_written = 0;
long status = 0;
struct address_space *mapping = iter->inode->i_mapping;
+ size_t chunk = mapping_max_folio_size(mapping);
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do {
@@ -1543,8 +1556,6 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
/* walk all folios in bio, ending page IO on them */
bio_for_each_folio_all(fi, bio) {
- if (error)
- folio_set_error(fi.folio);
iomap_finish_folio_write(inode, fi.folio, fi.length);
folio_count++;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 93b1077a380a..ed548efdd9bb 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -326,8 +326,8 @@ static const struct fs_parameter_spec isofs_param_spec[] = {
fsparam_u32 ("session", Opt_session),
fsparam_u32 ("sbsector", Opt_sb),
fsparam_enum ("check", Opt_check, isofs_param_check),
- fsparam_u32 ("uid", Opt_uid),
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
/* Note: mode/dmode historically accepted %u not strictly %o */
fsparam_u32 ("mode", Opt_mode),
fsparam_u32 ("dmode", Opt_dmode),
@@ -344,8 +344,6 @@ static int isofs_parse_param(struct fs_context *fc,
struct isofs_options *popt = fc->fs_private;
struct fs_parse_result result;
int opt;
- kuid_t uid;
- kgid_t gid;
unsigned int n;
/* There are no remountable options */
@@ -409,17 +407,11 @@ static int isofs_parse_param(struct fs_context *fc,
case Opt_ignore:
break;
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return -EINVAL;
- popt->uid = uid;
+ popt->uid = result.uid;
popt->uid_set = 1;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return -EINVAL;
- popt->gid = gid;
+ popt->gid = result.gid;
popt->gid_set = 1;
break;
case Opt_mode:
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index d6c17ad69dee..dbf911126e61 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -688,11 +688,10 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
*/
static int rock_ridge_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- char *link = page_address(page);
+ char *link = folio_address(folio);
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
struct buffer_head *bh;
char *rpnt = link;
@@ -779,9 +778,10 @@ repeat:
goto fail;
brelse(bh);
*rpnt = '\0';
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
+ ret = 0;
+end:
+ folio_end_read(folio, ret == 0);
+ return ret;
/* error exit from macro */
out:
@@ -795,9 +795,8 @@ out_bad_span:
fail:
brelse(bh);
error:
- SetPageError(page);
- unlock_page(page);
- return -EIO;
+ ret = -EIO;
+ goto end;
}
const struct address_space_operations isofs_symlink_aops = {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 62ea76da7fdf..e12cb145147e 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -95,13 +95,8 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT,
PAGE_SIZE);
- if (ret) {
- ClearPageUptodate(pg);
- SetPageError(pg);
- } else {
+ if (!ret)
SetPageUptodate(pg);
- ClearPageError(pg);
- }
flush_dcache_page(pg);
kunmap(pg);
@@ -304,10 +299,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
kunmap(pg);
- if (ret) {
- /* There was an error writing. */
- SetPageError(pg);
- }
+ if (ret)
+ mapping_set_error(mapping, ret);
/* Adjust writtenlen for the padding we did, so we don't confuse our caller */
writtenlen -= min(writtenlen, (start - aligned_start));
@@ -330,7 +323,6 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
it gets reread */
jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n",
__func__);
- SetPageError(pg);
ClearPageUptodate(pg);
}
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 0fb7afac298e..9987055293b3 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -557,9 +557,11 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
size_check:
if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
+ int size = min_t(int, EALIST_SIZE(ea_buf->xattr), ea_size);
+
printk(KERN_ERR "ea_get: invalid extended attribute\n");
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
- ea_buf->xattr, ea_size, 1);
+ ea_buf->xattr, size, 1);
ea_release(inode, ea_buf);
rc = -EIO;
goto clean_up;
diff --git a/fs/libfs.c b/fs/libfs.c
index b635ee5adbcc..8aa34870449f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1854,6 +1854,80 @@ static const struct dentry_operations generic_ci_dentry_ops = {
.d_revalidate = fscrypt_d_revalidate,
#endif
};
+
+/**
+ * generic_ci_match() - Match a name (case-insensitively) with a dirent.
+ * This is a filesystem helper for comparison with directory entries.
+ * generic_ci_d_compare should be used in VFS' ->d_compare instead.
+ *
+ * @parent: Inode of the parent of the dirent under comparison
+ * @name: name under lookup.
+ * @folded_name: Optional pre-folded name under lookup
+ * @de_name: Dirent name.
+ * @de_name_len: dirent name length.
+ *
+ * Test whether a case-insensitive directory entry matches the filename
+ * being searched. If @folded_name is provided, it is used instead of
+ * recalculating the casefold of @name.
+ *
+ * Return: > 0 if the directory entry matches, 0 if it doesn't match, or
+ * < 0 on error.
+ */
+int generic_ci_match(const struct inode *parent,
+ const struct qstr *name,
+ const struct qstr *folded_name,
+ const u8 *de_name, u32 de_name_len)
+{
+ const struct super_block *sb = parent->i_sb;
+ const struct unicode_map *um = sb->s_encoding;
+ struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
+ struct qstr dirent = QSTR_INIT(de_name, de_name_len);
+ int res = 0;
+
+ if (IS_ENCRYPTED(parent)) {
+ const struct fscrypt_str encrypted_name =
+ FSTR_INIT((u8 *) de_name, de_name_len);
+
+ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)))
+ return -EINVAL;
+
+ decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
+ if (!decrypted_name.name)
+ return -ENOMEM;
+ res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
+ &decrypted_name);
+ if (res < 0) {
+ kfree(decrypted_name.name);
+ return res;
+ }
+ dirent.name = decrypted_name.name;
+ dirent.len = decrypted_name.len;
+ }
+
+ /*
+ * Attempt a case-sensitive match first. It is cheaper and
+ * should cover most lookups, including all the sane
+ * applications that expect a case-sensitive filesystem.
+ */
+
+ if (dirent.len == name->len &&
+ !memcmp(name->name, dirent.name, dirent.len))
+ goto out;
+
+ if (folded_name->name)
+ res = utf8_strncasecmp_folded(um, folded_name, &dirent);
+ else
+ res = utf8_strncasecmp(um, name, &dirent);
+
+out:
+ kfree(decrypted_name.name);
+ if (res < 0 && sb_has_strict_encoding(sb)) {
+ pr_err_ratelimited("Directory contains filename that is invalid UTF-8");
+ return 0;
+ }
+ return !res;
+}
+EXPORT_SYMBOL(generic_ci_match);
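
A sketch of a call site in a hypothetical casefolding filesystem's dirent walk; dir, name, folded, and de are assumed locals, and the tri-state return is the contract the kernel-doc above defines:

	/* folded may be a qstr with a NULL .name if no pre-folded copy
	 * was computed; generic_ci_match() then folds @name itself. */
	ret = generic_ci_match(dir, name, folded, de->name, de->name_len);
	if (ret < 0)
		return ret;	/* -ENOMEM, -EINVAL, ... */
	if (ret > 0)
		goto found;	/* this directory entry matches */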
#endif
#ifdef CONFIG_FS_ENCRYPTION
diff --git a/fs/locks.c b/fs/locks.c
index 90c8746874de..bdd94c32256f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1367,9 +1367,9 @@ retry:
locks_wake_up_blocks(&left->c);
}
out:
+ trace_posix_lock_inode(inode, request, error);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
- trace_posix_lock_inode(inode, request, error);
/*
* Free any unused locks.
*/
@@ -2448,8 +2448,9 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by releasing the
- * lock that was just acquired. There is no need to do that when we're
+ * Detect close/fcntl races and recover by zapping all POSIX locks
+ * associated with this file and our files_struct, just like on
+ * filp_flush(). There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
if (!error && file_lock->c.flc_type != F_UNLCK &&
@@ -2464,9 +2465,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->c.flc_type = F_UNLCK;
- error = do_lock_file_wait(filp, cmd, file_lock);
- WARN_ON_ONCE(error);
+ locks_remove_posix(filp, files);
error = -EBADF;
}
}
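
The race being handled: a thread sharing our files_struct can close(fd) between fcntl()'s fd-to-file lookup and do_lock_file_wait() completing, so the new lock would outlive the descriptor. Rather than trying to undo only the just-acquired range, the recovery now mirrors filp_flush() and drops every POSIX lock this files_struct holds on the file.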
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7f9a2d8aa420..1c3df63162ef 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -730,5 +730,6 @@ static void __exit exit_minix_fs(void)
module_init(init_minix_fs)
module_exit(exit_minix_fs)
+MODULE_DESCRIPTION("Minix file system");
MODULE_LICENSE("GPL");
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index d6031acc34f0..a944a0f17b53 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,8 +213,7 @@ static int minix_rename(struct mnt_idmap *idmap,
if (!new_de)
goto out_dir;
err = minix_set_link(new_de, new_page, old_inode);
- kunmap(new_page);
- put_page(new_page);
+ unmap_and_put_page(new_page, new_de);
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
diff --git a/fs/mount.h b/fs/mount.h
index 4a42fc68f4cc..ad4b1ddebb54 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,6 +16,8 @@ struct mnt_namespace {
u64 event;
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
+ struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+ refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
struct mnt_pcp {
@@ -152,3 +154,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
}
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index fa8b99a199fa..b5b5ddf9d513 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -48,13 +48,8 @@ static void mpage_read_end_io(struct bio *bio)
struct folio_iter fi;
int err = blk_status_to_errno(bio->bi_status);
- bio_for_each_folio_all(fi, bio) {
- if (err)
- folio_set_error(fi.folio);
- else
- folio_mark_uptodate(fi.folio);
- folio_unlock(fi.folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, err == 0);
bio_put(bio);
}
@@ -65,10 +60,8 @@ static void mpage_write_end_io(struct bio *bio)
int err = blk_status_to_errno(bio->bi_status);
bio_for_each_folio_all(fi, bio) {
- if (err) {
- folio_set_error(fi.folio);
+ if (err)
mapping_set_error(fi.folio->mapping, err);
- }
folio_end_writeback(fi.folio);
}
diff --git a/fs/namei.c b/fs/namei.c
index 37fb0a8aa09a..3a4c40e12f78 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -126,7 +126,7 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
struct filename *
-getname_flags(const char __user *filename, int flags, int *empty)
+getname_flags(const char __user *filename, int flags)
{
struct filename *result;
char *kname;
@@ -148,9 +148,20 @@ getname_flags(const char __user *filename, int flags, int *empty)
result->name = kname;
len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
- if (unlikely(len < 0)) {
- __putname(result);
- return ERR_PTR(len);
+ /*
+ * Handle both empty path and copy failure in one go.
+ */
+ if (unlikely(len <= 0)) {
+ if (unlikely(len < 0)) {
+ __putname(result);
+ return ERR_PTR(len);
+ }
+
+ /* The empty path is special. */
+ if (!(flags & LOOKUP_EMPTY)) {
+ __putname(result);
+ return ERR_PTR(-ENOENT);
+ }
}
/*
@@ -180,6 +191,12 @@ getname_flags(const char __user *filename, int flags, int *empty)
kfree(result);
return ERR_PTR(len);
}
+ /* The empty path is special. */
+ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(-ENOENT);
+ }
if (unlikely(len == PATH_MAX)) {
__putname(kname);
kfree(result);
@@ -188,16 +205,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
}
atomic_set(&result->refcnt, 1);
- /* The empty path is special. */
- if (unlikely(!len)) {
- if (empty)
- *empty = 1;
- if (!(flags & LOOKUP_EMPTY)) {
- putname(result);
- return ERR_PTR(-ENOENT);
- }
- }
-
result->uptr = filename;
result->aname = NULL;
audit_getname(result);
@@ -209,13 +216,13 @@ getname_uflags(const char __user *filename, int uflags)
{
int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
- return getname_flags(filename, flags, NULL);
+ return getname_flags(filename, flags);
}
struct filename *
getname(const char __user * filename)
{
- return getname_flags(filename, 0, NULL);
+ return getname_flags(filename, 0);
}
struct filename *
@@ -1233,29 +1240,48 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link)
*
* Returns 0 if the open is allowed, -ve on error.
*/
-static int may_create_in_sticky(struct mnt_idmap *idmap,
- struct nameidata *nd, struct inode *const inode)
+static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
+ struct inode *const inode)
{
umode_t dir_mode = nd->dir_mode;
- vfsuid_t dir_vfsuid = nd->dir_vfsuid;
+ vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;
- if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
- (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
- likely(!(dir_mode & S_ISVTX)) ||
- vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) ||
- vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid()))
+ if (likely(!(dir_mode & S_ISVTX)))
return 0;
- if (likely(dir_mode & 0002) ||
- (dir_mode & 0020 &&
- ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
- (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
- const char *operation = S_ISFIFO(inode->i_mode) ?
- "sticky_create_fifo" :
- "sticky_create_regular";
- audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
+ if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
+ return 0;
+
+ if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
+ return 0;
+
+ i_vfsuid = i_uid_into_vfsuid(idmap, inode);
+
+ if (vfsuid_eq(i_vfsuid, dir_vfsuid))
+ return 0;
+
+ if (vfsuid_eq_kuid(i_vfsuid, current_fsuid()))
+ return 0;
+
+ if (likely(dir_mode & 0002)) {
+ audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create");
return -EACCES;
}
+
+ if (dir_mode & 0020) {
+ if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) {
+ audit_log_path_denied(AUDIT_ANOM_CREAT,
+ "sticky_create_fifo");
+ return -EACCES;
+ }
+
+ if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) {
+ audit_log_path_denied(AUDIT_ANOM_CREAT,
+ "sticky_create_regular");
+ return -EACCES;
+ }
+ }
+
return 0;
}
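
Concretely: with fs.protected_regular = 1, an O_CREAT open that lands on an existing regular file in a world-writable sticky directory such as /tmp is refused with EACCES unless the file is owned by the opener or by the directory owner; raising the sysctl to 2 extends the same refusal to group-writable sticky directories. FIFOs behave the same way under fs.protected_fifos. The refactor above keeps that behavior while splitting the old compound conditional into one early return per rule.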
@@ -1712,17 +1738,26 @@ static struct dentry *lookup_slow(const struct qstr *name,
}
static inline int may_lookup(struct mnt_idmap *idmap,
- struct nameidata *nd)
+ struct nameidata *restrict nd)
{
- if (nd->flags & LOOKUP_RCU) {
- int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- if (!err) // success, keep going
- return 0;
- if (!try_to_unlazy(nd))
- return -ECHILD; // redo it all non-lazy
- if (err != -ECHILD) // hard error
- return err;
- }
+ int err, mask;
+
+ mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
+ err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+ if (likely(!err))
+ return 0;
+
+ // If we failed, and we weren't in LOOKUP_RCU, it's final
+ if (!(nd->flags & LOOKUP_RCU))
+ return err;
+
+ // Drop out of RCU mode to make sure it wasn't transient
+ if (!try_to_unlazy(nd))
+ return -ECHILD; // redo it all non-lazy
+
+ if (err != -ECHILD) // hard error
+ return err;
+
return inode_permission(idmap, nd->inode, MAY_EXEC);
}
@@ -2163,21 +2198,39 @@ EXPORT_SYMBOL(hashlen_string);
/*
* Calculate the length and hash of the path component, and
- * return the "hash_len" as the result.
+ * return the length as the result.
*/
-static inline u64 hash_name(const void *salt, const char *name)
+static inline const char *hash_name(struct nameidata *nd,
+ const char *name,
+ unsigned long *lastword)
{
- unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+ unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
unsigned long adata, bdata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- len = 0;
- goto inside;
+ /*
+ * The first iteration is special, because it can result in
+ * '.' and '..' and has no mixing other than the final fold.
+ */
+ a = load_unaligned_zeropad(name);
+ b = a ^ REPEAT_BYTE('/');
+ if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) {
+ adata = prep_zero_mask(a, adata, &constants);
+ bdata = prep_zero_mask(b, bdata, &constants);
+ mask = create_zero_mask(adata | bdata);
+ a &= zero_bytemask(mask);
+ *lastword = a;
+ len = find_zero(mask);
+ nd->last.hash = fold_hash(a, y);
+ nd->last.len = len;
+ return name + len;
+ }
+ len = 0;
+ x = 0;
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
-inside:
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
@@ -2185,11 +2238,25 @@ inside:
adata = prep_zero_mask(a, adata, &constants);
bdata = prep_zero_mask(b, bdata, &constants);
mask = create_zero_mask(adata | bdata);
- x ^= a & zero_bytemask(mask);
+ a &= zero_bytemask(mask);
+ x ^= a;
+ len += find_zero(mask);
+ *lastword = 0; // Multi-word components cannot be DOT or DOTDOT
- return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+ nd->last.hash = fold_hash(x, y);
+ nd->last.len = len;
+ return name + len;
}
+/*
+ * Note that the 'last' word is always zero-masked, but
+ * was loaded as a possibly big-endian word.
+ */
+#ifdef __BIG_ENDIAN
+ #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8))
+ #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16))
+#endif
+
#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
/* Return the hash of a string of known length */
@@ -2222,22 +2289,35 @@ EXPORT_SYMBOL(hashlen_string);
* We know there's a real path component here of at least
* one character.
*/
-static inline u64 hash_name(const void *salt, const char *name)
+static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
{
- unsigned long hash = init_name_hash(salt);
- unsigned long len = 0, c;
+ unsigned long hash = init_name_hash(nd->path.dentry);
+ unsigned long len = 0, c, last = 0;
c = (unsigned char)*name;
do {
+ last = (last << 8) + c;
len++;
hash = partial_name_hash(c, hash);
c = (unsigned char)name[len];
} while (c && c != '/');
- return hashlen_create(end_name_hash(hash), len);
+
+ // This is reliable for DOT or DOTDOT, since the component
+ // cannot contain NUL characters - top bits being zero means
+ // we cannot have had any other pathnames.
+ *lastword = last;
+ nd->last.hash = end_name_hash(hash);
+ nd->last.len = len;
+ return name + len;
}
#endif
+#ifndef LAST_WORD_IS_DOT
+ #define LAST_WORD_IS_DOT 0x2e
+ #define LAST_WORD_IS_DOTDOT 0x2e2e
+#endif
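
A quick check of these constants: ASCII '.' is 0x2e, so after zero-masking, a one-character "." component leaves 0x2e in the word and ".." leaves 0x2e2e (little-endian). Big-endian loads put the first bytes at the most significant end, hence the BITS_PER_LONG-8 and BITS_PER_LONG-16 shifts in the CONFIG_DCACHE_WORD_ACCESS variants above. A component of three or more characters can never equal either constant, and multi-word components store 0 explicitly, so the switch on lastword in link_path_walk() below is exact.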
+
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
@@ -2266,45 +2346,38 @@ static int link_path_walk(const char *name, struct nameidata *nd)
for(;;) {
struct mnt_idmap *idmap;
const char *link;
- u64 hash_len;
- int type;
+ unsigned long lastword;
idmap = mnt_idmap(nd->path.mnt);
err = may_lookup(idmap, nd);
if (err)
return err;
- hash_len = hash_name(nd->path.dentry, name);
+ nd->last.name = name;
+ name = hash_name(nd, name, &lastword);
- type = LAST_NORM;
- if (name[0] == '.') switch (hashlen_len(hash_len)) {
- case 2:
- if (name[1] == '.') {
- type = LAST_DOTDOT;
- nd->state |= ND_JUMPED;
- }
- break;
- case 1:
- type = LAST_DOT;
- }
- if (likely(type == LAST_NORM)) {
- struct dentry *parent = nd->path.dentry;
+ switch(lastword) {
+ case LAST_WORD_IS_DOTDOT:
+ nd->last_type = LAST_DOTDOT;
+ nd->state |= ND_JUMPED;
+ break;
+
+ case LAST_WORD_IS_DOT:
+ nd->last_type = LAST_DOT;
+ break;
+
+ default:
+ nd->last_type = LAST_NORM;
nd->state &= ~ND_JUMPED;
+
+ struct dentry *parent = nd->path.dentry;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
- struct qstr this = { { .hash_len = hash_len }, .name = name };
- err = parent->d_op->d_hash(parent, &this);
+ err = parent->d_op->d_hash(parent, &nd->last);
if (err < 0)
return err;
- hash_len = this.hash_len;
- name = this.name;
}
}
- nd->last.hash_len = hash_len;
- nd->last.name = name;
- nd->last_type = type;
-
- name += hashlen_len(hash_len);
if (!*name)
goto OK;
/*
@@ -2922,16 +2995,16 @@ int path_pts(struct path *path)
}
#endif
-int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
- struct path *path, int *empty)
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+ struct path *path)
{
- struct filename *filename = getname_flags(name, flags, empty);
+ struct filename *filename = getname_flags(name, flags);
int ret = filename_lookup(dfd, filename, flags, path, NULL);
putname(filename);
return ret;
}
-EXPORT_SYMBOL(user_path_at_empty);
+EXPORT_SYMBOL(user_path_at);
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
struct inode *inode)
@@ -3572,8 +3645,12 @@ static const char *open_last_lookups(struct nameidata *nd,
else
inode_lock_shared(dir->d_inode);
dentry = lookup_open(nd, file, op, got_write);
- if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
- fsnotify_create(dir->d_inode, dentry);
+ if (!IS_ERR(dentry)) {
+ if (file->f_mode & FMODE_CREATED)
+ fsnotify_create(dir->d_inode, dentry);
+ if (file->f_mode & FMODE_OPENED)
+ fsnotify_open(file);
+ }
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
@@ -3700,6 +3777,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
error = dir->i_op->tmpfile(idmap, dir, file, mode);
dput(child);
+ if (file->f_mode & FMODE_OPENED)
+ fsnotify_open(file);
if (error)
return error;
/* Don't check for other permissions, the inode was just created */
diff --git a/fs/namespace.c b/fs/namespace.c
index 5a51315c6678..221db9de4729 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -70,7 +70,8 @@ static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
/* Don't allow confusion with old 32bit mount ID */
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+#define MNT_UNIQUE_ID_OFFSET (1ULL << 32)
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
@@ -78,6 +79,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
struct mount_kattr {
unsigned int attr_set;
@@ -103,6 +106,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
+static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
+{
+ u64 seq_b = ns->seq;
+
+ if (seq < seq_b)
+ return -1;
+ if (seq > seq_b)
+ return 1;
+ return 0;
+}
+
+static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
+}
+
+static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct mnt_namespace *ns_a = node_to_mnt_ns(a);
+ struct mnt_namespace *ns_b = node_to_mnt_ns(b);
+ u64 seq_a = ns_a->seq;
+
+ return mnt_ns_cmp(seq_a, ns_b) < 0;
+}
+
+static void mnt_ns_tree_add(struct mnt_namespace *ns)
+{
+ guard(write_lock)(&mnt_ns_tree_lock);
+ rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+}
+
+static void mnt_ns_release(struct mnt_namespace *ns)
+{
+ lockdep_assert_not_held(&mnt_ns_tree_lock);
+
+ /* keep alive for {list,stat}mount() */
+ if (refcount_dec_and_test(&ns->passive)) {
+ put_user_ns(ns->user_ns);
+ kfree(ns);
+ }
+}
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+
+static void mnt_ns_tree_remove(struct mnt_namespace *ns)
+{
+ /* remove from global mount namespace list */
+ if (!is_anon_ns(ns)) {
+ guard(write_lock)(&mnt_ns_tree_lock);
+ rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+ }
+
+ mnt_ns_release(ns);
+}
+
+/*
+ * Returns the mount namespace which either has the specified id, or has the
+ * smallest id greater than the specified one.
+ */
+static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+{
+ struct rb_node *node = mnt_ns_tree.rb_node;
+ struct mnt_namespace *ret = NULL;
+
+ lockdep_assert_held(&mnt_ns_tree_lock);
+
+ while (node) {
+ struct mnt_namespace *n = node_to_mnt_ns(node);
+
+ if (mnt_ns_id <= n->seq) {
+ ret = node_to_mnt_ns(node);
+ if (mnt_ns_id == n->seq)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Lookup a mount namespace by id and take a passive reference count. Taking a
+ * passive reference means the mount namespace can be emptied if, e.g., the last
+ * task holding an active reference exits. To access the mounts of the
+ * namespace the @namespace_sem must first be acquired. If the namespace has
+ * already shut down before acquiring @namespace_sem, {list,stat}mount() will
+ * see that the mount rbtree of the namespace is empty.
+ */
+static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
+{
+ struct mnt_namespace *ns;
+
+ guard(read_lock)(&mnt_ns_tree_lock);
+ ns = mnt_ns_find_id_at(mnt_ns_id);
+ if (!ns || ns->seq != mnt_ns_id)
+ return NULL;
+
+ refcount_inc(&ns->passive);
+ return ns;
+}
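
A sketch of the intended consumer (the actual statmount()/listmount() plumbing lands elsewhere in the series); the DEFINE_FREE() helper above pairs with __free() so the passive reference is dropped on every exit path:

	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;

	ns = lookup_mnt_ns(mnt_ns_id);
	if (!ns)
		return -ENOENT;

	/* Mounts may only be inspected under namespace_sem; a namespace
	 * that already shut down simply presents an empty mount rbtree. */
	down_read(&namespace_sem);
	/* ... walk ns->mounts ... */
	up_read(&namespace_sem);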
+
static inline void lock_mount_hash(void)
{
write_seqlock(&mount_lock);
@@ -1448,6 +1554,30 @@ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
return ret;
}
+/*
+ * Returns the mount which either has the specified mnt_id, or has the
+ * greatest id less than the specified one.
+ */
+static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
+{
+ struct rb_node *node = ns->mounts.rb_node;
+ struct mount *ret = NULL;
+
+ while (node) {
+ struct mount *m = node_to_mount(node);
+
+ if (mnt_id >= m->mnt_id_unique) {
+ ret = node_to_mount(node);
+ if (mnt_id == m->mnt_id_unique)
+ break;
+ node = node->rb_right;
+ } else {
+ node = node->rb_left;
+ }
+ }
+ return ret;
+}
+
#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
@@ -1846,19 +1976,6 @@ bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
-/**
- * path_mounted - check whether path is mounted
- * @path: path to check
- *
- * Determine whether @path refers to the root of a mount.
- *
- * Return: true if @path is the root of a mount, false if not.
- */
-static inline bool path_mounted(const struct path *path)
-{
- return path->mnt->mnt_root == path->dentry;
-}
-
static void warn_mandlock(void)
{
pr_warn_once("=======================================================\n"
@@ -1966,69 +2083,72 @@ static bool mnt_ns_loop(struct dentry *dentry)
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
-struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
+struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
int flag)
{
- struct mount *res, *p, *q, *r, *parent;
+ struct mount *res, *src_parent, *src_root_child, *src_mnt,
+ *dst_parent, *dst_mnt;
- if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
+ if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
return ERR_PTR(-EINVAL);
if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
return ERR_PTR(-EINVAL);
- res = q = clone_mnt(mnt, dentry, flag);
- if (IS_ERR(q))
- return q;
+ res = dst_mnt = clone_mnt(src_root, dentry, flag);
+ if (IS_ERR(dst_mnt))
+ return dst_mnt;
- q->mnt_mountpoint = mnt->mnt_mountpoint;
+ src_parent = src_root;
+ dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
- p = mnt;
- list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
- struct mount *s;
- if (!is_subdir(r->mnt_mountpoint, dentry))
+ list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
+ if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
continue;
- for (s = r; s; s = next_mnt(s, r)) {
+ for (src_mnt = src_root_child; src_mnt;
+ src_mnt = next_mnt(src_mnt, src_root_child)) {
if (!(flag & CL_COPY_UNBINDABLE) &&
- IS_MNT_UNBINDABLE(s)) {
- if (s->mnt.mnt_flags & MNT_LOCKED) {
+ IS_MNT_UNBINDABLE(src_mnt)) {
+ if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
/* Both unbindable and locked. */
- q = ERR_PTR(-EPERM);
+ dst_mnt = ERR_PTR(-EPERM);
goto out;
} else {
- s = skip_mnt_tree(s);
+ src_mnt = skip_mnt_tree(src_mnt);
continue;
}
}
if (!(flag & CL_COPY_MNT_NS_FILE) &&
- is_mnt_ns_file(s->mnt.mnt_root)) {
- s = skip_mnt_tree(s);
+ is_mnt_ns_file(src_mnt->mnt.mnt_root)) {
+ src_mnt = skip_mnt_tree(src_mnt);
continue;
}
- while (p != s->mnt_parent) {
- p = p->mnt_parent;
- q = q->mnt_parent;
+ while (src_parent != src_mnt->mnt_parent) {
+ src_parent = src_parent->mnt_parent;
+ dst_mnt = dst_mnt->mnt_parent;
}
- p = s;
- parent = q;
- q = clone_mnt(p, p->mnt.mnt_root, flag);
- if (IS_ERR(q))
+
+ src_parent = src_mnt;
+ dst_parent = dst_mnt;
+ dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
+ if (IS_ERR(dst_mnt))
goto out;
lock_mount_hash();
- list_add_tail(&q->mnt_list, &res->mnt_list);
- attach_mnt(q, parent, p->mnt_mp, false);
+ list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
+ attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
unlock_mount_hash();
}
}
return res;
+
out:
if (res) {
lock_mount_hash();
umount_tree(res, UMOUNT_SYNC);
unlock_mount_hash();
}
- return q;
+ return dst_mnt;
}
/* Caller should check returned pointer for errors */
@@ -2078,7 +2198,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -3709,8 +3829,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
if (!is_anon_ns(ns))
ns_free_inum(&ns->ns);
dec_mnt_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
- kfree(ns);
+ mnt_ns_tree_remove(ns);
}
/*
@@ -3749,7 +3868,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
+ refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
+ RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
@@ -3826,6 +3947,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
+ mnt_ns_tree_add(new_ns);
namespace_unlock();
if (rootmnt)
@@ -4843,6 +4965,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
+{
+ s->sm.mask |= STATMOUNT_MNT_NS_ID;
+ s->sm.mnt_ns_id = ns->seq;
+}
+
+static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ int err;
+
+ if (sb->s_op->show_options) {
+ size_t start = seq->count;
+
+ err = sb->s_op->show_options(seq, mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ /* skip leading comma */
+ memmove(seq->buf + start, seq->buf + start + 1,
+ seq->count - start - 1);
+ seq->count--;
+ }
+
+ return 0;
+}
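Every ->show_options() implementation emits options as ",opt1,opt2", so the freshly appended region of the seq buffer starts with a comma that user space should not see; the memmove() shifts that region left by one byte in place. A tiny standalone demonstration of the same fix-up (a plain C string standing in for the seq_file buffer):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[64] = "";
		size_t start = strlen(buf);	/* seq->count before the call */

		strcat(buf, ",ro,noatime,nodiratime"); /* show_options() style */

		size_t count = strlen(buf);
		if (count > start) {
			/* skip the leading comma, as statmount_mnt_opts() does */
			memmove(buf + start, buf + start + 1, count - start - 1);
			buf[--count] = '\0';
		}
		puts(buf);	/* prints: ro,noatime,nodiratime */
		return 0;
	}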
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret;
@@ -4863,6 +5019,10 @@ static int statmount_string(struct kstatmount *s, u64 flag)
sm->mnt_point = seq->count;
ret = statmount_mnt_point(s, seq);
break;
+ case STATMOUNT_MNT_OPTS:
+ sm->mnt_opts = seq->count;
+ ret = statmount_mnt_opts(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
@@ -4903,23 +5063,84 @@ static int copy_statmount_to_user(struct kstatmount *s)
return 0;
}
-static int do_statmount(struct kstatmount *s)
+static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
- struct mount *m = real_mount(s->mnt);
+ struct rb_node *node;
+
+ if (reverse)
+ node = rb_prev(&curr->mnt_node);
+ else
+ node = rb_next(&curr->mnt_node);
+
+ return node_to_mount(node);
+}
+
+static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
+{
+ struct mount *first, *child;
+
+ rwsem_assert_held(&namespace_sem);
+
+ /* We're looking at our own ns, just use get_fs_root. */
+ if (ns == current->nsproxy->mnt_ns) {
+ get_fs_root(current->fs, root);
+ return 0;
+ }
+
+ /*
+ * We have to find the first mount in the requested namespace and use
+ * that; however, it may not exist, so handle that properly.
+ */
+ if (RB_EMPTY_ROOT(&ns->mounts))
+ return -ENOENT;
+
+ first = child = ns->root;
+ for (;;) {
+ child = listmnt_next(child, false);
+ if (!child)
+ return -ENOENT;
+ if (child->mnt_parent == first)
+ break;
+ }
+
+ root->mnt = mntget(&child->mnt);
+ root->dentry = dget(root->mnt->mnt_root);
+ return 0;
+}
+
+static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
+ struct mnt_namespace *ns)
+{
+ struct path root __free(path_put) = {};
+ struct mount *m;
int err;
+ /* Has the namespace already been emptied? */
+ if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
+ return -ENOENT;
+
+ s->mnt = lookup_mnt_in_ns(mnt_id, ns);
+ if (!s->mnt)
+ return -ENOENT;
+
+ err = grab_requested_root(ns, &root);
+ if (err)
+ return err;
+
/*
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
- !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ m = real_mount(s->mnt);
+ if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
err = security_sb_statfs(s->mnt->mnt_root);
if (err)
return err;
+ s->root = root;
if (s->mask & STATMOUNT_SB_BASIC)
statmount_sb_basic(s);
@@ -4938,6 +5159,12 @@ static int do_statmount(struct kstatmount *s)
if (!err && s->mask & STATMOUNT_MNT_POINT)
err = statmount_string(s, STATMOUNT_MNT_POINT);
+ if (!err && s->mask & STATMOUNT_MNT_OPTS)
+ err = statmount_string(s, STATMOUNT_MNT_OPTS);
+
+ if (!err && s->mask & STATMOUNT_MNT_NS_ID)
+ statmount_mnt_ns_id(s, ns);
+
if (err)
return err;
@@ -4955,6 +5182,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
return true;
}
+#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
+ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
+
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
size_t seq_size)
@@ -4966,10 +5196,18 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
ks->mask = kreq->param;
ks->buf = buf;
ks->bufsize = bufsize;
- ks->seq.size = seq_size;
- ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
- if (!ks->seq.buf)
- return -ENOMEM;
+
+ if (ks->mask & STATMOUNT_STRING_REQ) {
+ if (bufsize == sizeof(ks->sm))
+ return -EOVERFLOW;
+
+ ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
+ if (!ks->seq.buf)
+ return -ENOMEM;
+
+ ks->seq.size = seq_size;
+ }
+
return 0;
}
@@ -4979,7 +5217,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
int ret;
size_t usize;
- BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
ret = get_user(usize, &req->size);
if (ret)
@@ -4994,16 +5232,32 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
return ret;
if (kreq->spare != 0)
return -EINVAL;
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
return 0;
}
+/*
+ * If the user requested a specific mount namespace id, look it up and return
+ * it; otherwise grab a passive reference on the current task's mount
+ * namespace and return that.
+ */
+static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+{
+ if (mnt_ns_id)
+ return lookup_mnt_ns(mnt_ns_id);
+ refcount_inc(&current->nsproxy->mnt_ns->passive);
+ return current->nsproxy->mnt_ns;
+}
+
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
struct statmount __user *, buf, size_t, bufsize,
unsigned int, flags)
{
- struct vfsmount *mnt;
+ struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
+ struct kstatmount *ks __free(kfree) = NULL;
struct mnt_id_req kreq;
- struct kstatmount ks;
/* We currently support retrieval of 3 strings. */
size_t seq_size = 3 * PATH_MAX;
int ret;
@@ -5015,64 +5269,88 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
+ ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ if (!ns)
+ return -ENOENT;
+
+ if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ return -ENOENT;
+
+ ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
+ if (!ks)
+ return -ENOMEM;
+
retry:
- ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
+ ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
if (ret)
return ret;
- down_read(&namespace_sem);
- mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
- if (!mnt) {
- up_read(&namespace_sem);
- kvfree(ks.seq.buf);
- return -ENOENT;
- }
-
- ks.mnt = mnt;
- get_fs_root(current->fs, &ks.root);
- ret = do_statmount(&ks);
- path_put(&ks.root);
- up_read(&namespace_sem);
+ scoped_guard(rwsem_read, &namespace_sem)
+ ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
if (!ret)
- ret = copy_statmount_to_user(&ks);
- kvfree(ks.seq.buf);
+ ret = copy_statmount_to_user(ks);
+ kvfree(ks->seq.buf);
if (retry_statmount(ret, &seq_size))
goto retry;
return ret;
}
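With the VER1 request layout, user space can now aim statmount() at a mount in another namespace by filling mnt_ns_id. A hedged sketch of a caller (assumes <linux/mount.h> new enough for STATMOUNT_MNT_OPTS/STATMOUNT_MNT_NS_ID and MNT_ID_REQ_SIZE_VER1; the syscall-number fallback is an assumption for older headers):

	#define _GNU_SOURCE
	#include <linux/mount.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_statmount
	#define __NR_statmount 457	/* assumption: value on current kernels */
	#endif

	int main(int argc, char **argv)
	{
		struct mnt_id_req req = {
			.size = MNT_ID_REQ_SIZE_VER1,
			.param = STATMOUNT_MNT_OPTS | STATMOUNT_MNT_NS_ID,
		};
		size_t bufsize = 1 << 16;
		struct statmount *sm = malloc(bufsize);

		if (argc < 2 || !sm)
			return 2;
		req.mnt_id = strtoull(argv[1], NULL, 0); /* 64-bit unique id */
		if (argc > 2)				 /* 0 = own namespace */
			req.mnt_ns_id = strtoull(argv[2], NULL, 0);

		if (syscall(__NR_statmount, &req, sm, bufsize, 0) < 0) {
			perror("statmount");
			return 1;
		}
		if (sm->mask & STATMOUNT_MNT_OPTS)
			printf("opts:      %s\n", sm->str + sm->mnt_opts);
		if (sm->mask & STATMOUNT_MNT_NS_ID)
			printf("mnt_ns_id: %llu\n",
			       (unsigned long long)sm->mnt_ns_id);
		free(sm);
		return 0;
	}

A robust caller retries with a larger buffer when the syscall fails with EOVERFLOW or EAGAIN, mirroring retry_statmount() above.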
-static struct mount *listmnt_next(struct mount *curr)
+static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
+ u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
+ bool reverse)
{
- return node_to_mount(rb_next(&curr->mnt_node));
-}
-
-static ssize_t do_listmount(struct mount *first, struct path *orig,
- u64 mnt_parent_id, u64 __user *mnt_ids,
- size_t nr_mnt_ids, const struct path *root)
-{
- struct mount *r;
+ struct path root __free(path_put) = {};
+ struct path orig;
+ struct mount *r, *first;
ssize_t ret;
+ rwsem_assert_held(&namespace_sem);
+
+ ret = grab_requested_root(ns, &root);
+ if (ret)
+ return ret;
+
+ if (mnt_parent_id == LSMT_ROOT) {
+ orig = root;
+ } else {
+ orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
+ if (!orig.mnt)
+ return -ENOENT;
+ orig.dentry = orig.mnt->mnt_root;
+ }
+
/*
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) &&
- !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- ret = security_sb_statfs(orig->dentry);
+ ret = security_sb_statfs(orig.dentry);
if (ret)
return ret;
- for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
+ if (!last_mnt_id) {
+ if (reverse)
+ first = node_to_mount(rb_last(&ns->mounts));
+ else
+ first = node_to_mount(rb_first(&ns->mounts));
+ } else {
+ if (reverse)
+ first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
+ else
+ first = mnt_find_id_at(ns, last_mnt_id + 1);
+ }
+
+ for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) {
if (r->mnt_id_unique == mnt_parent_id)
continue;
- if (!is_path_reachable(r, r->mnt.mnt_root, orig))
+ if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
continue;
- if (put_user(r->mnt_id_unique, mnt_ids))
- return -EFAULT;
+ *mnt_ids = r->mnt_id_unique;
mnt_ids++;
nr_mnt_ids--;
ret++;
@@ -5080,22 +5358,26 @@ static ssize_t do_listmount(struct mount *first, struct path *orig,
return ret;
}
-SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
- mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
+ u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ u64 *kmnt_ids __free(kvfree) = NULL;
+ const size_t maxcount = 1000000;
+ struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
- struct mount *first;
- struct path root, orig;
- u64 mnt_parent_id, last_mnt_id;
- const size_t maxcount = (size_t)-1 >> 3;
+ u64 last_mnt_id;
ssize_t ret;
- if (flags)
+ if (flags & ~LISTMOUNT_REVERSE)
return -EINVAL;
+ /*
+ * If the mount namespace really has more than 1 million mounts, the
+ * caller must iterate over the mount namespace (and reconsider their
+ * system design...).
+ */
if (unlikely(nr_mnt_ids > maxcount))
- return -EFAULT;
+ return -EOVERFLOW;
if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
return -EFAULT;
@@ -5103,33 +5385,37 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
ret = copy_mnt_id_req(req, &kreq);
if (ret)
return ret;
- mnt_parent_id = kreq.mnt_id;
+
last_mnt_id = kreq.param;
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
- down_read(&namespace_sem);
- get_fs_root(current->fs, &root);
- if (mnt_parent_id == LSMT_ROOT) {
- orig = root;
- } else {
- ret = -ENOENT;
- orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
- if (!orig.mnt)
- goto err;
- orig.dentry = orig.mnt->mnt_root;
- }
- if (!last_mnt_id)
- first = node_to_mount(rb_first(&ns->mounts));
- else
- first = mnt_find_id_at(ns, last_mnt_id + 1);
+ kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
+ GFP_KERNEL_ACCOUNT);
+ if (!kmnt_ids)
+ return -ENOMEM;
+
+ ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ if (!ns)
+ return -ENOENT;
+
+ if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ return -ENOENT;
+
+ scoped_guard(rwsem_read, &namespace_sem)
+ ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
+ nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
+ if (ret <= 0)
+ return ret;
+
+ if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
+ return -EFAULT;
- ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
-err:
- path_put(&root);
- up_read(&namespace_sem);
return ret;
}
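The reworked listmount() buffers ids in the kernel and caps a single call at one million entries, so large namespaces are walked in pages: pass the last id you received back in as param and the kernel resumes just past it (or just before it with LISTMOUNT_REVERSE). A hedged sketch of that loop, under the same header assumptions as the statmount() example above:

	#define _GNU_SOURCE
	#include <linux/mount.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_listmount
	#define __NR_listmount 458	/* assumption: value on current kernels */
	#endif
	#ifndef LISTMOUNT_REVERSE
	#define LISTMOUNT_REVERSE (1 << 0)	/* assumption for older headers */
	#endif

	int main(void)
	{
		uint64_t ids[256];
		const size_t batch = sizeof(ids) / sizeof(ids[0]);
		struct mnt_id_req req = {
			.size = MNT_ID_REQ_SIZE_VER1,
			.mnt_id = LSMT_ROOT,	/* whole namespace, not one subtree */
		};
		long n;

		/* newest mounts first, one page at a time */
		while ((n = syscall(__NR_listmount, &req, ids, batch,
				    LISTMOUNT_REVERSE)) > 0) {
			for (long i = 0; i < n; i++)
				printf("%llu\n", (unsigned long long)ids[i]);
			req.param = ids[n - 1]; /* resume below the last id seen */
			if ((size_t)n < batch)
				break;
		}
		return n < 0 ? 1 : 0;
	}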
-
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
@@ -5157,6 +5443,8 @@ static void __init init_mount_tree(void)
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
+
+ mnt_ns_tree_add(ns);
}
void __init mnt_init(void)
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index a6bb03bea920..4c0401dbbfcf 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -117,7 +117,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
- _debug("no unlock");
+ kdebug("no unlock");
else
folio_unlock(folio);
}
@@ -204,7 +204,7 @@ void netfs_readahead(struct readahead_control *ractl)
struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
int ret;
- _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
+ kenter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
if (readahead_count(ractl) == 0)
return;
@@ -268,7 +268,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
struct folio *sink = NULL;
int ret;
- _enter("%lx", folio->index);
+ kenter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
@@ -508,7 +508,7 @@ retry:
have_folio:
*_folio = folio;
- _leave(" = 0");
+ kleave(" = 0");
return 0;
error_put:
@@ -518,7 +518,7 @@ error:
folio_unlock(folio);
folio_put(folio);
}
- _leave(" = %d", ret);
+ kleave(" = %d", ret);
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
@@ -536,7 +536,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t flen = folio_size(folio);
int ret;
- _enter("%zx @%llx", flen, start);
+ kenter("%zx @%llx", flen, start);
ret = -ENOMEM;
@@ -567,7 +567,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
- _leave(" = %d", ret);
+ kleave(" = %d", ret);
return ret;
}
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 1121601536d1..ecbc99ec7d36 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -56,7 +56,7 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
struct netfs_group *group = netfs_folio_group(folio);
loff_t pos = folio_file_pos(folio);
- _enter("");
+ kenter("");
if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
return NETFS_FLUSH_CONTENT;
@@ -181,7 +181,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
struct folio *folio, *writethrough = NULL;
enum netfs_how_to_modify howto;
enum netfs_folio_trace trace;
- unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
ssize_t written = 0, ret, ret2;
loff_t i_size, pos = iocb->ki_pos, from, to;
size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
@@ -272,12 +272,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
*/
howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
flen, offset, part, maybe_trouble);
- _debug("howto %u", howto);
+ kdebug("howto %u", howto);
switch (howto) {
case NETFS_JUST_PREFETCH:
ret = netfs_prefetch_for_write(file, folio, offset, part);
if (ret < 0) {
- _debug("prefetch = %zd", ret);
+ kdebug("prefetch = %zd", ret);
goto error_folio_unlock;
}
break;
@@ -418,7 +418,7 @@ out:
}
iocb->ki_pos += written;
- _leave(" = %zd [%zd]", written, ret);
+ kleave(" = %zd [%zd]", written, ret);
return written ? written : ret;
error_folio_unlock:
@@ -491,7 +491,7 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct netfs_inode *ictx = netfs_inode(inode);
ssize_t ret;
- _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+ kenter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
@@ -523,17 +523,23 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
struct netfs_group *group;
struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file);
struct netfs_inode *ictx = netfs_inode(inode);
vm_fault_t ret = VM_FAULT_RETRY;
int err;
- _enter("%lx", folio->index);
+ kenter("%lx", folio->index);
sb_start_pagefault(inode->i_sb);
if (folio_lock_killable(folio) < 0)
goto out;
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
if (folio_wait_writeback_killable(folio)) {
ret = VM_FAULT_LOCKED;
@@ -549,9 +555,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
group = netfs_folio_group(folio);
if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
- err = filemap_fdatawait_range(inode->i_mapping,
- folio_pos(folio),
- folio_pos(folio) + folio_size(folio));
+ err = filemap_fdatawrite_range(mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
switch (err) {
case 0:
ret = VM_FAULT_RETRY;
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 10a1e4da6bda..b6debac6205f 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -33,7 +33,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
size_t orig_count = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
- _enter("");
+ kenter("");
if (!orig_count)
return 0; /* Don't update atime */
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index f516460e994e..792ef17bae21 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -12,7 +12,7 @@
static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
{
struct inode *inode = wreq->inode;
- unsigned long long end = wreq->start + wreq->len;
+ unsigned long long end = wreq->start + wreq->transferred;
if (!wreq->error &&
i_size_read(inode) < end) {
@@ -37,7 +37,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
size_t len = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
- _enter("");
+ kenter("");
/* We're going to need a bounce buffer if what we transmit is going to
* be different in some way to the source buffer, e.g. because it gets
@@ -45,7 +45,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
*/
// TODO
- _debug("uw %llx-%llx", start, end);
+ kdebug("uw %llx-%llx", start, end);
wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
iocb->ki_flags & IOCB_DIRECT ?
@@ -92,10 +92,11 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
if (async)
wreq->iocb = iocb;
+ wreq->len = iov_iter_count(&wreq->io_iter);
wreq->cleanup = netfs_cleanup_dio_write;
- ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter));
+ ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
if (ret < 0) {
- _debug("begin = %zd", ret);
+ kdebug("begin = %zd", ret);
goto out;
}
@@ -142,7 +143,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
loff_t pos = iocb->ki_pos;
unsigned long long end = pos + iov_iter_count(from) - 1;
- _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
+ kenter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c
index 9397ed39b0b4..288a73c3072d 100644
--- a/fs/netfs/fscache_cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -237,7 +237,7 @@ int fscache_add_cache(struct fscache_cache *cache,
{
int n_accesses;
- _enter("{%s,%s}", ops->name, cache->name);
+ kenter("{%s,%s}", ops->name, cache->name);
BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING);
@@ -257,7 +257,7 @@ int fscache_add_cache(struct fscache_cache *cache,
up_write(&fscache_addremove_sem);
pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name);
- _leave(" = 0 [%s]", cache->name);
+ kleave(" = 0 [%s]", cache->name);
return 0;
}
EXPORT_SYMBOL(fscache_add_cache);
diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..4d1e8bf4c615 100644
--- a/fs/netfs/fscache_cookie.c
+++ b/fs/netfs/fscache_cookie.c
@@ -456,7 +456,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
{
struct fscache_cookie *cookie;
- _enter("V=%x", volume->debug_id);
+ kenter("V=%x", volume->debug_id);
if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255)
return NULL;
@@ -484,7 +484,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
trace_fscache_acquire(cookie);
fscache_stat(&fscache_n_acquires_ok);
- _leave(" = c=%08x", cookie->debug_id);
+ kleave(" = c=%08x", cookie->debug_id);
return cookie;
}
EXPORT_SYMBOL(__fscache_acquire_cookie);
@@ -505,7 +505,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie)
enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed;
bool need_withdraw = false;
- _enter("");
+ kenter("");
if (!cookie->volume->cache_priv) {
fscache_create_volume(cookie->volume, true);
@@ -519,7 +519,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie)
if (cookie->state != FSCACHE_COOKIE_STATE_FAILED)
fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT);
need_withdraw = true;
- _leave(" [fail]");
+ kleave(" [fail]");
goto out;
}
@@ -572,7 +572,7 @@ void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify)
bool queue = false;
int n_active;
- _enter("c=%08x", cookie->debug_id);
+ kenter("c=%08x", cookie->debug_id);
if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags),
"Trying to use relinquished cookie\n"))
@@ -636,7 +636,7 @@ again:
spin_unlock(&cookie->lock);
if (queue)
fscache_queue_cookie(cookie, fscache_cookie_get_use_work);
- _leave("");
+ kleave("");
}
EXPORT_SYMBOL(__fscache_use_cookie);
@@ -702,7 +702,7 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie)
enum fscache_cookie_state state;
bool wake = false;
- _enter("c=%x", cookie->debug_id);
+ kenter("c=%x", cookie->debug_id);
again:
spin_lock(&cookie->lock);
@@ -820,7 +820,7 @@ out:
spin_unlock(&cookie->lock);
if (wake)
wake_up_cookie_state(cookie);
- _leave("");
+ kleave("");
}
static void fscache_cookie_worker(struct work_struct *work)
@@ -867,7 +867,7 @@ static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie)
set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags);
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_cookies_lru_expired);
- _debug("lru c=%x", cookie->debug_id);
+ kdebug("lru c=%x", cookie->debug_id);
__fscache_withdraw_cookie(cookie);
}
@@ -971,7 +971,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
if (retire)
fscache_stat(&fscache_n_relinquishes_retire);
- _enter("c=%08x{%d},%d",
+ kenter("c=%08x{%d},%d",
cookie->debug_id, atomic_read(&cookie->n_active), retire);
if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags),
@@ -1050,7 +1050,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
{
bool is_caching;
- _enter("c=%x", cookie->debug_id);
+ kenter("c=%x", cookie->debug_id);
fscache_stat(&fscache_n_invalidates);
@@ -1072,7 +1072,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */
default:
spin_unlock(&cookie->lock);
- _leave(" [no %u]", cookie->state);
+ kleave(" [no %u]", cookie->state);
return;
case FSCACHE_COOKIE_STATE_LOOKING_UP:
@@ -1081,7 +1081,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
fallthrough;
case FSCACHE_COOKIE_STATE_CREATING:
spin_unlock(&cookie->lock);
- _leave(" [look %x]", cookie->inval_counter);
+ kleave(" [look %x]", cookie->inval_counter);
return;
case FSCACHE_COOKIE_STATE_ACTIVE:
@@ -1094,7 +1094,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
if (is_caching)
fscache_queue_cookie(cookie, fscache_cookie_get_inval_work);
- _leave(" [inv]");
+ kleave(" [inv]");
return;
}
}
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 38637e5c9b57..bf4eaeec44fb 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -28,12 +28,12 @@ bool fscache_wait_for_operation(struct netfs_cache_resources *cres,
again:
if (!fscache_cache_is_live(cookie->volume->cache)) {
- _leave(" [broken]");
+ kleave(" [broken]");
return false;
}
state = fscache_cookie_state(cookie);
- _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
+ kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
switch (state) {
case FSCACHE_COOKIE_STATE_CREATING:
@@ -52,7 +52,7 @@ again:
case FSCACHE_COOKIE_STATE_DROPPED:
case FSCACHE_COOKIE_STATE_RELINQUISHING:
default:
- _leave(" [not live]");
+ kleave(" [not live]");
return false;
}
@@ -92,7 +92,7 @@ again:
spin_lock(&cookie->lock);
state = fscache_cookie_state(cookie);
- _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
+ kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
switch (state) {
case FSCACHE_COOKIE_STATE_LOOKING_UP:
@@ -140,7 +140,7 @@ failed:
cres->cache_priv = NULL;
cres->ops = NULL;
fscache_end_cookie_access(cookie, fscache_access_io_not_live);
- _leave(" = -ENOBUFS");
+ kleave(" = -ENOBUFS");
return -ENOBUFS;
}
@@ -224,7 +224,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
if (len == 0)
goto abandon;
- _enter("%llx,%zx", start, len);
+ kenter("%llx,%zx", start, len);
wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS);
if (!wreq)
diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c
index 42e98bb523e3..bf9b33d26e31 100644
--- a/fs/netfs/fscache_main.c
+++ b/fs/netfs/fscache_main.c
@@ -99,7 +99,7 @@ error_wq:
*/
void __exit fscache_exit(void)
{
- _enter("");
+ kenter("");
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9de..2e2a405ca9b0 100644
--- a/fs/netfs/fscache_volume.c
+++ b/fs/netfs/fscache_volume.c
@@ -27,6 +27,19 @@ struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
return volume;
}
+struct fscache_volume *fscache_try_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where)
+{
+ int ref;
+
+ if (!__refcount_inc_not_zero(&volume->ref, &ref))
+ return NULL;
+
+ trace_fscache_volume(volume->debug_id, ref + 1, where);
+ return volume;
+}
+EXPORT_SYMBOL(fscache_try_get_volume);
+
static void fscache_see_volume(struct fscache_volume *volume,
enum fscache_volume_trace where)
{
@@ -251,7 +264,7 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key,
fscache_see_volume(volume, fscache_volume_new_acquire);
fscache_stat(&fscache_n_volumes);
up_write(&fscache_addremove_sem);
- _leave(" = v=%x", volume->debug_id);
+ kleave(" = v=%x", volume->debug_id);
return volume;
err_vol:
@@ -420,6 +433,7 @@ void fscache_put_volume(struct fscache_volume *volume,
fscache_free_volume(volume);
}
}
+EXPORT_SYMBOL(fscache_put_volume);
/*
* Relinquish a volume representation cookie.
@@ -452,7 +466,7 @@ void fscache_withdraw_volume(struct fscache_volume *volume)
{
int n_accesses;
- _debug("withdraw V=%x", volume->debug_id);
+ kdebug("withdraw V=%x", volume->debug_id);
/* Allow wakeups on dec-to-0 */
n_accesses = atomic_dec_return(&volume->n_accesses);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 95e281a8af78..21e46bc9aa49 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -34,7 +34,6 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
/*
* main.c
*/
-extern unsigned int netfs_debug;
extern struct list_head netfs_io_requests;
extern spinlock_t netfs_proc_lock;
extern mempool_t netfs_request_pool;
@@ -63,15 +62,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
-#define NETFS_FLAG_PUT_MARK BIT(0)
-#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask);
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask);
-void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
@@ -353,8 +343,6 @@ extern const struct seq_operations fscache_volumes_seq_ops;
struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
enum fscache_volume_trace where);
-void fscache_put_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
bool fscache_begin_volume_access(struct fscache_volume *volume,
struct fscache_cookie *cookie,
enum fscache_access_trace why);
@@ -365,42 +353,12 @@ void fscache_create_volume(struct fscache_volume *volume, bool wait);
* debug tracing
*/
#define dbgprintk(FMT, ...) \
- printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+ pr_debug("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-#ifdef __KDEBUG
-#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
-#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
-#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
-
-#elif defined(CONFIG_NETFS_DEBUG)
-#define _enter(FMT, ...) \
-do { \
- if (netfs_debug) \
- kenter(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _leave(FMT, ...) \
-do { \
- if (netfs_debug) \
- kleave(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _debug(FMT, ...) \
-do { \
- if (netfs_debug) \
- kdebug(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#else
-#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-#endif
-
/*
* assertions
*/
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index c93851b98368..c7576481c321 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -130,7 +130,7 @@ static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
if (count == remaining)
return;
- _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+ kdebug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
rreq->debug_id, subreq->debug_index,
iov_iter_count(&subreq->io_iter), subreq->transferred,
subreq->len, rreq->i_size,
@@ -326,7 +326,7 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
int u;
- _enter("R=%x[%x]{%llx,%lx},%zd",
+ kenter("R=%x[%x]{%llx,%lx},%zd",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->flags, transferred_or_error);
@@ -435,7 +435,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
struct netfs_inode *ictx = netfs_inode(rreq->inode);
size_t lsize;
- _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
+ kenter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
if (rreq->origin != NETFS_DIO_READ) {
source = netfs_cache_prepare_read(subreq, rreq->i_size);
@@ -518,7 +518,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
subreq->start = rreq->start + rreq->submitted;
subreq->len = io_iter->count;
- _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
+ kdebug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
/* Call out to the cache to find out what it can do with the remaining
@@ -570,7 +570,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
struct iov_iter io_iter;
int ret;
- _enter("R=%x %llx-%llx",
+ kenter("R=%x %llx-%llx",
rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
if (rreq->len == 0) {
@@ -593,7 +593,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
atomic_set(&rreq->nr_outstanding, 1);
io_iter = rreq->io_iter;
do {
- _debug("submit %llx + %llx >= %llx",
+ kdebug("submit %llx + %llx >= %llx",
rreq->start, rreq->submitted, rreq->i_size);
if (rreq->origin == NETFS_DIO_READ &&
rreq->start + rreq->submitted >= rreq->i_size)
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5f0f438e5d21..db824c372842 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -20,10 +20,6 @@ MODULE_LICENSE("GPL");
EXPORT_TRACEPOINT_SYMBOL(netfs_sreq);
-unsigned netfs_debug;
-module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
-
static struct kmem_cache *netfs_request_slab;
static struct kmem_cache *netfs_subrequest_slab;
mempool_t netfs_request_pool;
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index bc1fc54fb724..172808e83ca8 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -8,87 +8,6 @@
#include <linux/swap.h>
#include "internal.h"
-/*
- * Attach a folio to the buffer and maybe set marks on it to say that we need
- * to put the folio later and twiddle the pagecache flags.
- */
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask)
-{
- XA_STATE_ORDER(xas, xa, index, folio_order(folio));
-
-retry:
- xas_lock(&xas);
- for (;;) {
- xas_store(&xas, folio);
- if (!xas_error(&xas))
- break;
- xas_unlock(&xas);
- if (!xas_nomem(&xas, gfp_mask))
- return xas_error(&xas);
- goto retry;
- }
-
- if (flags & NETFS_FLAG_PUT_MARK)
- xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
- if (flags & NETFS_FLAG_PAGECACHE_MARK)
- xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
- xas_unlock(&xas);
- return xas_error(&xas);
-}
-
-/*
- * Create the specified range of folios in the buffer attached to the read
- * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
- * these need freeing later.
- */
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask)
-{
- struct folio *folio;
- int ret;
-
- if (to + 1 == index) /* Page range is inclusive */
- return 0;
-
- do {
- /* TODO: Figure out what order folio can be allocated here */
- folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
- if (!folio)
- return -ENOMEM;
- folio->index = index;
- ret = netfs_xa_store_and_mark(buffer, index, folio,
- NETFS_FLAG_PUT_MARK, gfp_mask);
- if (ret < 0) {
- folio_put(folio);
- return ret;
- }
-
- index += folio_nr_pages(folio);
- } while (index <= to && index != 0);
-
- return 0;
-}
-
-/*
- * Clear an xarray buffer, putting a ref on the folios that have
- * NETFS_BUF_PUT_MARK set.
- */
-void netfs_clear_buffer(struct xarray *buffer)
-{
- struct folio *folio;
- XA_STATE(xas, buffer, 0);
-
- rcu_read_lock();
- xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
- folio_put(folio);
- }
- rcu_read_unlock();
- xa_destroy(buffer);
-}
-
/**
* netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
* @mapping: The mapping the folio belongs to.
@@ -107,7 +26,7 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
struct fscache_cookie *cookie = netfs_i_cookie(ictx);
bool need_use = false;
- _enter("");
+ kenter("");
if (!filemap_dirty_folio(mapping, folio))
return false;
@@ -180,7 +99,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
struct netfs_folio *finfo;
size_t flen = folio_size(folio);
- _enter("{%lx},%zx,%zx", folio->index, offset, length);
+ kenter("{%lx},%zx,%zx", folio->index, offset, length);
if (!folio_test_private(folio))
return;
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index c90d482b1650..f4a642727479 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ atomic_inc(&ctx->io_count);
trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
@@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
@@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
+
+ if (atomic_dec_and_test(&ictx->io_count))
+ wake_up_var(&ictx->io_count);
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 60112e4b2c5e..488147439fe0 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -161,7 +161,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
{
struct list_head *next;
- _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
+ kenter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
if (list_empty(&stream->subrequests))
return;
@@ -374,7 +374,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
unsigned int notes;
int s;
- _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+ kenter("%llx-%llx", wreq->start, wreq->start + wreq->len);
trace_netfs_collect(wreq);
trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
@@ -409,7 +409,7 @@ reassess_streams:
front = stream->front;
while (front) {
trace_netfs_collect_sreq(wreq, front);
- //_debug("sreq [%x] %llx %zx/%zx",
+ //kdebug("sreq [%x] %llx %zx/%zx",
// front->debug_index, front->start, front->transferred, front->len);
/* Stall if there may be a discontinuity. */
@@ -510,7 +510,7 @@ reassess_streams:
* stream has a gap that can be jumped.
*/
if (notes & SOME_EMPTY) {
- unsigned long long jump_to = wreq->start + wreq->len;
+ unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted);
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
@@ -598,7 +598,7 @@ reassess_streams:
out:
netfs_put_group_many(wreq->group, wreq->nr_group_rel);
wreq->nr_group_rel = 0;
- _leave(" = %x", notes);
+ kleave(" = %x", notes);
return;
need_retry:
@@ -606,7 +606,7 @@ need_retry:
* that any partially completed op will have had any wholly transferred
* folios removed from it.
*/
- _debug("retry");
+ kdebug("retry");
netfs_retry_writes(wreq);
goto out;
}
@@ -621,7 +621,7 @@ void netfs_write_collection_worker(struct work_struct *work)
size_t transferred;
int s;
- _enter("R=%x", wreq->debug_id);
+ kenter("R=%x", wreq->debug_id);
netfs_see_request(wreq, netfs_rreq_trace_see_work);
if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
@@ -684,16 +684,17 @@ void netfs_write_collection_worker(struct work_struct *work)
if (wreq->origin == NETFS_DIO_WRITE)
inode_dio_end(wreq->inode);
- _debug("finished");
+ kdebug("finished");
trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
if (wreq->iocb) {
- wreq->iocb->ki_pos += wreq->transferred;
+ size_t written = min(wreq->transferred, wreq->len);
+ wreq->iocb->ki_pos += written;
if (wreq->iocb->ki_complete)
wreq->iocb->ki_complete(
- wreq->iocb, wreq->error ? wreq->error : wreq->transferred);
+ wreq->iocb, wreq->error ? wreq->error : written);
wreq->iocb = VFS_PTR_POISON;
}
@@ -743,7 +744,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
struct netfs_io_request *wreq = subreq->rreq;
struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
- _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+ kenter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
switch (subreq->source) {
case NETFS_UPLOAD_TO_SERVER:
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index e190043bc0da..d7c971df8866 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -99,7 +99,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
if (IS_ERR(wreq))
return wreq;
- _enter("R=%x", wreq->debug_id);
+ kenter("R=%x", wreq->debug_id);
ictx = netfs_inode(wreq->inode);
if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
@@ -159,7 +159,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
subreq->max_nr_segs = INT_MAX;
subreq->stream_nr = stream->stream_nr;
- _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+ kenter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
@@ -215,7 +215,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream,
{
struct netfs_io_request *wreq = subreq->rreq;
- _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+ kenter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
return netfs_write_subrequest_terminated(subreq, subreq->error, false);
@@ -254,7 +254,7 @@ static void netfs_issue_write(struct netfs_io_request *wreq,
stream->construct = NULL;
if (subreq->start + subreq->len > wreq->start + wreq->submitted)
- wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+ WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start);
netfs_do_issue_write(stream, subreq);
}
@@ -272,11 +272,11 @@ int netfs_advance_write(struct netfs_io_request *wreq,
size_t part;
if (!stream->avail) {
- _leave("no write");
+ kleave("no write");
return len;
}
- _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+ kenter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
if (subreq && start != subreq->start + subreq->len) {
netfs_issue_write(wreq, stream);
@@ -288,7 +288,7 @@ int netfs_advance_write(struct netfs_io_request *wreq,
subreq = stream->construct;
part = min(subreq->max_len - subreq->len, len);
- _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+ kdebug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
subreq->len += part;
subreq->nr_segs++;
@@ -319,7 +319,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
bool to_eof = false, streamw = false;
bool debug = false;
- _enter("");
+ kenter("");
/* netfs_perform_write() may shift i_size around the page or from out
* of the page to beyond it, but cannot move i_size into or through the
@@ -329,7 +329,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (fpos >= i_size) {
/* mmap beyond eof. */
- _debug("beyond eof");
+ kdebug("beyond eof");
folio_start_writeback(folio);
folio_unlock(folio);
wreq->nr_group_rel += netfs_folio_written_back(folio);
@@ -363,7 +363,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
}
flen -= foff;
- _debug("folio %zx %zx %zx", foff, flen, fsize);
+ kdebug("folio %zx %zx %zx", foff, flen, fsize);
/* Deal with discontinuities in the stream of dirty pages. These can
* arise from a number of sources:
@@ -483,11 +483,11 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (!debug)
kdebug("R=%x: No submit", wreq->debug_id);
- if (flen < fsize)
+ if (foff + flen < fsize)
for (int s = 0; s < NR_IO_STREAMS; s++)
netfs_issue_write(wreq, &wreq->io_streams[s]);
- _leave(" = 0");
+ kleave(" = 0");
return 0;
}
@@ -522,7 +522,7 @@ int netfs_writepages(struct address_space *mapping,
netfs_stat(&netfs_n_wh_writepages);
do {
- _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+ kdebug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
/* It appears we don't have to handle cyclic writeback wrapping. */
WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
@@ -546,14 +546,14 @@ int netfs_writepages(struct address_space *mapping,
mutex_unlock(&ictx->wb_lock);
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- _leave(" = %d", error);
+ kleave(" = %d", error);
return error;
couldnt_start:
netfs_kill_dirty_pages(mapping, wbc, folio);
out:
mutex_unlock(&ictx->wb_lock);
- _leave(" = %d", error);
+ kleave(" = %d", error);
return error;
}
EXPORT_SYMBOL(netfs_writepages);
@@ -590,7 +590,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
struct folio *folio, size_t copied, bool to_page_end,
struct folio **writethrough_cache)
{
- _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+ kenter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
if (!*writethrough_cache) {
@@ -624,7 +624,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
struct netfs_inode *ictx = netfs_inode(wreq->inode);
int ret;
- _enter("R=%x", wreq->debug_id);
+ kenter("R=%x", wreq->debug_id);
if (writethrough_cache)
netfs_write_folio(wreq, wbc, writethrough_cache);
@@ -636,7 +636,12 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
mutex_unlock(&ictx->wb_lock);
- ret = wreq->error;
+ if (wreq->iocb) {
+ ret = -EIOCBQUEUED;
+ } else {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
return ret;
}
@@ -652,7 +657,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
loff_t start = wreq->start;
int error = 0;
- _enter("%zx", len);
+ kenter("%zx", len);
if (wreq->origin == NETFS_DIO_WRITE)
inode_dio_begin(wreq->inode);
@@ -660,7 +665,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
while (len) {
// TODO: Prepare content encryption
- _debug("unbuffered %zx", len);
+ kdebug("unbuffered %zx", len);
part = netfs_advance_write(wreq, upload, start, len, false);
start += part;
len -= part;
@@ -679,6 +684,6 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
if (list_empty(&upload->subrequests))
netfs_wake_write_collector(wreq, false);
- _leave(" = %d", error);
+ kleave(" = %d", error);
return error;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 342930996226..07a7be27182e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1627,7 +1627,16 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
switch (error) {
case 1:
break;
- case 0:
+ case -ETIMEDOUT:
+ if (inode && (IS_ROOT(dentry) ||
+ NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL))
+ error = 1;
+ break;
+ case -ESTALE:
+ case -ENOENT:
+ error = 0;
+ fallthrough;
+ default:
/*
* We can't d_drop the root of a disconnected tree:
* its d_hash is on the s_anon list and d_drop() would hide
@@ -1682,18 +1691,8 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir,
dir_verifier = nfs_save_change_attribute(dir);
ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
- if (ret < 0) {
- switch (ret) {
- case -ESTALE:
- case -ENOENT:
- ret = 0;
- break;
- case -ETIMEDOUT:
- if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
- ret = 1;
- }
+ if (ret < 0)
goto out;
- }
/* Request help from readdirplus */
nfs_lookup_advise_force_readdirplus(dir, flags);
@@ -1737,7 +1736,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct inode *inode;
- int error;
+ int error = 0;
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = d_inode(dentry);
@@ -1782,7 +1781,7 @@ out_valid:
out_bad:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+ return nfs_lookup_revalidate_done(dir, dentry, inode, error);
}
static int
@@ -1804,9 +1803,10 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
} else {
- /* Wait for unlink to complete */
+ /* Wait for unlink to complete - see unblock_revalidate() */
wait_var_event(&dentry->d_fsdata,
- dentry->d_fsdata != NFS_FSDATA_BLOCKED);
+ smp_load_acquire(&dentry->d_fsdata)
+ != NFS_FSDATA_BLOCKED);
parent = dget_parent(dentry);
ret = reval(d_inode(parent), dentry, flags);
dput(parent);
@@ -1819,6 +1819,29 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
}
+static void block_revalidate(struct dentry *dentry)
+{
+ /* old devname - just in case */
+ kfree(dentry->d_fsdata);
+
+ /* Any new reference that could lead to an open
+ * will take ->d_lock in lookup_open() -> d_lookup().
+ * Holding this lock ensures we cannot race with
+ * __nfs_lookup_revalidate() and removes any need
+ * for further barriers.
+ */
+ lockdep_assert_held(&dentry->d_lock);
+
+ dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+}
+
+static void unblock_revalidate(struct dentry *dentry)
+{
+ /* store_release ensures wait_var_event() sees the update */
+ smp_store_release(&dentry->d_fsdata, NULL);
+ wake_up_var(&dentry->d_fsdata);
+}
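The store_release in unblock_revalidate() pairs with the smp_load_acquire() added to the wait_var_event() condition in __nfs_lookup_revalidate(), guaranteeing that everything written while the dentry was blocked is visible to the woken waiter. A minimal userspace analogue of that acquire/release handshake in C11 atomics (hypothetical names; a spin loop stands in for wait_var_event()/wake_up_var()):

	#include <stdatomic.h>
	#include <stdbool.h>

	static int payload;			/* plain data written while blocked */
	static atomic_bool blocked = true;	/* stands in for NFS_FSDATA_BLOCKED */

	static void unblock(void)
	{
		payload = 42;	/* plain stores before the release... */
		/* ...are visible to anyone who observes blocked == false */
		atomic_store_explicit(&blocked, false, memory_order_release);
	}

	static int wait_for_unblock(void)
	{
		/* acquire load pairs with the release store above */
		while (atomic_load_explicit(&blocked, memory_order_acquire))
			;	/* a real waiter sleeps here instead of spinning */
		return payload;	/* guaranteed to read 42, never a stale value */
	}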
+
/*
* A weaker form of d_revalidate for revalidating just the d_inode(dentry)
* when we don't really care about the dentry name. This is called when a
@@ -2255,6 +2278,9 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
*/
int error = 0;
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+ return -ENAMETOOLONG;
+
if (open_flags & O_CREAT) {
file->f_mode |= FMODE_CREATED;
error = nfs_do_create(dir, dentry, mode, open_flags);
@@ -2549,15 +2575,12 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
goto out;
}
- /* old devname */
- kfree(dentry->d_fsdata);
- dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ block_revalidate(dentry);
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
nfs_dentry_remove_handle_error(dir, dentry, error);
- dentry->d_fsdata = NULL;
- wake_up_var(&dentry->d_fsdata);
+ unblock_revalidate(dentry);
out:
trace_nfs_unlink_exit(dir, dentry, error);
return error;
@@ -2664,8 +2687,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
{
struct dentry *new_dentry = data->new_dentry;
- new_dentry->d_fsdata = NULL;
- wake_up_var(&new_dentry->d_fsdata);
+ unblock_revalidate(new_dentry);
}
/*
@@ -2727,11 +2749,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
goto out;
- if (new_dentry->d_fsdata) {
- /* old devname */
- kfree(new_dentry->d_fsdata);
- new_dentry->d_fsdata = NULL;
- }
spin_lock(&new_dentry->d_lock);
if (d_count(new_dentry) > 2) {
@@ -2753,7 +2770,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
new_dentry = dentry;
new_inode = NULL;
} else {
- new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ block_revalidate(new_dentry);
must_unblock = true;
spin_unlock(&new_dentry->d_lock);
}
@@ -2765,6 +2782,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
must_unblock ? nfs_unblock_rename : NULL);
if (IS_ERR(task)) {
+ if (must_unblock)
+ unblock_revalidate(new_dentry);
error = PTR_ERR(task);
goto out;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index bb2f583eb28b..90079ca134dd 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -141,8 +141,6 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = nfs_file_direct_read(iocb, iter, true);
else
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c93c12063b3a..a691fa10b3e9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4023,6 +4023,23 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
}
}
+static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1,
+ struct nfs4_pathname *path2)
+{
+ int i;
+
+ if (path1->ncomponents != path2->ncomponents)
+ return false;
+ for (i = 0; i < path1->ncomponents; i++) {
+ if (path1->components[i].len != path2->components[i].len)
+ return false;
+ if (memcmp(path1->components[i].data, path2->components[i].data,
+ path1->components[i].len))
+ return false;
+ }
+ return true;
+}
+
static int _nfs4_discover_trunking(struct nfs_server *server,
struct nfs_fh *fhandle)
{
@@ -4056,9 +4073,13 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
if (status)
goto out_free_3;
- for (i = 0; i < locations->nlocations; i++)
+ for (i = 0; i < locations->nlocations; i++) {
+ if (!_is_same_nfs4_pathname(&locations->fs_path,
+ &locations->locations[i].rootpath))
+ continue;
test_fs_location_for_trunking(&locations->locations[i], clp,
server);
+ }
out_free_3:
kfree(locations->fattr);
out_free_2:
@@ -6268,6 +6289,7 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
if (status == 0)
nfs_setsecurity(inode, fattr);
+ nfs_free_fattr(fattr);
return status;
}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6efb5068c116..040b6b79c75e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1545,6 +1545,11 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
continue;
} else if (index == prev->wb_index + 1)
continue;
+ /*
+ * We will submit more requests after these. Indicate
+ * this to the underlying layers.
+ */
+ desc->pg_moreio = 1;
nfs_pageio_complete(desc);
break;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a142287d86f6..cca80b5f54e0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -122,8 +122,6 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
{
struct folio *folio = nfs_page_to_folio(req);
- if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
- folio_set_error(folio);
if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
if (nfs_netfs_folio_unlock(folio))
folio_unlock(folio);
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 0e27a2e4e68b..1c62a5a9f51d 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -32,16 +32,8 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio)
int error;
error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE);
- if (error < 0)
- goto error;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
-
-error:
- folio_set_error(folio);
- folio_unlock(folio);
- return -EIO;
+ folio_end_read(folio, error == 0);
+ return error;
}
static const char *nfs_get_link(struct dentry *dentry,
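The nfs_symlink_filler() conversion above (and the orangefs_read_folio() change later in this series) lean on folio_end_read(), which folds the mark-uptodate/unlock pair into a single call. A minimal sketch of the idiom, assuming a hypothetical fill_folio_contents() helper in place of the real read path:

static int example_read_folio(struct file *file, struct folio *folio)
{
	/* fill_folio_contents() is hypothetical, standing in for the
	 * filesystem's actual read path (->readlink() above). */
	int error = fill_folio_contents(file, folio);

	/*
	 * folio_end_read() marks the folio uptodate when its second
	 * argument is true, and unlocks the folio in either case.
	 */
	folio_end_read(folio, error == 0);
	return error;
}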
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2329cbb0e446..a91463ab87a0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -311,7 +311,6 @@ static void nfs_mapping_set_error(struct folio *folio, int error)
{
struct address_space *mapping = folio_file_mapping(folio);
- folio_set_error(folio);
filemap_set_wb_err(mapping, error);
if (mapping->host)
errseq_set(&mapping->host->i_sb->s_wb_err,
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index 62d2586d9902..529a75ecf22e 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -44,9 +44,7 @@ static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_AD
static const struct genl_split_ops nfsd_nl_ops[] = {
{
.cmd = NFSD_CMD_RPC_STATUS_GET,
- .start = nfsd_nl_rpc_status_get_start,
.dumpit = nfsd_nl_rpc_status_get_dumpit,
- .done = nfsd_nl_rpc_status_get_done,
.flags = GENL_CMD_CAP_DUMP,
},
{
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index e3724637d64d..2e132ef328f8 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -15,9 +15,6 @@
extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1];
extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1];
-int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
-int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
-
int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 202140df8f82..c848ebe5d08f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1460,28 +1460,6 @@ static int create_proc_exports_entry(void)
unsigned int nfsd_net_id;
-/**
- * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit
- * @cb: netlink metadata and command arguments
- *
- * Return values:
- * %0: The rpc_status_get command may proceed
- * %-ENODEV: There is no NFSD running in this namespace
- */
-int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb)
-{
- struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
- int ret = -ENODEV;
-
- mutex_lock(&nfsd_mutex);
- if (nn->nfsd_serv)
- ret = 0;
- else
- mutex_unlock(&nfsd_mutex);
-
- return ret;
-}
-
static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
struct netlink_callback *cb,
struct nfsd_genl_rqstp *rqstp)
@@ -1558,8 +1536,16 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
int i, ret, rqstp_index = 0;
+ struct nfsd_net *nn;
+
+ mutex_lock(&nfsd_mutex);
+
+ nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+ if (!nn->nfsd_serv) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
rcu_read_lock();
@@ -1636,22 +1622,10 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
ret = skb->len;
out:
rcu_read_unlock();
-
- return ret;
-}
-
-/**
- * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing
- * @cb: netlink metadata and command arguments
- *
- * Return values:
- * %0: Success
- */
-int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
-{
+out_unlock:
mutex_unlock(&nfsd_mutex);
- return 0;
+ return ret;
}
/**
@@ -2195,6 +2169,8 @@ static __net_init int nfsd_net_init(struct net *net)
nn->nfsd_svcstats.program = &nfsd_program;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
+ nn->nfsd_info.mutex = &nfsd_mutex;
+ nn->nfsd_serv = NULL;
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 0b75305fb5f5..dd4e11a703aa 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -247,7 +247,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
dentry = dget(exp->ex_path.dentry);
else {
dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid,
- data_left, fileid_type,
+ data_left, fileid_type, 0,
nfsd_acceptable, exp);
if (IS_ERR_OR_NULL(dentry)) {
trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cd9a6a1a9fc8..89d7918de7b1 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -672,7 +672,6 @@ int nfsd_create_serv(struct net *net)
return error;
}
spin_lock(&nfsd_notifier_lock);
- nn->nfsd_info.mutex = &nfsd_mutex;
nn->nfsd_serv = serv;
spin_unlock(&nfsd_notifier_lock);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 89caef7513db..ba50388ee4bf 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
* @target: offset number of an entry in the group (start point)
* @bsize: size in bits
* @lock: spin lock protecting @bitmap
+ * @wrap: whether to wrap around
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
unsigned int bsize,
- spinlock_t *lock)
+ spinlock_t *lock, bool wrap)
{
int pos, end = bsize;
@@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
end = target;
}
+ if (!wrap)
+ return -ENOSPC;
/* wrap around */
for (pos = 0; pos < end; pos++) {
@@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
+ * @wrap: whether to wrap around
*/
int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
- struct nilfs_palloc_req *req)
+ struct nilfs_palloc_req *req, bool wrap)
{
struct buffer_head *desc_bh, *bitmap_bh;
struct nilfs_palloc_group_desc *desc;
@@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
entries_per_group = nilfs_palloc_entries_per_group(inode);
for (i = 0; i < ngroups; i += n) {
- if (group >= ngroups) {
+ if (group >= ngroups && wrap) {
/* wrap around */
group = 0;
maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
@@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- bitmap, group_offset, entries_per_group, lock);
+ bitmap, group_offset, entries_per_group, lock,
+ wrap);
+ /*
+ * Since the search for a free slot in the second and
+ * subsequent bitmap blocks always starts from the
+ * beginning, the wrap flag only has an effect on the
+ * first search.
+ */
kunmap_local(bitmap_kaddr);
if (pos >= 0)
goto found;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index b667e869ac07..d825a9faca6d 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -50,8 +50,8 @@ struct nilfs_palloc_req {
struct buffer_head *pr_entry_bh;
};
-int nilfs_palloc_prepare_alloc_entry(struct inode *,
- struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+ struct nilfs_palloc_req *req, bool wrap);
void nilfs_palloc_commit_alloc_entry(struct inode *,
struct nilfs_palloc_req *);
void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 180fc8d36213..fc1caf63a42a 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
{
int ret;
- ret = nilfs_palloc_prepare_alloc_entry(dat, req);
+ ret = nilfs_palloc_prepare_alloc_entry(dat, req, true);
if (ret < 0)
return ret;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index a002a44ff161..4a29b0138d75 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct folio *folio, char *kaddr)
goto Enamelen;
if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
goto Espan;
+ if (unlikely(p->inode &&
+ NILFS_PRIVATE_INODE(le64_to_cpu(p->inode))))
+ goto Einumber;
}
if (offs != limit)
goto Eend;
@@ -160,6 +163,9 @@ Enamelen:
goto bad_entry;
Espan:
error = "directory entry across blocks";
+ goto bad_entry;
+Einumber:
+ error = "disallowed inode number";
bad_entry:
nilfs_error(sb,
"bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d",
@@ -377,11 +383,39 @@ found:
struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop)
{
- struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop);
+ struct folio *folio;
+ struct nilfs_dir_entry *de, *next_de;
+ size_t limit;
+ char *msg;
+ de = nilfs_get_folio(dir, 0, &folio);
if (IS_ERR(de))
return NULL;
- return nilfs_next_entry(de);
+
+ limit = nilfs_last_byte(dir, 0); /* is a multiple of chunk size */
+ if (unlikely(!limit || le64_to_cpu(de->inode) != dir->i_ino ||
+ !nilfs_match(1, ".", de))) {
+ msg = "missing '.'";
+ goto fail;
+ }
+
+ next_de = nilfs_next_entry(de);
+ /*
+ * If "next_de" has not reached the end of the chunk, there is
+ * at least one more record. Check whether it matches "..".
+ */
+ if (unlikely((char *)next_de == (char *)de + nilfs_chunk_size(dir) ||
+ !nilfs_match(2, "..", next_de))) {
+ msg = "missing '..'";
+ goto fail;
+ }
+ *foliop = folio;
+ return next_de;
+
+fail:
+ nilfs_error(dir->i_sb, "directory #%lu %s", dir->i_ino, msg);
+ folio_release_kmap(folio, de);
+ return NULL;
}
ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
@@ -607,7 +641,7 @@ int nilfs_empty_dir(struct inode *inode)
kaddr = nilfs_get_folio(inode, i, &folio);
if (IS_ERR(kaddr))
- continue;
+ return 0;
de = (struct nilfs_dir_entry *)kaddr;
kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 612e609158b5..1e86b9303b7c 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /*
- * 0 says find free inode from beginning
- * of a group. dull code!!
- */
+ req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb);
req.pr_entry_bh = NULL;
- ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+ ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false);
if (!ret) {
ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
&req.pr_entry_bh);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 728e90be3570..4017f7856440 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -116,9 +116,15 @@ enum {
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
+ ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
+ ((ino) >= NILFS_FIRST_INO(sb) || \
+ ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
+
+#define NILFS_PRIVATE_INODE(ino) ({ \
+ ino_t __ino = (ino); \
+ ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \
+ (__ino) != NILFS_SKETCH_INO); })
/**
* struct nilfs_transaction_info: context information for synchronization
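A quick illustration of the new predicate, assuming the usual on-disk constants from include/uapi/linux/nilfs2_ondisk.h (NILFS_ROOT_INO == 2, NILFS_SKETCH_INO == 10, NILFS_USER_INO == 11): directory entries may reference the root inode, the sketch inode and ordinary user inodes, but never internal metadata files such as the DAT.

/* Sketch only; mirrors the nilfs_check_folio() hunk above. */
static inline bool example_dirent_ino_allowed(ino_t ino)
{
	/*
	 * NILFS_PRIVATE_INODE(2)  -> false  (root directory: allowed)
	 * NILFS_PRIVATE_INODE(3)  -> true   (DAT metadata file: rejected)
	 * NILFS_PRIVATE_INODE(10) -> false  (sketch inode: allowed)
	 * NILFS_PRIVATE_INODE(11) -> false  (first user inode: allowed)
	 */
	return !NILFS_PRIVATE_INODE(ino);
}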
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 60d4f59f7665..6ea81f1d5094 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1652,6 +1652,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
if (bh->b_folio != bd_folio) {
if (bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
@@ -1665,6 +1666,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
@@ -1681,6 +1683,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
if (bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index f41d7b6d432c..e44dde57ab65 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
}
nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
+ if (nilfs->ns_first_ino < NILFS_USER_INO) {
+ nilfs_err(nilfs->ns_sb,
+ "too small lower limit for non-reserved inode numbers: %u",
+ nilfs->ns_first_ino);
+ return -EINVAL;
+ }
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 85da0629415d..1e829ed7b0ef 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -182,7 +182,7 @@ struct the_nilfs {
unsigned long ns_nrsvsegs;
unsigned long ns_first_data_block;
int ns_inode_size;
- int ns_first_ino;
+ unsigned int ns_first_ino;
u32 ns_crc_seed;
/* /sys/fs/<nilfs>/<device> */
diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c
index 266c2d7d50bd..2963f3299d7e 100644
--- a/fs/nls/mac-celtic.c
+++ b/fs/nls/mac-celtic.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_macceltic(void)
module_init(init_nls_macceltic)
module_exit(exit_nls_macceltic)
+MODULE_DESCRIPTION("NLS Codepage macceltic");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c
index 9789c6057551..43b20f4bdb67 100644
--- a/fs/nls/mac-centeuro.c
+++ b/fs/nls/mac-centeuro.c
@@ -528,4 +528,5 @@ static void __exit exit_nls_maccenteuro(void)
module_init(init_nls_maccenteuro)
module_exit(exit_nls_maccenteuro)
+MODULE_DESCRIPTION("NLS Codepage maccenteuro");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c
index bb19e7a07d43..62730d6a64e5 100644
--- a/fs/nls/mac-croatian.c
+++ b/fs/nls/mac-croatian.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_maccroatian(void)
module_init(init_nls_maccroatian)
module_exit(exit_nls_maccroatian)
+MODULE_DESCRIPTION("NLS Codepage maccroatian");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c
index 2a7dea36acba..7a5c4d16aac8 100644
--- a/fs/nls/mac-cyrillic.c
+++ b/fs/nls/mac-cyrillic.c
@@ -493,4 +493,5 @@ static void __exit exit_nls_maccyrillic(void)
module_init(init_nls_maccyrillic)
module_exit(exit_nls_maccyrillic)
+MODULE_DESCRIPTION("NLS Codepage maccyrillic");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c
index 77b001653588..3d22f03a90b6 100644
--- a/fs/nls/mac-gaelic.c
+++ b/fs/nls/mac-gaelic.c
@@ -563,4 +563,5 @@ static void __exit exit_nls_macgaelic(void)
module_init(init_nls_macgaelic)
module_exit(exit_nls_macgaelic)
+MODULE_DESCRIPTION("NLS Codepage macgaelic");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c
index 1eccf499e2eb..de3aa9ddb5b1 100644
--- a/fs/nls/mac-greek.c
+++ b/fs/nls/mac-greek.c
@@ -493,4 +493,5 @@ static void __exit exit_nls_macgreek(void)
module_init(init_nls_macgreek)
module_exit(exit_nls_macgreek)
+MODULE_DESCRIPTION("NLS Codepage macgreek");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c
index cbd0875c6d69..0bba83f9d415 100644
--- a/fs/nls/mac-iceland.c
+++ b/fs/nls/mac-iceland.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_maciceland(void)
module_init(init_nls_maciceland)
module_exit(exit_nls_maciceland)
+MODULE_DESCRIPTION("NLS Codepage maciceland");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c
index fba8357aaf03..493386832dfd 100644
--- a/fs/nls/mac-inuit.c
+++ b/fs/nls/mac-inuit.c
@@ -528,4 +528,5 @@ static void __exit exit_nls_macinuit(void)
module_init(init_nls_macinuit)
module_exit(exit_nls_macinuit)
+MODULE_DESCRIPTION("NLS Codepage macinuit");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c
index b6a98a5208cd..d3c082173c20 100644
--- a/fs/nls/mac-roman.c
+++ b/fs/nls/mac-roman.c
@@ -633,4 +633,5 @@ static void __exit exit_nls_macroman(void)
module_init(init_nls_macroman)
module_exit(exit_nls_macroman)
+MODULE_DESCRIPTION("NLS Codepage macroman");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c
index 25547f023638..a7735852f2d5 100644
--- a/fs/nls/mac-romanian.c
+++ b/fs/nls/mac-romanian.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_macromanian(void)
module_init(init_nls_macromanian)
module_exit(exit_nls_macromanian)
+MODULE_DESCRIPTION("NLS Codepage macromanian");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c
index b5454bc7b7fa..d77e9b6b7d7c 100644
--- a/fs/nls/mac-turkish.c
+++ b/fs/nls/mac-turkish.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_macturkish(void)
module_init(init_nls_macturkish)
module_exit(exit_nls_macturkish)
+MODULE_DESCRIPTION("NLS Codepage macturkish");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c
index a2620650d5e4..068143d71284 100644
--- a/fs/nls/nls_ascii.c
+++ b/fs/nls/nls_ascii.c
@@ -163,4 +163,5 @@ static void __exit exit_nls_ascii(void)
module_init(init_nls_ascii)
module_exit(exit_nls_ascii)
+MODULE_DESCRIPTION("NLS ASCII (United States)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index a026dbd3593f..18d597e49a19 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -545,4 +545,5 @@ EXPORT_SYMBOL(unload_nls);
EXPORT_SYMBOL(load_nls);
EXPORT_SYMBOL(load_nls_default);
+MODULE_DESCRIPTION("Base file system native language support");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c
index ace3e19d3407..e22a57a4b828 100644
--- a/fs/nls/nls_cp1250.c
+++ b/fs/nls/nls_cp1250.c
@@ -343,4 +343,5 @@ static void __exit exit_nls_cp1250(void)
module_init(init_nls_cp1250)
module_exit(exit_nls_cp1250)
+MODULE_DESCRIPTION("NLS Windows CP1250 (Slavic/Central European Languages)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c
index 9273ddfd08a1..6f46d339f23c 100644
--- a/fs/nls/nls_cp1251.c
+++ b/fs/nls/nls_cp1251.c
@@ -298,4 +298,5 @@ static void __exit exit_nls_cp1251(void)
module_init(init_nls_cp1251)
module_exit(exit_nls_cp1251)
+MODULE_DESCRIPTION("NLS Windows CP1251 (Bulgarian, Belarusian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c
index 1caf5dfed85b..299e089d4301 100644
--- a/fs/nls/nls_cp1255.c
+++ b/fs/nls/nls_cp1255.c
@@ -380,5 +380,6 @@ static void __exit exit_nls_cp1255(void)
module_init(init_nls_cp1255)
module_exit(exit_nls_cp1255)
+MODULE_DESCRIPTION("NLS Hebrew charsets (ISO-8859-8, CP1255)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(iso8859-8);
diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c
index 7ddb830da3fd..ab880499ea32 100644
--- a/fs/nls/nls_cp437.c
+++ b/fs/nls/nls_cp437.c
@@ -384,4 +384,5 @@ static void __exit exit_nls_cp437(void)
module_init(init_nls_cp437)
module_exit(exit_nls_cp437)
+MODULE_DESCRIPTION("NLS Codepage 437 (United States, Canada)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c
index c593f683a0cd..5c37618296e9 100644
--- a/fs/nls/nls_cp737.c
+++ b/fs/nls/nls_cp737.c
@@ -347,4 +347,5 @@ static void __exit exit_nls_cp737(void)
module_init(init_nls_cp737)
module_exit(exit_nls_cp737)
+MODULE_DESCRIPTION("NLS Codepage 737 (Greek)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c
index 554c863745f2..51ccc908901f 100644
--- a/fs/nls/nls_cp775.c
+++ b/fs/nls/nls_cp775.c
@@ -316,4 +316,5 @@ static void __exit exit_nls_cp775(void)
module_init(init_nls_cp775)
module_exit(exit_nls_cp775)
+MODULE_DESCRIPTION("NLS Codepage 775 (Baltic Rim)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c
index 56cccd14b40b..5f9b9507a8b6 100644
--- a/fs/nls/nls_cp850.c
+++ b/fs/nls/nls_cp850.c
@@ -312,4 +312,5 @@ static void __exit exit_nls_cp850(void)
module_init(init_nls_cp850)
module_exit(exit_nls_cp850)
+MODULE_DESCRIPTION("NLS Codepage 850 (Europe)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c
index 7cdc05ac1d40..fc513a5e8358 100644
--- a/fs/nls/nls_cp852.c
+++ b/fs/nls/nls_cp852.c
@@ -334,4 +334,5 @@ static void __exit exit_nls_cp852(void)
module_init(init_nls_cp852)
module_exit(exit_nls_cp852)
+MODULE_DESCRIPTION("NLS Codepage 852 (Central/Eastern Europe)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c
index 7426eea05663..a43be58adb36 100644
--- a/fs/nls/nls_cp855.c
+++ b/fs/nls/nls_cp855.c
@@ -296,4 +296,5 @@ static void __exit exit_nls_cp855(void)
module_init(init_nls_cp855)
module_exit(exit_nls_cp855)
+MODULE_DESCRIPTION("NLS Codepage 855 (Cyrillic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c
index 098309733ebd..772cd4195bad 100644
--- a/fs/nls/nls_cp857.c
+++ b/fs/nls/nls_cp857.c
@@ -298,4 +298,5 @@ static void __exit exit_nls_cp857(void)
module_init(init_nls_cp857)
module_exit(exit_nls_cp857)
+MODULE_DESCRIPTION("NLS Codepage 857 (Turkish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c
index 84224478e731..36cf4ca11966 100644
--- a/fs/nls/nls_cp860.c
+++ b/fs/nls/nls_cp860.c
@@ -361,4 +361,5 @@ static void __exit exit_nls_cp860(void)
module_init(init_nls_cp860)
module_exit(exit_nls_cp860)
+MODULE_DESCRIPTION("NLS Codepage 860 (Portuguese)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c
index dc873e4be092..b7397d079f8f 100644
--- a/fs/nls/nls_cp861.c
+++ b/fs/nls/nls_cp861.c
@@ -384,4 +384,5 @@ static void __exit exit_nls_cp861(void)
module_init(init_nls_cp861)
module_exit(exit_nls_cp861)
+MODULE_DESCRIPTION("NLS Codepage 861 (Icelandic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c
index d5263e3c5566..fd3b95d1e95d 100644
--- a/fs/nls/nls_cp862.c
+++ b/fs/nls/nls_cp862.c
@@ -418,4 +418,5 @@ static void __exit exit_nls_cp862(void)
module_init(init_nls_cp862)
module_exit(exit_nls_cp862)
+MODULE_DESCRIPTION("NLS Codepage 862 (Hebrew)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c
index 051c9832e36a..813ae7944249 100644
--- a/fs/nls/nls_cp863.c
+++ b/fs/nls/nls_cp863.c
@@ -378,4 +378,5 @@ static void __exit exit_nls_cp863(void)
module_init(init_nls_cp863)
module_exit(exit_nls_cp863)
+MODULE_DESCRIPTION("NLS Codepage 863 (Canadian French)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c
index 97eb1273b2f7..d9eb6d5cd47a 100644
--- a/fs/nls/nls_cp864.c
+++ b/fs/nls/nls_cp864.c
@@ -404,4 +404,5 @@ static void __exit exit_nls_cp864(void)
module_init(init_nls_cp864)
module_exit(exit_nls_cp864)
+MODULE_DESCRIPTION("NLS Codepage 864 (Arabic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c
index 111214228525..2678ffd98bb6 100644
--- a/fs/nls/nls_cp865.c
+++ b/fs/nls/nls_cp865.c
@@ -384,4 +384,5 @@ static void __exit exit_nls_cp865(void)
module_init(init_nls_cp865)
module_exit(exit_nls_cp865)
+MODULE_DESCRIPTION("NLS Codepage 865 (Norwegian, Danish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c
index ffdcbc3fc38d..7e93d0a3802a 100644
--- a/fs/nls/nls_cp866.c
+++ b/fs/nls/nls_cp866.c
@@ -302,4 +302,5 @@ static void __exit exit_nls_cp866(void)
module_init(init_nls_cp866)
module_exit(exit_nls_cp866)
+MODULE_DESCRIPTION("NLS Codepage 866 (Cyrillic/Russian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c
index 3b5a34589354..4491737dd5cb 100644
--- a/fs/nls/nls_cp869.c
+++ b/fs/nls/nls_cp869.c
@@ -312,4 +312,5 @@ static void __exit exit_nls_cp869(void)
module_init(init_nls_cp869)
module_exit(exit_nls_cp869)
+MODULE_DESCRIPTION("NLS Codepage 869 (Greek)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c
index 8dfaa10710fa..4fcfbf8ca72c 100644
--- a/fs/nls/nls_cp874.c
+++ b/fs/nls/nls_cp874.c
@@ -271,5 +271,6 @@ static void __exit exit_nls_cp874(void)
module_init(init_nls_cp874)
module_exit(exit_nls_cp874)
+MODULE_DESCRIPTION("NLS Thai charset (CP874, TIS-620)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(tis-620);
diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c
index 67b7398e8483..e5e6270fcca6 100644
--- a/fs/nls/nls_cp932.c
+++ b/fs/nls/nls_cp932.c
@@ -7929,5 +7929,6 @@ static void __exit exit_nls_cp932(void)
module_init(init_nls_cp932)
module_exit(exit_nls_cp932)
+MODULE_DESCRIPTION("NLS Japanese charset (Shift-JIS)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(sjis);
diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c
index c96546cfec9f..91d0a15fd7f9 100644
--- a/fs/nls/nls_cp936.c
+++ b/fs/nls/nls_cp936.c
@@ -11107,5 +11107,6 @@ static void __exit exit_nls_cp936(void)
module_init(init_nls_cp936)
module_exit(exit_nls_cp936)
+MODULE_DESCRIPTION("NLS Simplified Chinese charset (CP936, GB2312)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(gb2312);
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
index 199171e97aa4..3ae03c76d59c 100644
--- a/fs/nls/nls_cp949.c
+++ b/fs/nls/nls_cp949.c
@@ -13942,5 +13942,6 @@ static void __exit exit_nls_cp949(void)
module_init(init_nls_cp949)
module_exit(exit_nls_cp949)
+MODULE_DESCRIPTION("NLS Korean charset (CP949, EUC-KR)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(euc-kr);
diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c
index 8e1418708209..e968aa80198d 100644
--- a/fs/nls/nls_cp950.c
+++ b/fs/nls/nls_cp950.c
@@ -9478,5 +9478,6 @@ static void __exit exit_nls_cp950(void)
module_init(init_nls_cp950)
module_exit(exit_nls_cp950)
+MODULE_DESCRIPTION("NLS Traditional Chinese charset (Big5)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(big5);
diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c
index 162b3f160353..0191cc9d955e 100644
--- a/fs/nls/nls_euc-jp.c
+++ b/fs/nls/nls_euc-jp.c
@@ -577,4 +577,5 @@ static void __exit exit_nls_euc_jp(void)
module_init(init_nls_euc_jp)
module_exit(exit_nls_euc_jp)
+MODULE_DESCRIPTION("NLS Japanese charset (EUC-JP)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c
index 69ac020d43b1..a181be488f7d 100644
--- a/fs/nls/nls_iso8859-1.c
+++ b/fs/nls/nls_iso8859-1.c
@@ -254,4 +254,5 @@ static void __exit exit_nls_iso8859_1(void)
module_init(init_nls_iso8859_1)
module_exit(exit_nls_iso8859_1)
+MODULE_DESCRIPTION("NLS ISO 8859-1 (Latin 1; Western European Languages)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c
index afb3f8f275f0..8e2be5bfeaf1 100644
--- a/fs/nls/nls_iso8859-13.c
+++ b/fs/nls/nls_iso8859-13.c
@@ -282,4 +282,5 @@ static void __exit exit_nls_iso8859_13(void)
module_init(init_nls_iso8859_13)
module_exit(exit_nls_iso8859_13)
+MODULE_DESCRIPTION("NLS ISO 8859-13 (Latin 7; Baltic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c
index 046370f0b6f0..c789eccb8a69 100644
--- a/fs/nls/nls_iso8859-14.c
+++ b/fs/nls/nls_iso8859-14.c
@@ -338,4 +338,5 @@ static void __exit exit_nls_iso8859_14(void)
module_init(init_nls_iso8859_14)
module_exit(exit_nls_iso8859_14)
+MODULE_DESCRIPTION("NLS ISO 8859-14 (Latin 8; Celtic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c
index 7e34a841a056..ffec649176fb 100644
--- a/fs/nls/nls_iso8859-15.c
+++ b/fs/nls/nls_iso8859-15.c
@@ -304,4 +304,5 @@ static void __exit exit_nls_iso8859_15(void)
module_init(init_nls_iso8859_15)
module_exit(exit_nls_iso8859_15)
+MODULE_DESCRIPTION("NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c
index 7dd571181741..d352334d0314 100644
--- a/fs/nls/nls_iso8859-2.c
+++ b/fs/nls/nls_iso8859-2.c
@@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_2(void)
module_init(init_nls_iso8859_2)
module_exit(exit_nls_iso8859_2)
+MODULE_DESCRIPTION("NLS ISO 8859-2 (Latin 2; Slavic/Central European Languages)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c
index 740b75ec4493..09990e6634d2 100644
--- a/fs/nls/nls_iso8859-3.c
+++ b/fs/nls/nls_iso8859-3.c
@@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_3(void)
module_init(init_nls_iso8859_3)
module_exit(exit_nls_iso8859_3)
+MODULE_DESCRIPTION("NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c
index 8826021e32f5..92795224912e 100644
--- a/fs/nls/nls_iso8859-4.c
+++ b/fs/nls/nls_iso8859-4.c
@@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_4(void)
module_init(init_nls_iso8859_4)
module_exit(exit_nls_iso8859_4)
+MODULE_DESCRIPTION("NLS ISO 8859-4 (Latin 4; old Baltic charset)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c
index 7c04057a1ad8..32309315307a 100644
--- a/fs/nls/nls_iso8859-5.c
+++ b/fs/nls/nls_iso8859-5.c
@@ -269,4 +269,5 @@ static void __exit exit_nls_iso8859_5(void)
module_init(init_nls_iso8859_5)
module_exit(exit_nls_iso8859_5)
+MODULE_DESCRIPTION("NLS ISO 8859-5 (Cyrillic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c
index d4a881400d74..c18183469d2a 100644
--- a/fs/nls/nls_iso8859-6.c
+++ b/fs/nls/nls_iso8859-6.c
@@ -260,4 +260,5 @@ static void __exit exit_nls_iso8859_6(void)
module_init(init_nls_iso8859_6)
module_exit(exit_nls_iso8859_6)
+MODULE_DESCRIPTION("NLS ISO 8859-6 (Arabic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c
index 37b75d825a75..3652d6832864 100644
--- a/fs/nls/nls_iso8859-7.c
+++ b/fs/nls/nls_iso8859-7.c
@@ -314,4 +314,5 @@ static void __exit exit_nls_iso8859_7(void)
module_init(init_nls_iso8859_7)
module_exit(exit_nls_iso8859_7)
+MODULE_DESCRIPTION("NLS ISO 8859-7 (Modern Greek)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c
index 557b98250d37..11a67834b855 100644
--- a/fs/nls/nls_iso8859-9.c
+++ b/fs/nls/nls_iso8859-9.c
@@ -269,4 +269,5 @@ static void __exit exit_nls_iso8859_9(void)
module_init(init_nls_iso8859_9)
module_exit(exit_nls_iso8859_9)
+MODULE_DESCRIPTION("NLS ISO 8859-9 (Latin 5; Turkish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c
index 811f232fccfb..e3dca27a3803 100644
--- a/fs/nls/nls_koi8-r.c
+++ b/fs/nls/nls_koi8-r.c
@@ -320,4 +320,5 @@ static void __exit exit_nls_koi8_r(void)
module_init(init_nls_koi8_r)
module_exit(exit_nls_koi8_r)
+MODULE_DESCRIPTION("NLS KOI8-R (Russian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c
index a80a741a8676..07afcd9e58c0 100644
--- a/fs/nls/nls_koi8-ru.c
+++ b/fs/nls/nls_koi8-ru.c
@@ -79,4 +79,5 @@ static void __exit exit_nls_koi8_ru(void)
module_init(init_nls_koi8_ru)
module_exit(exit_nls_koi8_ru)
+MODULE_DESCRIPTION("NLS KOI8-RU (Belarusian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c
index 7e029e4c188a..f60645758c1a 100644
--- a/fs/nls/nls_koi8-u.c
+++ b/fs/nls/nls_koi8-u.c
@@ -327,4 +327,5 @@ static void __exit exit_nls_koi8_u(void)
module_init(init_nls_koi8_u)
module_exit(exit_nls_koi8_u)
+MODULE_DESCRIPTION("NLS KOI8-U (Ukrainian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_ucs2_utils.c b/fs/nls/nls_ucs2_utils.c
index a69781c54dd8..d4564b79d7bf 100644
--- a/fs/nls/nls_ucs2_utils.c
+++ b/fs/nls/nls_ucs2_utils.c
@@ -16,6 +16,7 @@
#include <asm/unaligned.h>
#include "nls_ucs2_utils.h"
+MODULE_DESCRIPTION("NLS UCS-2");
MODULE_LICENSE("GPL");
/*
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index afcfbc4a14db..a0fa0610eaac 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -64,4 +64,5 @@ static void __exit exit_nls_utf8(void)
module_init(init_nls_utf8)
module_exit(exit_nls_utf8)
+MODULE_DESCRIPTION("NLS UTF-8");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 07e22a15ef02..a4a925dce331 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -8,10 +8,12 @@
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
+#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
+#include "mount.h"
#include "internal.h"
static struct vfsmount *nsfs_mnt;
@@ -82,40 +84,47 @@ int ns_get_path(struct path *path, struct task_struct *task,
return ns_get_path_cb(path, ns_get_path_task, &args);
}
-int open_related_ns(struct ns_common *ns,
- struct ns_common *(*get_ns)(struct ns_common *ns))
+/**
+ * open_namespace - open a namespace
+ * @ns: the namespace to open
+ *
+ * This will consume a reference to @ns independent of success or failure.
+ *
+ * Return: A file descriptor on success or a negative error code on failure.
+ */
+int open_namespace(struct ns_common *ns)
{
- struct path path = {};
- struct ns_common *relative;
+ struct path path __free(path_put) = {};
struct file *f;
int err;
- int fd;
- fd = get_unused_fd_flags(O_CLOEXEC);
+ /* call first to consume reference */
+ err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (err < 0)
+ return err;
+
+ CLASS(get_unused_fd, fd)(O_CLOEXEC);
if (fd < 0)
return fd;
- relative = get_ns(ns);
- if (IS_ERR(relative)) {
- put_unused_fd(fd);
- return PTR_ERR(relative);
- }
+ f = dentry_open(&path, O_RDONLY, current_cred());
+ if (IS_ERR(f))
+ return PTR_ERR(f);
- err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
- if (err < 0) {
- put_unused_fd(fd);
- return err;
- }
+ fd_install(fd, f);
+ return take_fd(fd);
+}
- f = dentry_open(&path, O_RDONLY, current_cred());
- path_put(&path);
- if (IS_ERR(f)) {
- put_unused_fd(fd);
- fd = PTR_ERR(f);
- } else
- fd_install(fd, f);
+int open_related_ns(struct ns_common *ns,
+ struct ns_common *(*get_ns)(struct ns_common *ns))
+{
+ struct ns_common *relative;
+
+ relative = get_ns(ns);
+ if (IS_ERR(relative))
+ return PTR_ERR(relative);
- return fd;
+ return open_namespace(relative);
}
EXPORT_SYMBOL_GPL(open_related_ns);
@@ -123,9 +132,12 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct user_namespace *user_ns;
+ struct pid_namespace *pid_ns;
+ struct task_struct *tsk;
struct ns_common *ns = get_proc_ns(file_inode(filp));
uid_t __user *argp;
uid_t uid;
+ int ret;
switch (ioctl) {
case NS_GET_USERNS:
@@ -143,9 +155,69 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
+ case NS_GET_MNTNS_ID: {
+ struct mnt_namespace *mnt_ns;
+ __u64 __user *idp;
+ __u64 id;
+
+ if (ns->ops->type != CLONE_NEWNS)
+ return -EINVAL;
+
+ mnt_ns = container_of(ns, struct mnt_namespace, ns);
+ idp = (__u64 __user *)arg;
+ id = mnt_ns->seq;
+ return put_user(id, idp);
+ }
+ case NS_GET_PID_FROM_PIDNS:
+ fallthrough;
+ case NS_GET_TGID_FROM_PIDNS:
+ fallthrough;
+ case NS_GET_PID_IN_PIDNS:
+ fallthrough;
+ case NS_GET_TGID_IN_PIDNS:
+ if (ns->ops->type != CLONE_NEWPID)
+ return -EINVAL;
+
+ ret = -ESRCH;
+ pid_ns = container_of(ns, struct pid_namespace, ns);
+
+ rcu_read_lock();
+
+ if (ioctl == NS_GET_PID_IN_PIDNS ||
+ ioctl == NS_GET_TGID_IN_PIDNS)
+ tsk = find_task_by_vpid(arg);
+ else
+ tsk = find_task_by_pid_ns(arg, pid_ns);
+ if (!tsk)
+ break;
+
+ switch (ioctl) {
+ case NS_GET_PID_FROM_PIDNS:
+ ret = task_pid_vnr(tsk);
+ break;
+ case NS_GET_TGID_FROM_PIDNS:
+ ret = task_tgid_vnr(tsk);
+ break;
+ case NS_GET_PID_IN_PIDNS:
+ ret = task_pid_nr_ns(tsk, pid_ns);
+ break;
+ case NS_GET_TGID_IN_PIDNS:
+ ret = task_tgid_nr_ns(tsk, pid_ns);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!ret)
+ ret = -ESRCH;
+ break;
default:
- return -ENOTTY;
+ ret = -ENOTTY;
}
+
+ return ret;
}
int ns_get_name(char *buf, size_t size, struct task_struct *task,
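A userspace sketch of the four new PID-translation ioctls; the PID values and /proc path are made up for illustration, and the NS_GET_* constants come from <linux/nsfs.h>:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/nsfs.h>

int main(void)
{
	int nsfd = open("/proc/1234/ns/pid", O_RDONLY);

	if (nsfd < 0)
		return 1;

	/* What is our PID 1234 called inside that namespace? */
	int inner = ioctl(nsfd, NS_GET_PID_IN_PIDNS, 1234);

	/* Which of our PIDs is the namespace's init (PID 1)? */
	int outer = ioctl(nsfd, NS_GET_PID_FROM_PIDNS, 1);

	/* Both fail with -1/ESRCH if the task cannot be found. */
	printf("inner=%d outer=%d\n", inner, outer);
	return 0;
}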
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 27fbde2701b6..c5b688c5f984 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -259,8 +259,8 @@ enum Opt {
// clang-format off
static const struct fs_parameter_spec ntfs_fs_parameters[] = {
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("gid", Opt_gid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_gid("gid", Opt_gid),
fsparam_u32oct("umask", Opt_umask),
fsparam_u32oct("dmask", Opt_dmask),
fsparam_u32oct("fmask", Opt_fmask),
@@ -319,14 +319,10 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
switch (opt) {
case Opt_uid:
- opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(opts->fs_uid))
- return invalf(fc, "ntfs3: Invalid value for uid.");
+ opts->fs_uid = result.uid;
break;
case Opt_gid:
- opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(opts->fs_gid))
- return invalf(fc, "ntfs3: Invalid value for gid.");
+ opts->fs_gid = result.gid;
break;
case Opt_umask:
if (result.uint_32 & ~07777)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f0467d3b3c88..6be175a1ab3c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2366,6 +2366,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
}
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_mark_extent_written(inode, &et, handle,
ue->ue_cpos, 1,
ue->ue_phys,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 604fea3a26ff..530fba34f6d3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -446,6 +446,23 @@ bail:
}
/*
+ * Make sure the handle has at least 'nblocks' credits available. If it does
+ * not have that many credits available, we will try to extend the handle to
+ * have enough credits. If that fails, we will restart the transaction to have
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
+/*
* If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
* If that fails, restart the transaction & regain write access for the
* buffer head which is used for metadata modifications.
@@ -479,12 +496,6 @@ bail:
return status;
}
-
-struct ocfs2_triggers {
- struct jbd2_buffer_trigger_type ot_triggers;
- int ot_offset;
-};
-
static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
{
return container_of(triggers, struct ocfs2_triggers, ot_triggers);
@@ -548,85 +559,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh)
{
+ struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
mlog(ML_ERROR,
"ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
"bh->b_blocknr = %llu\n",
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- ocfs2_error(bh->b_assoc_map->host->i_sb,
+ ocfs2_error(ot->sb,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
-static struct ocfs2_triggers di_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dinode, i_check),
-};
-
-static struct ocfs2_triggers eb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
-};
-
-static struct ocfs2_triggers rb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
-};
-
-static struct ocfs2_triggers gd_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
-};
-
-static struct ocfs2_triggers db_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_db_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
-};
+static void ocfs2_setup_csum_triggers(struct super_block *sb,
+ enum ocfs2_journal_trigger_type type,
+ struct ocfs2_triggers *ot)
+{
+ BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT);
-static struct ocfs2_triggers xb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
-};
+ switch (type) {
+ case OCFS2_JTR_DI:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dinode, i_check);
+ break;
+ case OCFS2_JTR_EB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check);
+ break;
+ case OCFS2_JTR_RB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_refcount_block, rf_check);
+ break;
+ case OCFS2_JTR_GD:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check);
+ break;
+ case OCFS2_JTR_DB:
+ ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger;
+ break;
+ case OCFS2_JTR_XB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check);
+ break;
+ case OCFS2_JTR_DQ:
+ ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger;
+ break;
+ case OCFS2_JTR_DR:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check);
+ break;
+ case OCFS2_JTR_DL:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check);
+ break;
+ case OCFS2_JTR_NONE:
+ /* To make compiler happy... */
+ return;
+ }
-static struct ocfs2_triggers dq_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_dq_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
-};
+ ot->ot_triggers.t_abort = ocfs2_abort_trigger;
+ ot->sb = sb;
+}
-static struct ocfs2_triggers dr_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
-};
+void ocfs2_initialize_journal_triggers(struct super_block *sb,
+ struct ocfs2_triggers triggers[])
+{
+ enum ocfs2_journal_trigger_type type;
-static struct ocfs2_triggers dl_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
-};
+ for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++)
+ ocfs2_setup_csum_triggers(sb, type, &triggers[type]);
+}
static int __ocfs2_journal_access(handle_t *handle,
struct ocfs2_caching_info *ci,
@@ -708,56 +710,91 @@ static int __ocfs2_journal_access(handle_t *handle,
int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DI],
+ type);
}
int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_EB],
+ type);
}
int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_RB],
type);
}
int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_GD],
+ type);
}
int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DB],
+ type);
}
int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_XB],
+ type);
}
int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DQ],
+ type);
}
int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DR],
+ type);
}
int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DL],
+ type);
}
int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
@@ -778,13 +815,15 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
if (!is_handle_aborted(handle)) {
journal_t *journal = handle->h_transaction->t_journal;
- mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
- "Aborting transaction and journal.\n");
+ mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: "
+ "handle type %u started at line %u, credits %u/%u "
+ "errcode %d. Aborting transaction and journal.\n",
+ handle->h_type, handle->h_line_no,
+ handle->h_requested_credits,
+ jbd2_handle_buffer_credits(handle), status);
handle->h_err = status;
jbd2_journal_abort_handle(handle);
jbd2_journal_abort(journal, status);
- ocfs2_abort(bh->b_assoc_map->host->i_sb,
- "Journal already aborted.\n");
}
}
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 41c9fe7e62f9..e3c3a35dc5e0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -243,6 +243,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a503c553bab2..8fe826143d7b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -284,6 +284,30 @@ enum ocfs2_mount_options
#define OCFS2_OSB_ERROR_FS 0x0004
#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+struct ocfs2_triggers {
+ struct jbd2_buffer_trigger_type ot_triggers;
+ int ot_offset;
+ struct super_block *sb;
+};
+
+enum ocfs2_journal_trigger_type {
+ OCFS2_JTR_DI,
+ OCFS2_JTR_EB,
+ OCFS2_JTR_RB,
+ OCFS2_JTR_GD,
+ OCFS2_JTR_DB,
+ OCFS2_JTR_XB,
+ OCFS2_JTR_DQ,
+ OCFS2_JTR_DR,
+ OCFS2_JTR_DL,
+ OCFS2_JTR_NONE /* This must be the last entry */
+};
+
+#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE
+
+void ocfs2_initialize_journal_triggers(struct super_block *sb,
+ struct ocfs2_triggers triggers[]);
+
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
@@ -351,6 +375,9 @@ struct ocfs2_super
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
+ /* Journal triggers for checksum */
+ struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT];
+
struct delayed_work la_enable_wq;
/*
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 60e208b01c8d..0511c69c9fde 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2577,6 +2577,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8aabaed2c1cb..afee70125ae3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1075,9 +1075,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
osb, &ocfs2_osb_debug_fops);
- if (ocfs2_meta_ecc(osb))
+ if (ocfs2_meta_ecc(osb)) {
+ ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers);
ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats,
osb->osb_debug_root);
+ }
status = ocfs2_mount_volume(sb);
if (status < 0)
diff --git a/fs/open.c b/fs/open.c
index 89cafb572061..22adbef7ecc2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
return error;
}
-SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
@@ -247,6 +247,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
long ret;
+ loff_t sum;
if (offset < 0 || len <= 0)
return -EINVAL;
@@ -319,8 +320,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
return -ENODEV;
- /* Check for wrap through zero too */
- if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
+ /* Check for wraparound */
+ if (check_add_overflow(offset, len, &sum))
+ return -EFBIG;
+
+ if (sum > inode->i_sb->s_maxbytes)
return -EFBIG;
if (!file->f_op->fallocate)
@@ -982,12 +986,11 @@ static int do_dentry_open(struct file *f,
*/
if (f->f_mode & FMODE_WRITE) {
/*
- * Paired with smp_mb() in collapse_file() to ensure nr_thps
- * is up to date and the update to i_writecount by
- * get_write_access() is visible. Ensures subsequent insertion
- * of THPs into the page cache will fail.
+ * Depends on full fence from get_write_access() to synchronize
+ * against collapse_file() regarding i_writecount and nr_thps
+ * updates. Ensures subsequent insertion of THPs into the page
+ * cache will fail.
*/
- smp_mb();
if (filemap_nr_thps(inode->i_mapping)) {
struct address_space *mapping = inode->i_mapping;
@@ -1004,11 +1007,6 @@ static int do_dentry_open(struct file *f,
}
}
- /*
- * Once we return a file with FMODE_OPENED, __fput() will call
- * fsnotify_close(), so we need fsnotify_open() here for symmetry.
- */
- fsnotify_open(f);
return 0;
cleanup_all:
@@ -1085,8 +1083,19 @@ EXPORT_SYMBOL(file_path);
*/
int vfs_open(const struct path *path, struct file *file)
{
+ int ret;
+
file->f_path = *path;
- return do_dentry_open(file, NULL);
+ ret = do_dentry_open(file, NULL);
+ if (!ret) {
+ /*
+ * Once we return a file with FMODE_OPENED, __fput() will call
+ * fsnotify_close(), so we need fsnotify_open() here for
+ * symmetry.
+ */
+ fsnotify_open(file);
+ }
+ return ret;
}
struct file *dentry_open(const struct path *path, int flags,
@@ -1177,8 +1186,10 @@ struct file *kernel_file_open(const struct path *path, int flags,
error = do_dentry_open(f, NULL);
if (error) {
fput(f);
- f = ERR_PTR(error);
+ return ERR_PTR(error);
}
+
+ fsnotify_open(f);
return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);
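The vfs_fallocate() hunk above replaces the hand-rolled '(offset + len) < 0' wraparound test with check_add_overflow(). A standalone sketch of the same idiom, built on the __builtin_add_overflow() primitive that the kernel macro wraps:

#include <stdbool.h>
#include <stdint.h>

/* Returns true when offset + len wraps or exceeds maxbytes. */
static bool range_too_big(int64_t offset, int64_t len, int64_t maxbytes)
{
	int64_t sum;

	if (__builtin_add_overflow(offset, len, &sum))
		return true;	/* signed wraparound: reject, as -EFBIG above */

	return sum > maxbytes;
}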
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index a7b527ea50d3..26ecda0e4d19 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -471,4 +471,5 @@ static void __exit exit_openprom_fs(void)
module_init(init_openprom_fs)
module_exit(exit_openprom_fs)
+MODULE_DESCRIPTION("OpenPROM filesystem support");
MODULE_LICENSE("GPL");
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 085912268442..fdb9b65db1de 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -56,7 +56,6 @@ static int orangefs_writepage_locked(struct page *page,
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
if (ret < 0) {
- SetPageError(page);
mapping_set_error(page->mapping, ret);
} else {
ret = 0;
@@ -119,7 +118,6 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
0, &wr, NULL, NULL);
if (ret < 0) {
for (i = 0; i < ow->npages; i++) {
- SetPageError(ow->pages[i]);
mapping_set_error(ow->pages[i]->mapping, ret);
if (PagePrivate(ow->pages[i])) {
wrp = (struct orangefs_write_range *)
@@ -303,15 +301,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
flush_dcache_folio(folio);
- if (ret < 0) {
- folio_set_error(folio);
- } else {
- folio_mark_uptodate(folio);
+ if (ret > 0)
ret = 0;
- }
- /* unlock the folio after the ->read_folio() routine completes */
- folio_unlock(folio);
- return ret;
+ folio_end_read(folio, ret == 0);
+ return ret;
}
static int orangefs_write_begin(struct file *file,
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index b501dc07f922..edcca4beb765 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -274,10 +274,8 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
gossip_err("orangefs error: asked for %d pages, only got %d.\n",
bufmap->page_count, ret);
- for (i = 0; i < ret; i++) {
- SetPageError(bufmap->page_array[i]);
+ for (i = 0; i < ret; i++)
unpin_user_page(bufmap->page_array[i]);
- }
return -ENOMEM;
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 116f542442dd..ab65e98a1def 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1314,10 +1314,6 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
int flags = file->f_flags | OVL_OPEN_FLAGS;
int err;
- err = ovl_copy_up(dentry->d_parent);
- if (err)
- return err;
-
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
if (err)
@@ -1360,6 +1356,10 @@ static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (!OVL_FS(dentry->d_sb)->tmpfile)
return -EOPNOTSUPP;
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
return err;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 063409069f56..5868cb222955 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -181,6 +181,10 @@ static int ovl_check_encode_origin(struct dentry *dentry)
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
bool decodable = ofs->config.nfs_export;
+ /* No upper layer? */
+ if (!ovl_upper_mnt(ofs))
+ return 1;
+
/* Lower file handle for non-upper non-decodable */
if (!ovl_dentry_upper(dentry) && !decodable)
return 1;
@@ -209,7 +213,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
* ovl_connect_layer() will try to make origin's layer "connected" by
* copying up a "connectable" ancestor.
*/
- if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable)
+ if (d_is_dir(dentry) && decodable)
return ovl_connect_layer(dentry);
/* Lower file handle for indexed and non-upper dir/non-dir */
diff --git a/fs/pidfs.c b/fs/pidfs.c
index dbb9d854d1c5..c9cb14181def 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -11,10 +11,16 @@
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/pseudo_fs.h>
+#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <uapi/linux/pidfd.h>
+#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
#include "internal.h"
+#include "mount.h"
#ifdef CONFIG_PROC_FS
/**
@@ -108,11 +114,95 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
return poll_flags;
}
+static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct task_struct *task __free(put_task) = NULL;
+ struct nsproxy *nsp __free(put_nsproxy) = NULL;
+ struct pid *pid = pidfd_pid(file);
+ struct ns_common *ns_common;
+
+ if (arg)
+ return -EINVAL;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
+
+ scoped_guard(task_lock, task) {
+ nsp = task->nsproxy;
+ if (nsp)
+ get_nsproxy(nsp);
+ }
+ if (!nsp)
+ return -ESRCH; /* just pretend it didn't exist */
+
+ /*
+ * We're trying to open a file descriptor to the namespace so perform a
+ * filesystem cred ptrace check. Also, we mirror nsfs behavior.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ return -EACCES;
+
+ switch (cmd) {
+ /* Namespaces that hang off nsproxy. */
+ case PIDFD_GET_CGROUP_NAMESPACE:
+ get_cgroup_ns(nsp->cgroup_ns);
+ ns_common = to_ns_common(nsp->cgroup_ns);
+ break;
+ case PIDFD_GET_IPC_NAMESPACE:
+ get_ipc_ns(nsp->ipc_ns);
+ ns_common = to_ns_common(nsp->ipc_ns);
+ break;
+ case PIDFD_GET_MNT_NAMESPACE:
+ get_mnt_ns(nsp->mnt_ns);
+ ns_common = to_ns_common(nsp->mnt_ns);
+ break;
+ case PIDFD_GET_NET_NAMESPACE:
+ ns_common = to_ns_common(nsp->net_ns);
+ get_net_ns(ns_common);
+ break;
+ case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
+ get_pid_ns(nsp->pid_ns_for_children);
+ ns_common = to_ns_common(nsp->pid_ns_for_children);
+ break;
+ case PIDFD_GET_TIME_NAMESPACE:
+ get_time_ns(nsp->time_ns);
+ ns_common = to_ns_common(nsp->time_ns);
+ break;
+ case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
+ get_time_ns(nsp->time_ns_for_children);
+ ns_common = to_ns_common(nsp->time_ns_for_children);
+ break;
+ case PIDFD_GET_UTS_NAMESPACE:
+ get_uts_ns(nsp->uts_ns);
+ ns_common = to_ns_common(nsp->uts_ns);
+ break;
+ /* Namespaces that don't hang off nsproxy. */
+ case PIDFD_GET_USER_NAMESPACE:
+ rcu_read_lock();
+ ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
+ rcu_read_unlock();
+ break;
+ case PIDFD_GET_PID_NAMESPACE:
+ rcu_read_lock();
+ ns_common = to_ns_common(get_pid_ns(task_active_pid_ns(task)));
+ rcu_read_unlock();
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ /* open_namespace() unconditionally consumes the reference */
+ return open_namespace(ns_common);
+}
+
static const struct file_operations pidfs_file_operations = {
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
#endif
+ .unlocked_ioctl = pidfd_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
struct pid *pidfd_pid(const struct file *file)
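Each PIDFD_GET_* command above takes a reference on the chosen namespace and hands it to open_namespace(), which returns a new namespace file descriptor (and consumes the reference on failure). A hedged userspace sketch; it assumes uapi headers new enough to carry the PIDFD_GET_* constants, and note that the ioctl argument must be 0:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/pidfd.h>

    int main(void)
    {
            /* Open a pidfd for ourselves, then fetch our user namespace. */
            int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
            if (pidfd < 0)
                    return 1;

            /* arg must be 0; fails with EACCES without ptrace permission. */
            int nsfd = ioctl(pidfd, PIDFD_GET_USER_NAMESPACE, 0);
            if (nsfd < 0)
                    perror("PIDFD_GET_USER_NAMESPACE");
            else
                    printf("user namespace fd: %d\n", nsfd);

            close(pidfd);
            return 0;
    }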
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 18550c071d71..72a1acd03675 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
- seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages);
+ seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 775ce0bcf08c..c02f1e63f82d 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -202,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum)
{
int i;
- i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1,
- GFP_KERNEL);
+ i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST,
+ GFP_KERNEL);
if (i < 0)
return i;
@@ -213,7 +213,7 @@ int proc_alloc_inum(unsigned int *inum)
void proc_free_inum(unsigned int inum)
{
- ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+ ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
}
static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
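Note the interface difference hidden in this conversion: ida_simple_get() took an exclusive end bound, while ida_alloc_max() takes an inclusive maximum, which is why UINT_MAX - PROC_DYNAMIC_FIRST + 1 becomes UINT_MAX - PROC_DYNAMIC_FIRST. A sketch of the equivalence for a hypothetical bound N:

    /* Old API: allocates an id in [0, N] because the end is exclusive. */
    id = ida_simple_get(&my_ida, 0, N + 1, GFP_KERNEL);

    /* New API: the same range, expressed as an inclusive maximum. */
    id = ida_alloc_max(&my_ida, N, GFP_KERNEL);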
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b1c2c0b82116..9553e77c9d31 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -21,7 +21,7 @@
#define list_for_each_table_entry(entry, header) \
entry = header->ctl_table; \
- for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++)
+ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++)
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
@@ -476,12 +476,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
make_empty_dir_inode(inode);
}
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
if (root->set_ownership)
root->set_ownership(head, &inode->i_uid, &inode->i_gid);
- else {
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- }
return inode;
}
@@ -951,14 +949,14 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
char *new_name;
new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
- sizeof(struct ctl_table)*2 + namelen + 1,
+ sizeof(struct ctl_table) + namelen + 1,
GFP_KERNEL);
if (!new)
return NULL;
node = (struct ctl_node *)(new + 1);
table = (struct ctl_table *)(node + 1);
- new_name = (char *)(table + 2);
+ new_name = (char *)(table + 1);
memcpy(new_name, name, namelen);
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
@@ -1093,6 +1091,7 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
static int sysctl_check_table_array(const char *path, struct ctl_table *table)
{
+ unsigned int extra;
int err = 0;
if ((table->proc_handler == proc_douintvec) ||
@@ -1104,6 +1103,19 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
if (table->proc_handler == proc_dou8vec_minmax) {
if (table->maxlen != sizeof(u8))
err |= sysctl_err(path, table, "array not allowed");
+
+ if (table->extra1) {
+ extra = *(unsigned int *) table->extra1;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ if (table->extra2) {
+ extra = *(unsigned int *) table->extra2;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
}
if (table->proc_handler == proc_dobool) {
@@ -1119,6 +1131,8 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header)
struct ctl_table *entry;
int err = 0;
list_for_each_table_entry(entry, header) {
+ if (!entry->procname)
+ err |= sysctl_err(path, entry, "procname is null");
if ((entry->proc_handler == proc_dostring) ||
(entry->proc_handler == proc_dobool) ||
(entry->proc_handler == proc_dointvec) ||
@@ -1154,18 +1168,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
struct ctl_table_header *links;
struct ctl_node *node;
char *link_name;
- int nr_entries, name_bytes;
+ int name_bytes;
name_bytes = 0;
- nr_entries = 0;
list_for_each_table_entry(entry, head) {
- nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
links = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries +
- sizeof(struct ctl_table)*(nr_entries + 1) +
+ sizeof(struct ctl_node)*head->ctl_table_size +
+ sizeof(struct ctl_table)*head->ctl_table_size +
name_bytes,
GFP_KERNEL);
@@ -1173,8 +1185,8 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
return NULL;
node = (struct ctl_node *)(links + 1);
- link_table = (struct ctl_table *)(node + nr_entries);
- link_name = (char *)&link_table[nr_entries + 1];
+ link_table = (struct ctl_table *)(node + head->ctl_table_size);
+ link_name = (char *)(link_table + head->ctl_table_size);
link = link_table;
list_for_each_table_entry(entry, head) {
@@ -1188,7 +1200,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
}
init_header(links, dir->header.root, dir->header.set, node, link_table,
head->ctl_table_size);
- links->nreg = nr_entries;
+ links->nreg = head->ctl_table_size;
return links;
}
@@ -1300,28 +1312,23 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure without any child. This table
- * should not be free'd after registration. So it should not be
- * used on stack. It can either be a global or dynamically allocated
- * by the caller and free'd later after sysctl unregistration.
+ *
+ * @table: the top-level table structure. This table should not be free'd
+ * after registration. So it should not be used on stack. It can either
+ * be a global or dynamically allocated by the caller and free'd later
+ * after sysctl unregistration.
* @table_size : The number of elements in table
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * array.
*
* The members of the &struct ctl_table structure are used as follows:
- *
* procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
* enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file
- *
- * child - must be %NULL.
- *
+ * data - a pointer to data for use by proc_handler
+ * maxlen - the maximum size in bytes of the data
+ * mode - the file permissions for the /proc/sys file
+ * type - Defines the target type (described in struct definition)
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
@@ -1329,8 +1336,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
- * sysctl_check_table() verifies this.
+ * under /proc; non-leaf nodes are not allowed.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
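Both new_dir() and new_links() above use the single-allocation idiom: one kzalloc() sized for the header, the ctl_node array, the ctl_table array, and the name bytes, carved up afterwards by pointer arithmetic. With the sentinel entry gone, every array is sized by ctl_table_size instead of ctl_table_size + 1. The carving pattern in isolation, as a sketch with a hypothetical count n:

    /* One allocation holding four consecutive regions (sketch). */
    hdr   = kzalloc(sizeof(*hdr) +
                    sizeof(struct ctl_node) * n +
                    sizeof(struct ctl_table) * n +
                    name_bytes, GFP_KERNEL);
    node  = (struct ctl_node *)(hdr + 1);      /* just past the header */
    table = (struct ctl_table *)(node + n);    /* past the node array  */
    names = (char *)(table + n);               /* past the table array */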
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f8d35f993fe5..71e5039d940d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -707,6 +707,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
+#ifdef CONFIG_64BIT
+ [ilog2(VM_SEALED)] = "sl",
+#endif
};
size_t i;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0a808951b7d3..e133b507ddf3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -61,7 +61,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
return security_sb_show_options(m, sb);
}
-static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
+static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt)
{
static const struct proc_fs_opts mnt_opts[] = {
{ MNT_NOSUID, ",nosuid" },
@@ -124,7 +124,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
err = show_sb_opts(m, sb);
if (err)
goto out;
- show_mnt_opts(m, mnt);
+ show_vfsmnt_opts(m, mnt);
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt_path.dentry);
seq_puts(m, " 0 0\n");
@@ -153,7 +153,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
goto out;
seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
- show_mnt_opts(m, mnt);
+ show_vfsmnt_opts(m, mnt);
/* Tagged fields ("foo:X" or "bar") */
if (IS_MNT_SHARED(r))
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index de8cf5d75f34..65b2473e22ff 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -241,7 +241,7 @@ err:
/* get information of pstore/blk */
int pstore_blk_get_config(struct pstore_blk_config *info)
{
- strncpy(info->device, blkdev, 80);
+ strscpy(info->device, blkdev);
info->max_reason = max_reason;
info->kmsg_size = check_size(kmsg_size, 4096);
info->pmsg_size = check_size(pmsg_size, 4096);
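The two-argument strscpy() used above infers the destination size with sizeof() at compile time, so it is only valid when the destination is a true fixed-size array (as info->device is here); it also guarantees NUL-termination, unlike strncpy(). A sketch, with src as a hypothetical source string:

    char dev[80];

    strscpy(dev, src);              /* size deduced from the array type */
    strscpy(dev, src, sizeof(dev)); /* equivalent explicit form         */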
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 03425928d2fb..3497ede88aa0 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -761,4 +761,5 @@ static void __exit pstore_exit(void)
module_exit(pstore_exit)
MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_DESCRIPTION("Persistent Storage - platform driver interface");
MODULE_LICENSE("GPL");
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d79841e94428..e399e2dd3a12 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -430,5 +430,6 @@ static void __exit exit_qnx4_fs(void)
module_init(init_qnx4_fs)
module_exit(exit_qnx4_fs)
+MODULE_DESCRIPTION("QNX4 file system");
MODULE_LICENSE("GPL");
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index d62fbef838b6..4f1735b882b1 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -694,4 +694,5 @@ static void __exit exit_qnx6_fs(void)
module_init(init_qnx6_fs)
module_exit(exit_qnx6_fs)
+MODULE_DESCRIPTION("QNX6 file system");
MODULE_LICENSE("GPL");
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 627eb2f72ef3..a2b256dac36e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2246,9 +2246,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
int cnt;
struct quota_info *dqopt = sb_dqopt(sb);
- /* s_umount should be held in exclusive mode */
- if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
- up_read(&sb->s_umount);
+ rwsem_assert_held_write(&sb->s_umount);
/* Cannot turn off usage accounting without turning off limits, or
* suspend quotas and simultaneously turn quotas off. */
@@ -2510,9 +2508,7 @@ int dquot_resume(struct super_block *sb, int type)
int ret = 0, cnt;
unsigned int flags;
- /* s_umount should be held in exclusive mode */
- if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
- up_read(&sb->s_umount);
+ rwsem_assert_held_write(&sb->s_umount);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
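The quota change swaps an indirect check (a successful down_read_trylock() proving no writer holds s_umount) for a direct assertion. Besides reading more clearly, rwsem_assert_held_write() inspects the rwsem state without briefly acquiring and releasing the lock, so it cannot perturb other waiters. The idiom in isolation, as a sketch:

    /* Old style: infer "held for write" from a failed read-trylock. */
    if (WARN_ON_ONCE(down_read_trylock(&sem)))
            up_read(&sem);

    /* New style: assert the locking contract directly. */
    rwsem_assert_held_write(&sem);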
diff --git a/fs/read_write.c b/fs/read_write.c
index ef6339391351..90e283b31ca1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
- ret = kiocb_set_rw_flags(&kiocb, flags);
+ ret = kiocb_set_rw_flags(&kiocb, flags, type);
if (ret)
return ret;
kiocb.ki_pos = (ppos ? *ppos : 0);
@@ -1736,3 +1736,19 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0;
}
+
+bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
+{
+ size_t len = iov_iter_count(iter);
+
+ if (!iter_is_ubuf(iter))
+ return false;
+
+ if (!is_power_of_2(len))
+ return false;
+
+ if (!IS_ALIGNED(pos, len))
+ return false;
+
+ return true;
+}
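generic_atomic_write_valid() encodes the untorn-write contract: one contiguous user buffer, a power-of-two length, and a file offset naturally aligned to that length. A hedged userspace sketch using pwritev2() with RWF_ATOMIC (the flag this series routes through kiocb_set_rw_flags()); the fallback define is an assumption for older headers, "testfile" is a placeholder, and the file must live on a filesystem/device pair that advertises atomic writes:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/uio.h>
    #include <unistd.h>

    #ifndef RWF_ATOMIC
    #define RWF_ATOMIC 0x00000040 /* assumed uapi value */
    #endif

    int main(void)
    {
            static char buf[8192] __attribute__((aligned(8192)));
            struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
            int fd = open("testfile", O_WRONLY | O_DIRECT);

            if (fd < 0)
                    return 1;
            memset(buf, 0xab, sizeof(buf));

            /* len is a power of two and offset 0 is aligned to it. */
            if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
                    perror("pwritev2"); /* EINVAL if the checks fail */

            close(fd);
            return 0;
    }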
diff --git a/fs/readdir.c b/fs/readdir.c
index 278bc0254732..d6c82421902a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -22,8 +22,6 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
-#include <asm/unaligned.h>
-
/*
* Some filesystems were never converted to '->iterate_shared()'
* and their directory iterators want the inode lock held for
@@ -72,7 +70,7 @@ int wrap_directory_iterator(struct file *file,
EXPORT_SYMBOL(wrap_directory_iterator);
/*
- * Note the "unsafe_put_user() semantics: we goto a
+ * Note the "unsafe_put_user()" semantics: we goto a
* label for errors.
*/
#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c1daedc50f4c..9b43a81a6488 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2699,7 +2699,6 @@ fail:
}
bh = bh->b_this_page;
} while (bh != head);
- folio_set_error(folio);
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
folio_unlock(folio);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 2cbb92462074..68758b6fed94 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -101,19 +101,15 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
*/
static int romfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
loff_t offset, size;
unsigned long fillsize, pos;
void *buf;
int ret;
- buf = kmap(page);
- if (!buf)
- return -ENOMEM;
+ buf = kmap_local_folio(folio, 0);
- /* 32 bit warning -- but not for us :) */
- offset = page_offset(page);
+ offset = folio_pos(folio);
size = i_size_read(inode);
fillsize = 0;
ret = 0;
@@ -125,20 +121,14 @@ static int romfs_read_folio(struct file *file, struct folio *folio)
ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
if (ret < 0) {
- SetPageError(page);
fillsize = 0;
ret = -EIO;
}
}
- if (fillsize < PAGE_SIZE)
- memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
- if (ret == 0)
- SetPageUptodate(page);
-
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
+ buf = folio_zero_tail(folio, fillsize, buf + fillsize);
+ kunmap_local(buf);
+ folio_end_read(folio, ret == 0);
return ret;
}
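folio_zero_tail(folio, offset, kaddr) zeroes from offset to the end of the folio and expects kaddr to be the mapped address *at* that offset, which is why the converted call sites pass buf + fillsize (and vboxsf below passes buf + nread); it returns a mapping cookie suitable for kunmap_local() even when a highmem folio had to be remapped page by page. The shape of the pattern, as a sketch with fill_from_device() as a hypothetical helper:

    buf = kmap_local_folio(folio, 0);           /* map from byte 0 */
    err = fill_from_device(folio, buf, &nread); /* hypothetical    */

    /* Zero [nread, folio_size()); pass the mapped address at nread. */
    buf = folio_zero_tail(folio, nread, buf + nread);
    kunmap_local(buf);
    folio_end_read(folio, err == 0);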
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 4a5614442dbf..ec7b2da2477a 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -282,14 +282,10 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
- return ufd;
+ return PTR_ERR(file);
}
file->f_mode |= FMODE_NOWAIT;
- /*
- * When we call this, the initialization must be complete, since
- * anon_inode_getfd() will install the fd.
- */
fd_install(ufd, file);
} else {
struct fd f = fdget(ufd);
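The signalfd change restores the standard anon-inode error idiom: when file allocation fails, the reserved fd number (ufd) is a perfectly valid non-negative integer, so returning it handed userspace a dead descriptor; PTR_ERR() extracts the real errno from the ERR_PTR-encoded pointer. The idiom as a sketch, with "[myfd]" and my_fops as hypothetical names:

    file = anon_inode_getfile("[myfd]", &my_fops, ctx, O_RDWR);
    if (IS_ERR(file)) {
            put_unused_fd(ufd);     /* release the reserved slot    */
            kfree(ctx);
            return PTR_ERR(file);   /* the errno, never ufd itself  */
    }
    fd_install(ufd, file);          /* publish only once fully set up */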
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index a665aac9be9f..6397fdefd876 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -134,7 +134,7 @@ module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
module_param(enable_gcm_256, bool, 0644);
-MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0");
+MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/1");
module_param(require_gcm_256, bool, 0644);
MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
@@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode)
static void
cifs_evict_inode(struct inode *inode)
{
+ netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
if (inode->i_state & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 73482734a8d8..a865941724c0 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1494,6 +1494,8 @@ struct cifs_aio_ctx {
struct cifs_io_request {
struct netfs_io_request rreq;
struct cifsFileInfo *cfile;
+ struct TCP_Server_Info *server;
+ pid_t pid;
};
/* asynchronous read support */
@@ -1504,7 +1506,6 @@ struct cifs_io_subrequest {
struct cifs_io_request *req;
};
ssize_t got_bytes;
- pid_t pid;
unsigned int xid;
int result;
bool have_xid;
@@ -1917,8 +1918,8 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
-#define CIFSSEC_MAX (CIFSSEC_MUST_NTLMV2)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL)
+#define CIFSSEC_MAX (CIFSSEC_MAY_SIGN | CIFSSEC_MUST_KRB5 | CIFSSEC_MAY_SEAL)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
/*
*****************************************************************
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index c46d418c1c0c..a2072ab9e586 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -2574,7 +2574,7 @@ typedef struct {
struct win_dev {
- unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/
+ unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO or LnxSOCK */
__le64 major;
__le64 minor;
} __attribute__((packed));
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 25e9ab947c17..595c4b673707 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1345,8 +1345,8 @@ cifs_async_readv(struct cifs_io_subrequest *rdata)
if (rc)
return rc;
- smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
- smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
+ smb->hdr.Pid = cpu_to_le16((__u16)rdata->req->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->req->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
smb->Fid = rdata->req->cfile->fid.netfid;
@@ -1689,8 +1689,8 @@ cifs_async_writev(struct cifs_io_subrequest *wdata)
if (rc)
goto async_writev_out;
- smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid);
- smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
+ smb->hdr.Pid = cpu_to_le16((__u16)wdata->req->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->req->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
smb->Fid = wdata->req->cfile->fid.netfid;
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9d5c2440abfc..1374635e89fa 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -134,17 +134,15 @@ fail:
static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
- struct TCP_Server_Info *server;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+ struct TCP_Server_Info *server = req->server;
struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
size_t rsize = 0;
int rc;
rdata->xid = get_xid();
rdata->have_xid = true;
-
- server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
rdata->server = server;
if (cifs_sb->ctx->rsize == 0)
@@ -179,15 +177,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
- struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
- pid_t pid;
int rc = 0;
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = req->cfile->pid;
- else
- pid = current->tgid; // Ummm... This may be a workqueue
-
cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
__func__, rreq->debug_id, subreq->debug_index, rreq->mapping,
subreq->transferred, subreq->len);
@@ -201,16 +192,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
}
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- rdata->pid = pid;
-
- rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len);
- if (!rc) {
- if (rdata->req->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = rdata->server->ops->async_readv(rdata);
- }
+ rc = rdata->server->ops->async_readv(rdata);
out:
if (rc)
netfs_subreq_terminated(subreq, rc, false);
@@ -245,11 +228,15 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
rreq->rsize = cifs_sb->ctx->rsize;
rreq->wsize = cifs_sb->ctx->wsize;
+ req->pid = current->tgid; // Ummm... This may be a workqueue
if (file) {
open_file = file->private_data;
rreq->netfs_priv = file->private_data;
req->cfile = cifsFileInfo_get(open_file);
+ req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ req->pid = req->cfile->pid;
} else if (rreq->origin != NETFS_WRITEBACK) {
WARN_ON_ONCE(1);
return -EIO;
@@ -259,35 +246,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
}
/*
- * Expand the size of a readahead to the size of the rsize, if at least as
- * large as a page, allowing for the possibility that rsize is not pow-2
- * aligned.
- */
-static void cifs_expand_readahead(struct netfs_io_request *rreq)
-{
- unsigned int rsize = rreq->rsize;
- loff_t misalignment, i_size = i_size_read(rreq->inode);
-
- if (rsize < PAGE_SIZE)
- return;
-
- if (rsize < INT_MAX)
- rsize = roundup_pow_of_two(rsize);
- else
- rsize = ((unsigned int)INT_MAX + 1) / 2;
-
- misalignment = rreq->start & (rsize - 1);
- if (misalignment) {
- rreq->start -= misalignment;
- rreq->len += misalignment;
- }
-
- rreq->len = round_up(rreq->len, rsize);
- if (rreq->start < i_size && rreq->len > i_size - rreq->start)
- rreq->len = i_size - rreq->start;
-}
-
-/*
* Completion of a request operation.
*/
static void cifs_rreq_done(struct netfs_io_request *rreq)
@@ -342,7 +300,6 @@ const struct netfs_request_ops cifs_req_ops = {
.init_request = cifs_init_request,
.free_request = cifs_free_request,
.free_subrequest = cifs_free_subrequest,
- .expand_readahead = cifs_expand_readahead,
.clamp_length = cifs_clamp_length,
.issue_read = cifs_req_issue_read,
.done = cifs_rreq_done,
@@ -3200,8 +3157,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = netfs_unbuffered_read_iter_locked(iocb, iter);
else
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 3bbac925d076..bc926ab2555b 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -128,12 +128,14 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("compress", Opt_compress),
fsparam_flag("witness", Opt_witness),
+ /* Mount options which take uid or gid */
+ fsparam_uid("backupuid", Opt_backupuid),
+ fsparam_gid("backupgid", Opt_backupgid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_uid("cruid", Opt_cruid),
+ fsparam_gid("gid", Opt_gid),
+
/* Mount options which take numeric value */
- fsparam_u32("backupuid", Opt_backupuid),
- fsparam_u32("backupgid", Opt_backupgid),
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("cruid", Opt_cruid),
- fsparam_u32("gid", Opt_gid),
fsparam_u32("file_mode", Opt_file_mode),
fsparam_u32("dirmode", Opt_dirmode),
fsparam_u32("dir_mode", Opt_dirmode),
@@ -951,8 +953,6 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
int i, opt;
bool is_smb3 = !strcmp(fc->fs_type->name, "smb3");
bool skip_parsing = false;
- kuid_t uid;
- kgid_t gid;
cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key);
@@ -1083,38 +1083,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
break;
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- goto cifs_parse_mount_err;
- ctx->linux_uid = uid;
+ ctx->linux_uid = result.uid;
ctx->uid_specified = true;
break;
case Opt_cruid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- goto cifs_parse_mount_err;
- ctx->cred_uid = uid;
+ ctx->cred_uid = result.uid;
ctx->cruid_specified = true;
break;
case Opt_backupuid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- goto cifs_parse_mount_err;
- ctx->backupuid = uid;
+ ctx->backupuid = result.uid;
ctx->backupuid_specified = true;
break;
case Opt_backupgid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- goto cifs_parse_mount_err;
- ctx->backupgid = gid;
+ ctx->backupgid = result.gid;
ctx->backupgid_specified = true;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- goto cifs_parse_mount_err;
- ctx->linux_gid = gid;
+ ctx->linux_gid = result.gid;
ctx->gid_specified = true;
break;
case Opt_port:
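fsparam_uid()/fsparam_gid() push the make_kuid()/make_kgid() translation and validity check into fs_parse() itself, which rejects unmappable IDs before the handler runs; that is why every converted case collapses to one assignment from result.uid or result.gid. A minimal handler sketch with hypothetical option and context names:

    static const struct fs_parameter_spec myfs_params[] = {
            fsparam_uid("uid", Opt_uid),  /* mapped + validated by fs_parse() */
            fsparam_gid("gid", Opt_gid),
            {}
    };

    static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
    {
            struct myfs_context *ctx = fc->fs_private;   /* hypothetical */
            struct fs_parse_result result;
            int opt = fs_parse(fc, myfs_params, param, &result);

            if (opt < 0)
                    return opt;
            switch (opt) {
            case Opt_uid:
                    ctx->uid = result.uid;  /* already a valid kuid_t */
                    break;
            case Opt_gid:
                    ctx->gid = result.gid;  /* already a valid kgid_t */
                    break;
            }
            return 0;
    }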
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 262576573eb5..4a8aa1de9522 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -606,6 +606,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
fattr->cf_rdev = MKDEV(mjr, mnr);
}
+ } else if (memcmp("LnxSOCK", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "Socket\n");
+ fattr->cf_mode |= S_IFSOCK;
+ fattr->cf_dtype = DT_SOCK;
} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
cifs_dbg(FYI, "Symlink\n");
fattr->cf_mode |= S_IFLNK;
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 4ce6c3121a7e..c8e536540895 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -4997,6 +4997,9 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
pdev.major = cpu_to_le64(MAJOR(dev));
pdev.minor = cpu_to_le64(MINOR(dev));
break;
+ case S_IFSOCK:
+ strscpy(pdev.type, "LnxSOCK");
+ break;
case S_IFIFO:
strscpy(pdev.type, "LnxFIFO");
break;
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 993ac36c3d58..2ae2dbb6202b 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4484,6 +4484,16 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
return rc;
}
+static void smb2_readv_worker(struct work_struct *work)
+{
+ struct cifs_io_subrequest *rdata =
+ container_of(work, struct cifs_io_subrequest, subreq.work);
+
+ netfs_subreq_terminated(&rdata->subreq,
+ (rdata->result == 0 || rdata->result == -EAGAIN) ?
+ rdata->got_bytes : rdata->result, true);
+}
+
static void
smb2_readv_callback(struct mid_q_entry *mid)
{
@@ -4577,12 +4587,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->subreq.start < rdata->subreq.rreq->i_size)
rdata->result = 0;
}
- if (rdata->result == 0 || rdata->result == -EAGAIN)
- iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
rdata->credits.value = 0;
- netfs_subreq_terminated(&rdata->subreq,
- (rdata->result == 0 || rdata->result == -EAGAIN) ?
- rdata->got_bytes : rdata->result, true);
+ INIT_WORK(&rdata->subreq.work, smb2_readv_worker);
+ queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -4614,7 +4621,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
io_parms.length = rdata->subreq.len;
io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid;
io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid;
- io_parms.pid = rdata->pid;
+ io_parms.pid = rdata->req->pid;
rc = smb2_new_read_req(
(void **) &buf, &total_len, &io_parms, rdata, 0, 0);
@@ -4789,7 +4796,6 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -ENOSPC;
else
wdata->subreq.len = written;
- iov_iter_advance(&wdata->subreq.io_iter, written);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
@@ -4867,7 +4873,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
.length = wdata->subreq.len,
.persistent_fid = wdata->req->cfile->fid.persistent_fid,
.volatile_fid = wdata->req->cfile->fid.volatile_fid,
- .pid = wdata->pid,
+ .pid = wdata->req->pid,
};
io_parms = &_io_parms;
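smb2_readv_callback() now defers netfs_subreq_terminated() to cifsiod_wq instead of calling it inline: the callback runs from mid-queue processing, where the completion path (which can retry, take mutexes, and re-enter the transport) is unsafe to run directly. The general hand-off pattern, sketched entirely with hypothetical names:

    struct my_req {
            struct work_struct work;
            int result;
    };

    static void my_req_complete(struct work_struct *work)
    {
            /* Recover the request that embeds this work item. */
            struct my_req *req = container_of(work, struct my_req, work);

            finish_request(req);    /* hypothetical; may sleep safely here */
    }

    /* From the atomic completion context: queue, don't complete inline. */
    INIT_WORK(&req->work, my_req_complete);
    queue_work(system_wq, &req->work);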
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 02135a605305..1476c445cadc 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -216,8 +216,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
}
tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid);
if (!tcon) {
- cifs_put_smb_ses(ses);
spin_unlock(&cifs_tcp_ses_lock);
+ cifs_put_smb_ses(ses);
return NULL;
}
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c
index 043e4cb839fa..df360ca47826 100644
--- a/fs/smb/common/cifs_arc4.c
+++ b/fs/smb/common/cifs_arc4.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include "arc4.h"
+MODULE_DESCRIPTION("ARC4 Cipher Algorithm");
MODULE_LICENSE("GPL");
int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
diff --git a/fs/smb/common/cifs_md4.c b/fs/smb/common/cifs_md4.c
index 50f78cfc6ce9..7ee7f4dad90c 100644
--- a/fs/smb/common/cifs_md4.c
+++ b/fs/smb/common/cifs_md4.c
@@ -24,6 +24,7 @@
#include <asm/byteorder.h>
#include "md4.h"
+MODULE_DESCRIPTION("MD4 Message Digest Algorithm (RFC1320)");
MODULE_LICENSE("GPL");
static inline u32 lshift(u32 x, unsigned int s)
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 8d10be1fe18a..c3ee42188d25 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -917,6 +917,40 @@ struct smb2_query_directory_rsp {
__u8 Buffer[];
} __packed;
+/* DeviceType Flags */
+#define FILE_DEVICE_CD_ROM 0x00000002
+#define FILE_DEVICE_CD_ROM_FILE_SYSTEM 0x00000003
+#define FILE_DEVICE_DFS 0x00000006
+#define FILE_DEVICE_DISK 0x00000007
+#define FILE_DEVICE_DISK_FILE_SYSTEM 0x00000008
+#define FILE_DEVICE_FILE_SYSTEM 0x00000009
+#define FILE_DEVICE_NAMED_PIPE 0x00000011
+#define FILE_DEVICE_NETWORK 0x00000012
+#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014
+#define FILE_DEVICE_NULL 0x00000015
+#define FILE_DEVICE_PARALLEL_PORT 0x00000016
+#define FILE_DEVICE_PRINTER 0x00000018
+#define FILE_DEVICE_SERIAL_PORT 0x0000001b
+#define FILE_DEVICE_STREAMS 0x0000001e
+#define FILE_DEVICE_TAPE 0x0000001f
+#define FILE_DEVICE_TAPE_FILE_SYSTEM 0x00000020
+#define FILE_DEVICE_VIRTUAL_DISK 0x00000024
+#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028
+
+/* Device Characteristics */
+#define FILE_REMOVABLE_MEDIA 0x00000001
+#define FILE_READ_ONLY_DEVICE 0x00000002
+#define FILE_FLOPPY_DISKETTE 0x00000004
+#define FILE_WRITE_ONCE_MEDIA 0x00000008
+#define FILE_REMOTE_DEVICE 0x00000010
+#define FILE_DEVICE_IS_MOUNTED 0x00000020
+#define FILE_VIRTUAL_VOLUME 0x00000040
+#define FILE_DEVICE_SECURE_OPEN 0x00000100
+#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000
+#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000
+#define FILE_PORTABLE_DEVICE 0x00004000
+#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000
+
/*
* Maximum number of iovs we need for a set-info request.
* The largest one is rename/hardlink
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index b6c5a8ea3887..840c71c66b30 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -630,6 +630,12 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)
return name;
}
+ if (*name == '\\') {
+ pr_err("not allow directory name included leading slash\n");
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
+
ksmbd_conv_path_to_unix(name);
ksmbd_strip_last_slash(name);
return name;
@@ -2045,15 +2051,22 @@ out_err1:
* @access: file access flags
* @disposition: file disposition flags
* @may_flags: set with MAY_ flags
+ * @is_dir: is creating open flags for directory
*
* Return: file open flags
*/
static int smb2_create_open_flags(bool file_present, __le32 access,
__le32 disposition,
- int *may_flags)
+ int *may_flags,
+ bool is_dir)
{
int oflags = O_NONBLOCK | O_LARGEFILE;
+ if (is_dir) {
+ access &= ~FILE_WRITE_DESIRE_ACCESS_LE;
+ ksmbd_debug(SMB, "Discard write access to a directory\n");
+ }
+
if (access & FILE_READ_DESIRED_ACCESS_LE &&
access & FILE_WRITE_DESIRE_ACCESS_LE) {
oflags |= O_RDWR;
@@ -2361,7 +2374,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
if (rc > 0) {
rc = ksmbd_vfs_remove_xattr(idmap,
path,
- attr_name);
+ attr_name,
+ get_write);
if (rc < 0) {
ksmbd_debug(SMB,
@@ -2376,7 +2390,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
} else {
rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value,
le16_to_cpu(eabuf->EaValueLength),
- 0, true);
+ 0, get_write);
if (rc < 0) {
ksmbd_debug(SMB,
"ksmbd_vfs_setxattr is failed(%d)\n",
@@ -2468,7 +2482,7 @@ static int smb2_remove_smb_xattrs(const struct path *path)
!strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX,
STREAM_PREFIX_LEN)) {
err = ksmbd_vfs_remove_xattr(idmap, path,
- name);
+ name, true);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n",
name);
@@ -2842,20 +2856,11 @@ int smb2_open(struct ksmbd_work *work)
}
if (req->NameLength) {
- if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
- *(char *)req->Buffer == '\\') {
- pr_err("not allow directory name included leading slash\n");
- rc = -EINVAL;
- goto err_out2;
- }
-
name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
le16_to_cpu(req->NameLength),
work->conn->local_nls);
if (IS_ERR(name)) {
rc = PTR_ERR(name);
- if (rc != -ENOMEM)
- rc = -ENOENT;
name = NULL;
goto err_out2;
}
@@ -3169,7 +3174,9 @@ int smb2_open(struct ksmbd_work *work)
open_flags = smb2_create_open_flags(file_present, daccess,
req->CreateDisposition,
- &may_flags);
+ &may_flags,
+ req->CreateOptions & FILE_DIRECTORY_FILE_LE ||
+ (file_present && S_ISDIR(d_inode(path.dentry)->i_mode)));
if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
if (open_flags & (O_CREAT | O_TRUNC)) {
@@ -5316,8 +5323,13 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info = (struct filesystem_device_info *)rsp->Buffer;
- info->DeviceType = cpu_to_le32(stfs.f_type);
- info->DeviceCharacteristics = cpu_to_le32(0x00000020);
+ info->DeviceType = cpu_to_le32(FILE_DEVICE_DISK);
+ info->DeviceCharacteristics =
+ cpu_to_le32(FILE_DEVICE_IS_MOUNTED);
+ if (!test_tree_conn_flag(work->tcon,
+ KSMBD_TREE_CONN_FLAG_WRITABLE))
+ info->DeviceCharacteristics |=
+ cpu_to_le32(FILE_READ_ONLY_DEVICE);
rsp->OutputBufferLength = cpu_to_le32(8);
break;
}
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 51b1b0bed616..9e859ba010cf 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -1058,16 +1058,21 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
}
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- const struct path *path, char *attr_name)
+ const struct path *path, char *attr_name,
+ bool get_write)
{
int err;
- err = mnt_want_write(path->mnt);
- if (err)
- return err;
+ if (get_write) {
+ err = mnt_want_write(path->mnt);
+ if (err)
+ return err;
+ }
err = vfs_removexattr(idmap, path->dentry, attr_name);
- mnt_drop_write(path->mnt);
+
+ if (get_write)
+ mnt_drop_write(path->mnt);
return err;
}
@@ -1380,7 +1385,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path)
ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) {
- err = ksmbd_vfs_remove_xattr(idmap, path, name);
+ err = ksmbd_vfs_remove_xattr(idmap, path, name, true);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
}
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index cfe1c8092f23..cb76f4b5bafe 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -114,7 +114,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- const struct path *path, char *attr_name);
+ const struct path *path, char *attr_name,
+ bool get_write);
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
unsigned int flags, struct path *parent_path,
struct path *path, bool caseless);
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 6cb599cd287e..8b2e37c8716e 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -254,7 +254,8 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
ci->m_flags &= ~S_DEL_ON_CLS_STREAM;
err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp),
&filp->f_path,
- fp->stream.name);
+ fp->stream.name,
+ true);
if (err)
pr_err("remove xattr failed : %s\n",
fp->stream.name);
diff --git a/fs/stat.c b/fs/stat.c
index 70bd3e888cfa..89ce1be56310 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -90,6 +90,37 @@ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat)
EXPORT_SYMBOL(generic_fill_statx_attr);
/**
+ * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes
+ * @stat: Where to fill in the attribute flags
+ * @unit_min: Minimum supported atomic write length in bytes
+ * @unit_max: Maximum supported atomic write length in bytes
+ *
+ * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
+ * atomic write unit_min and unit_max values.
+ */
+void generic_fill_statx_atomic_writes(struct kstat *stat,
+ unsigned int unit_min,
+ unsigned int unit_max)
+{
+ /* Confirm that the request type is known */
+ stat->result_mask |= STATX_WRITE_ATOMIC;
+
+ /* Confirm that the file attribute type is known */
+ stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC;
+
+ if (unit_min) {
+ stat->atomic_write_unit_min = unit_min;
+ stat->atomic_write_unit_max = unit_max;
+ /* Initially only allow 1x segment */
+ stat->atomic_write_segments_max = 1;
+
+ /* Confirm atomic writes are actually supported */
+ stat->attributes |= STATX_ATTR_WRITE_ATOMIC;
+ }
+}
+EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes);
+
+/**
* vfs_getattr_nosec - getattr without security checks
* @path: file to get attributes from
* @stat: structure to return attributes in
@@ -214,6 +245,43 @@ int getname_statx_lookup_flags(int flags)
return lookup_flags;
}
+static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
+ u32 request_mask)
+{
+ int error = vfs_getattr(path, stat, request_mask, flags);
+
+ if (request_mask & STATX_MNT_ID_UNIQUE) {
+ stat->mnt_id = real_mount(path->mnt)->mnt_id_unique;
+ stat->result_mask |= STATX_MNT_ID_UNIQUE;
+ } else {
+ stat->mnt_id = real_mount(path->mnt)->mnt_id;
+ stat->result_mask |= STATX_MNT_ID;
+ }
+
+ if (path_mounted(path))
+ stat->attributes |= STATX_ATTR_MOUNT_ROOT;
+ stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+ /*
+ * If this is a block device inode, override the filesystem
+ * attributes with the block device specific parameters that need to be
+ * obtained from the bdev backing inode.
+ */
+ if (S_ISBLK(stat->mode))
+ bdev_statx(path, stat, request_mask);
+
+ return error;
+}
+
+static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
+ u32 request_mask)
+{
+ CLASS(fd_raw, f)(fd);
+ if (!f.file)
+ return -EBADF;
+ return vfs_statx_path(&f.file->f_path, flags, stat, request_mask);
+}
+
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@@ -243,36 +311,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
retry:
error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- goto out;
-
- error = vfs_getattr(&path, stat, request_mask, flags);
-
- if (request_mask & STATX_MNT_ID_UNIQUE) {
- stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
- stat->result_mask |= STATX_MNT_ID_UNIQUE;
- } else {
- stat->mnt_id = real_mount(path.mnt)->mnt_id;
- stat->result_mask |= STATX_MNT_ID;
- }
-
- if (path.mnt->mnt_root == path.dentry)
- stat->attributes |= STATX_ATTR_MOUNT_ROOT;
- stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
-
- /* Handle STATX_DIOALIGN for block devices. */
- if (request_mask & STATX_DIOALIGN) {
- struct inode *inode = d_backing_inode(path.dentry);
-
- if (S_ISBLK(inode->i_mode))
- bdev_statx_dioalign(inode, stat);
- }
-
+ return error;
+ error = vfs_statx_path(&path, flags, stat, request_mask);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-out:
return error;
}
@@ -289,18 +334,10 @@ int vfs_fstatat(int dfd, const char __user *filename,
* If AT_EMPTY_PATH is set, we expect the common case to be that
* empty path, and avoid doing all the extra pathname work.
*/
- if (dfd >= 0 && flags == AT_EMPTY_PATH) {
- char c;
-
- ret = get_user(c, filename);
- if (unlikely(ret))
- return ret;
+ if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ return vfs_fstat(dfd, stat);
- if (likely(!c))
- return vfs_fstat(dfd, stat);
- }
-
- name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
+ name = getname_flags(filename, getname_statx_lookup_flags(statx_flags));
ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
putname(name);
@@ -488,34 +525,39 @@ static int do_readlinkat(int dfd, const char __user *pathname,
char __user *buf, int bufsiz)
{
struct path path;
+ struct filename *name;
int error;
- int empty = 0;
unsigned int lookup_flags = LOOKUP_EMPTY;
if (bufsiz <= 0)
return -EINVAL;
retry:
- error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
- if (!error) {
- struct inode *inode = d_backing_inode(path.dentry);
-
- error = empty ? -ENOENT : -EINVAL;
- /*
- * AFS mountpoints allow readlink(2) but are not symlinks
- */
- if (d_is_symlink(path.dentry) || inode->i_op->readlink) {
- error = security_inode_readlink(path.dentry);
- if (!error) {
- touch_atime(&path);
- error = vfs_readlink(path.dentry, buf, bufsiz);
- }
- }
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
+ name = getname_flags(pathname, lookup_flags);
+ error = filename_lookup(dfd, name, lookup_flags, &path, NULL);
+ if (unlikely(error)) {
+ putname(name);
+ return error;
+ }
+
+ /*
+ * AFS mountpoints allow readlink(2) but are not symlinks
+ */
+ if (d_is_symlink(path.dentry) ||
+ d_backing_inode(path.dentry)->i_op->readlink) {
+ error = security_inode_readlink(path.dentry);
+ if (!error) {
+ touch_atime(&path);
+ error = vfs_readlink(path.dentry, buf, bufsiz);
}
+ } else {
+ error = (name->name[0] == '\0') ? -ENOENT : -EINVAL;
+ }
+ path_put(&path);
+ putname(name);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
}
return error;
}
@@ -659,6 +701,9 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dio_mem_align = stat->dio_mem_align;
tmp.stx_dio_offset_align = stat->dio_offset_align;
tmp.stx_subvol = stat->subvol;
+ tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
+ tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
+ tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
@@ -674,7 +719,8 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
- /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
+ /*
+ * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
* from userland.
*/
mask &= ~STATX_CHANGE_COOKIE;
@@ -686,16 +732,41 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
return cp_statx(&stat, buffer);
}
+int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
+ struct statx __user *buffer)
+{
+ struct kstat stat;
+ int error;
+
+ if (mask & STATX__RESERVED)
+ return -EINVAL;
+ if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
+ return -EINVAL;
+
+ /*
+ * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
+ * from userland.
+ */
+ mask &= ~STATX_CHANGE_COOKIE;
+
+ error = vfs_statx_fd(fd, flags, &stat, mask);
+ if (error)
+ return error;
+
+ return cp_statx(&stat, buffer);
+}
+
/**
* sys_statx - System call to get enhanced stats
* @dfd: Base directory to pathwalk from *or* fd to stat.
- * @filename: File to stat or "" with AT_EMPTY_PATH
+ * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH
* @flags: AT_* flags to control pathwalk.
* @mask: Parts of statx struct actually required.
* @buffer: Result buffer.
*
* Note that fstat() can be emulated by setting dfd to the fd of interest,
- * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
+ * supplying "" (or preferably NULL) as the filename and setting AT_EMPTY_PATH
+ * in the flags.
*/
SYSCALL_DEFINE5(statx,
int, dfd, const char __user *, filename, unsigned, flags,
@@ -703,9 +774,24 @@ SYSCALL_DEFINE5(statx,
struct statx __user *, buffer)
{
int ret;
+ unsigned lflags;
struct filename *name;
- name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
+ /*
+ * Short-circuit handling of NULL and "" paths.
+ *
+ * For a NULL path we require and accept only the AT_EMPTY_PATH flag
+ * (possibly |'d with AT_STATX flags).
+ *
+ * However, glibc on 32-bit architectures implements fstatat as statx
+ * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags.
+ * Supporting this results in the uglification below.
+ */
+ lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE);
+ if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer);
+
+ name = getname_flags(filename, getname_statx_lookup_flags(flags));
ret = do_statx(dfd, name, flags, mask, buffer);
putname(name);
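With stx_atomic_write_unit_min/max and stx_atomic_write_segments_max now copied out by cp_statx(), userspace can discover the atomic write geometry before attempting RWF_ATOMIC I/O. A hedged sketch; the fallback defines are assumed uapi values, "testfile" is a placeholder, and building it requires headers that already carry the new struct statx fields:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    #ifndef STATX_WRITE_ATOMIC
    #define STATX_WRITE_ATOMIC      0x00010000U /* assumed uapi value */
    #define STATX_ATTR_WRITE_ATOMIC 0x00400000  /* assumed uapi value */
    #endif

    int main(void)
    {
            struct statx stx;

            if (statx(AT_FDCWD, "testfile", 0, STATX_WRITE_ATOMIC, &stx))
                    return 1;

            if (stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)
                    printf("atomic: unit_min=%u unit_max=%u segments_max=%u\n",
                           stx.stx_atomic_write_unit_min,
                           stx.stx_atomic_write_unit_max,
                           stx.stx_atomic_write_segments_max);
            return 0;
    }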
diff --git a/fs/super.c b/fs/super.c
index b72f1d288e95..095ba793e10c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1502,8 +1502,17 @@ static int fs_bdev_thaw(struct block_device *bdev)
lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+ /*
+ * The block device may have been frozen before it was claimed by a
+ * filesystem. Concurrently another process might try to mount that
+ * frozen block device and has temporarily claimed the block device for
+ * that purpose causing a concurrent fs_bdev_thaw() to end up here. The
+ * mounter is already about to abort mounting because they still saw an
+ * elevated bdev->bd_fsfreeze_count so get_bdev_super() will return
+ * NULL in that case.
+ */
sb = get_bdev_super(bdev);
- if (WARN_ON_ONCE(!sb))
+ if (!sb)
return -EINVAL;
if (sb->s_op->thaw_super)
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 3365a30dc1e0..5c0d07ddbda2 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -591,4 +591,5 @@ static void __exit exit_sysv_fs(void)
module_init(init_sysv_fs)
module_exit(exit_sysv_fs)
+MODULE_DESCRIPTION("SystemV Filesystem");
MODULE_LICENSE("GPL");
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 7c29f4afc23d..1028ab6d9a74 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -296,9 +296,9 @@ enum {
};
static const struct fs_parameter_spec tracefs_param_specs[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_u32oct ("mode", Opt_mode),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -306,8 +306,6 @@ static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param
{
struct tracefs_fs_info *opts = fc->s_fs_info;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, tracefs_param_specs, param, &result);
@@ -316,16 +314,10 @@ static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param
switch (opt) {
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return invalf(fc, "Unknown uid");
- opts->uid = uid;
+ opts->uid = result.uid;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return invalf(fc, "Unknown gid");
- opts->gid = gid;
+ opts->gid = result.gid;
break;
case Opt_mode:
opts->mode = result.uint_32 & S_IALLUGO;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 27c85d92d1dc..61f25d3cf3f7 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -188,7 +188,6 @@ Eend:
"offset=%lu",
dir->i_ino, (page->index<<PAGE_SHIFT)+offs);
fail:
- SetPageError(page);
return false;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index eee7320ab0b0..17e409ceaa33 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2057,7 +2057,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
goto out;
features = uffdio_api.features;
ret = -EINVAL;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ if (uffdio_api.api != UFFD_API)
goto err_out;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
@@ -2081,6 +2081,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
+
+ ret = -EINVAL;
+ if (features & ~uffdio_api.features)
+ goto err_out;
+
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
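The userfaultfd change converts UFFDIO_API from rejecting unknown feature bits up front to masking the request down to what this kernel supports, then failing only if the caller asked for a feature outside the final set; requesting, say, UFFD_FEATURE_WP_ASYNC on a kernel without PTE-marker support now yields EINVAL instead of silently clearing the bit. Typical negotiation from userspace, as a hedged sketch:

    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct uffdio_api api = {
                    .api = UFFD_API,
                    .features = 0,  /* request nothing; read back support */
            };
            int fd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

            if (fd < 0)
                    return 1;

            /* EINVAL now also means: a requested feature is unsupported. */
            if (ioctl(fd, UFFDIO_API, &api))
                    perror("UFFDIO_API");
            else
                    printf("supported features: 0x%llx\n",
                           (unsigned long long)api.features);
            close(fd);
            return 0;
    }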
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 118dedef8ebe..fdb4da24d662 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -228,26 +228,19 @@ const struct inode_operations vboxsf_reg_iops = {
static int vboxsf_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
struct vboxsf_handle *sf_handle = file->private_data;
- loff_t off = page_offset(page);
+ loff_t off = folio_pos(folio);
u32 nread = PAGE_SIZE;
u8 *buf;
int err;
- buf = kmap(page);
+ buf = kmap_local_folio(folio, 0);
err = vboxsf_read(sf_handle->root, sf_handle->handle, off, &nread, buf);
- if (err == 0) {
- memset(&buf[nread], 0, PAGE_SIZE - nread);
- flush_dcache_page(page);
- SetPageUptodate(page);
- } else {
- SetPageError(page);
- }
+ buf = folio_zero_tail(folio, nread, buf + nread);
- kunmap(page);
- unlock_page(page);
+ kunmap_local(buf);
+ folio_end_read(folio, err == 0);
return err;
}
@@ -295,7 +288,6 @@ static int vboxsf_writepage(struct page *page, struct writeback_control *wbc)
kref_put(&sf_handle->refcount, vboxsf_handle_release);
if (err == 0) {
- ClearPageError(page);
/* mtime changed */
sf_i->force_restat = 1;
} else {
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index ffb1d565da39..e95b8a48d8a0 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -41,8 +41,8 @@ enum { opt_nls, opt_uid, opt_gid, opt_ttl, opt_dmode, opt_fmode,
static const struct fs_parameter_spec vboxsf_fs_parameters[] = {
fsparam_string ("nls", opt_nls),
- fsparam_u32 ("uid", opt_uid),
- fsparam_u32 ("gid", opt_gid),
+ fsparam_uid ("uid", opt_uid),
+ fsparam_gid ("gid", opt_gid),
fsparam_u32 ("ttl", opt_ttl),
fsparam_u32oct ("dmode", opt_dmode),
fsparam_u32oct ("fmode", opt_fmode),
@@ -55,8 +55,6 @@ static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct vboxsf_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, vboxsf_fs_parameters, param, &result);
@@ -73,16 +71,10 @@ static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param)
param->string = NULL;
break;
case opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return -EINVAL;
- ctx->o.uid = uid;
+ ctx->o.uid = result.uid;
break;
case opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return -EINVAL;
- ctx->o.gid = gid;
+ ctx->o.gid = result.gid;
break;
case opt_ttl:
ctx->o.ttl = msecs_to_jiffies(result.uint_32);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 6cb8b2ddc541..6c55a6e88eba 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1008,13 +1008,12 @@ xfs_alloc_cur_finish(
struct xfs_alloc_arg *args,
struct xfs_alloc_cur *acur)
{
- struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
int error;
ASSERT(acur->cnt && acur->bnolt);
ASSERT(acur->bno >= acur->rec_bno);
ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
- ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
+ ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len));
error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
acur->rec_len, acur->bno, acur->len, 0);
@@ -1217,7 +1216,6 @@ STATIC int /* error */
xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
- struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
int error;
@@ -1297,7 +1295,7 @@ xfs_alloc_ag_vextent_exact(
*/
cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
args->pag);
- ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
+ ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
if (error) {
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 430cd3244c14..f30bcc64100d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -329,26 +329,20 @@ xfs_attr_calc_size(
return nblks;
}
-/* Initialize transaction reservation for attr operations */
-void
-xfs_init_attr_trans(
- struct xfs_da_args *args,
- struct xfs_trans_res *tres,
- unsigned int *total)
+/* Initialize transaction reservation for an xattr set/replace/upsert */
+inline struct xfs_trans_res
+xfs_attr_set_resv(
+ const struct xfs_da_args *args)
{
- struct xfs_mount *mp = args->dp->i_mount;
-
- if (args->value) {
- tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres *
- args->total;
- tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
- *total = args->total;
- } else {
- *tres = M_RES(mp)->tr_attrrm;
- *total = XFS_ATTRRM_SPACE_RES(mp);
- }
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_trans_res ret = {
+ .tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * args->total,
+ .tr_logcount = XFS_ATTRSET_LOG_COUNT,
+ .tr_logflags = XFS_TRANS_PERM_LOG_RES,
+ };
+
+ return ret;
}
/*
@@ -1006,7 +1000,7 @@ xfs_attr_set(
struct xfs_trans_res tres;
int error, local;
int rmt_blks = 0;
- unsigned int total;
+ unsigned int total = 0;
ASSERT(!args->trans);
@@ -1033,10 +1027,15 @@ xfs_attr_set(
if (!local)
rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+
+ tres = xfs_attr_set_resv(args);
+ total = args->total;
break;
case XFS_ATTRUPDATE_REMOVE:
XFS_STATS_INC(mp, xs_attr_remove);
rmt_blks = xfs_attr3_max_rmt_blocks(mp);
+ tres = M_RES(mp)->tr_attrrm;
+ total = XFS_ATTRRM_SPACE_RES(mp);
break;
}
@@ -1044,7 +1043,6 @@ xfs_attr_set(
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
- xfs_init_attr_trans(args, &tres, &total);
error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
return error;
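With the helper now returning the reservation by value and covering only the set/replace case, each caller selects the reservation and space total per operation. A sketch of the call pattern both call sites follow (is_removal and the surrounding variables are assumed from context, not part of this patch):

struct xfs_trans_res    tres;
unsigned int            total = 0;

if (is_removal) {
        tres = M_RES(mp)->tr_attrrm;            /* fixed remove reservation */
        total = XFS_ATTRRM_SPACE_RES(mp);
} else {
        tres = xfs_attr_set_resv(args);         /* computed per set/replace */
        total = args->total;
}
error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);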
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 088cb7b30168..0e51d0723f9a 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -565,8 +565,7 @@ bool xfs_attr_check_namespace(unsigned int attr_flags);
bool xfs_attr_namecheck(unsigned int attr_flags, const void *name,
size_t length);
int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
-void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
- unsigned int *total);
+struct xfs_trans_res xfs_attr_set_resv(const struct xfs_da_args *args);
/*
* Check to see if the attr should be upgraded from non-existent or shortform to
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 3b3206d312d6..6af6f744fdd6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4058,20 +4058,32 @@ xfs_bmapi_reserve_delalloc(
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
- xfs_fileoff_t aoff = off;
+ xfs_fileoff_t aoff;
+ bool use_cowextszhint =
+ whichfork == XFS_COW_FORK && !prealloc;
+retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
+ aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
- /* Figure out the extent size, adjust alen */
- if (whichfork == XFS_COW_FORK) {
+ /*
+ * If we're targeting the COW fork but aren't creating a speculative
+ * posteof preallocation, try to expand the reservation to align with
+ * the COW extent size hint if there's sufficient free space.
+ *
+ * Unlike the data fork, the CoW cancellation functions will free all
+ * the reservations at inactivation, so we don't require that every
+ * delalloc reservation have a dirty pagecache.
+ */
+ if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
@@ -4090,7 +4102,7 @@ xfs_bmapi_reserve_delalloc(
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
- return error;
+ goto out;
/*
* Split changing sb for alen and indlen since they could be coming
@@ -4140,6 +4152,17 @@ out_unreserve_frextents:
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
+out:
+ if (error == -ENOSPC || error == -EDQUOT) {
+ trace_xfs_delalloc_enospc(ip, off, len);
+
+ if (prealloc || use_cowextszhint) {
+ /* retry without any preallocation */
+ use_cowextszhint = false;
+ prealloc = 0;
+ goto retry;
+ }
+ }
return error;
}
@@ -6383,6 +6406,7 @@ xfs_bunmapi_range(
error = xfs_defer_finish(tpp);
if (error)
goto out;
+ cond_resched();
}
out:
return error;
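The first hunk moves the ENOSPC/EDQUOT fallback into xfs_bmapi_reserve_delalloc() itself: on failure, the speculative parts of the request (preallocation, COW extent size hint) are dropped and the reservation retried once. A standalone sketch of that degrade-and-retry shape, with a hypothetical do_reserve() helper standing in for the real reservation path:

extern int do_reserve(unsigned long nblks, bool use_hint);      /* hypothetical */

static int example_reserve(unsigned long want, unsigned long prealloc,
                           bool use_hint)
{
        int error;

retry:
        error = do_reserve(want + prealloc, use_hint);
        if (error == -ENOSPC || error == -EDQUOT) {
                if (prealloc || use_hint) {
                        /* retry without any speculative padding */
                        use_hint = false;
                        prealloc = 0;
                        goto retry;
                }
        }
        return error;
}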
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 97996cb79aaa..454b63ef7201 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -996,7 +996,7 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
-#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range)
+#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
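The direction fix above matters because _IOWR() encodes that the kernel both reads and writes the user buffer, while XFS_IOC_EXCHANGE_RANGE only copies the structure in. A small illustration of the three encodings with a hypothetical structure; the direction is recovered from the command with _IOC_DIR(cmd):

#include <linux/ioctl.h>
#include <linux/types.h>

struct example_range {
        __u64 offset;
        __u64 len;
};

#define EXAMPLE_IOC_SET_RANGE   _IOW('E', 1, struct example_range)  /* user -> kernel */
#define EXAMPLE_IOC_GET_RANGE   _IOR('E', 2, struct example_range)  /* kernel -> user */
#define EXAMPLE_IOC_MOD_RANGE   _IOWR('E', 3, struct example_range) /* both ways */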
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d79002343d0b..513b50da6215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -374,17 +374,40 @@ xfs_dinode_verify_fork(
/*
* For fork types that can contain local data, check that the fork
* format matches the size of local data contained within the fork.
- *
- * For all types, check that when the size says the should be in extent
- * or btree format, the inode isn't claiming it is in local format.
*/
if (whichfork == XFS_DATA_FORK) {
- if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ /*
+ * A directory small enough to fit in the inode must be stored
+ * in local format. The directory sf <-> extents conversion
+ * code updates the directory size accordingly. Directories
+ * being truncated have zero size and are not subject to this
+ * check.
+ */
+ if (S_ISDIR(mode)) {
+ if (dip->di_size &&
+ be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ /*
+ * A symlink with a target small enough to fit in the inode can
+ * be stored in extents format if xattrs were added (thus
+ * converting the data fork from shortform to remote format)
+ * and then removed.
+ */
+ if (S_ISLNK(mode)) {
if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_EXTENTS &&
fork_format != XFS_DINODE_FMT_LOCAL)
return __this_address;
}
+ /*
+ * For all types, check that when the size says the fork should
+ * be in extent or btree format, the inode isn't claiming to be
+ * in local format.
+ */
if (be64_to_cpu(dip->di_size) > fork_size &&
fork_format == XFS_DINODE_FMT_LOCAL)
return __this_address;
@@ -508,9 +531,19 @@ xfs_dinode_verify(
if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
return __this_address;
- /* No zero-length symlinks/dirs. */
- if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
- return __this_address;
+ /*
+ * No zero-length symlinks/dirs unless they're unlinked and hence being
+ * inactivated.
+ */
+ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) {
+ if (dip->di_version > 1) {
+ if (dip->di_nlink)
+ return __this_address;
+ } else {
+ if (dip->di_onlink)
+ return __this_address;
+ }
+ }
fa = xfs_dinode_verify_nrext64(mp, dip);
if (fa)
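The new checks follow the usual XFS verifier convention: return __this_address at the failing check so the recorded fault address identifies exactly which invariant tripped. A minimal sketch of the shape, with an entirely hypothetical on-disk structure and magic value:

struct example_disk {
        __be32  magic;
        __be64  size;
        __be32  nlink;
};

static xfs_failaddr_t example_verify(struct example_disk *d)
{
        if (be32_to_cpu(d->magic) != 0x58465342)        /* hypothetical magic */
                return __this_address;
        /* zero-length objects are only valid while unlinked */
        if (be64_to_cpu(d->size) == 0 && be32_to_cpu(d->nlink) != 0)
                return __this_address;
        return NULL;                                    /* all checks passed */
}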
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 09e4bf949bf8..6b56f0f6d4c1 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1038,11 +1038,12 @@ xfs_log_sb(
* and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount);
mp->m_sb.sb_ifree = min_t(uint64_t,
- percpu_counter_sum(&mp->m_ifree),
+ percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
- mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ mp->m_sb.sb_fdblocks =
+ percpu_counter_sum_positive(&mp->m_fdblocks);
}
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
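percpu_counter_sum() folds in per-CPU deltas and can transiently return a negative value under concurrent updates; the _positive variant clamps at zero so a bogus negative count is never written into the on-disk superblock. A minimal sketch of the distinction:

#include <linux/percpu_counter.h>

static s64 snapshot_for_disk(struct percpu_counter *c)
{
        /* never persist a transiently negative sum */
        return percpu_counter_sum_positive(c);
}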
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c013f0ba4f36..4cbcf7a86dbe 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -856,7 +856,7 @@ xfs_ioc_scrubv_metadata(
if (vec_bytes > PAGE_SIZE)
return -ENOMEM;
- uvectors = (void __user *)(uintptr_t)head.svh_vectors;
+ uvectors = u64_to_user_ptr(head.svh_vectors);
vectors = memdup_user(uvectors, vec_bytes);
if (IS_ERR(vectors))
return PTR_ERR(vectors);
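u64_to_user_ptr() (from linux/kernel.h) replaces the open-coded double cast used here and in fs/xfs/xfs_handle.c below, and it preserves the __user annotation so sparse can keep checking the pointer. The two forms side by side:

#include <linux/kernel.h>

static void __user *old_way(u64 val)
{
        return (void __user *)(uintptr_t)val;   /* open-coded boilerplate */
}

static void __user *new_way(u64 val)
{
        return u64_to_user_ptr(val);            /* same result, checked type */
}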
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index 9185ae7088d4..cdd13ed9c569 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -822,12 +822,14 @@ xfarray_sort_scan(
/* Grab the first folio that backs this array element. */
if (!si->folio) {
+ struct folio *folio;
loff_t next_pos;
- si->folio = xfile_get_folio(si->array->xfile, idx_pos,
+ folio = xfile_get_folio(si->array->xfile, idx_pos,
si->array->obj_size, XFILE_ALLOC);
- if (IS_ERR(si->folio))
- return PTR_ERR(si->folio);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ si->folio = folio;
si->first_folio_idx = xfarray_idx(si->array,
folio_pos(si->folio) + si->array->obj_size - 1);
@@ -1048,6 +1050,7 @@ xfarray_sort(
out_free:
trace_xfarray_sort_stats(si, error);
+ xfarray_sort_scan_done(si);
kvfree(si);
return error;
}
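The xfarray fix addresses a classic ERR_PTR hazard: si->folio briefly held an error cookie, which a later cleanup path (xfarray_sort_scan_done(), now also run on the error path) could mistake for a real folio. The general cure is to validate into a local and publish only a known-good pointer. A sketch under that assumption, with filemap_grab_folio() as a stand-in source and example_* names illustrative:

#include <linux/pagemap.h>

struct example_state {
        struct folio *folio;
};

static int example_load(struct example_state *state,
                        struct address_space *mapping, pgoff_t index)
{
        struct folio *folio;

        folio = filemap_grab_folio(mapping, index);     /* may return ERR_PTR */
        if (IS_ERR(folio))
                return PTR_ERR(folio);  /* state->folio never sees the cookie */
        state->folio = folio;           /* publish only a valid folio */
        return 0;
}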
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 2b10ac4c5fce..f683b7a9323f 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -746,7 +746,7 @@ xfs_attr_recover_work(
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
int error;
- int total;
+ unsigned int total = 0;
/*
* First check the validity of the attr described by the ATTRI. If any
@@ -763,7 +763,20 @@ xfs_attr_recover_work(
return PTR_ERR(attr);
args = attr->xattri_da_args;
- xfs_init_attr_trans(args, &resv, &total);
+ switch (xfs_attr_intent_op(attr)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ resv = xfs_attr_set_resv(args);
+ total = args->total;
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ resv = M_RES(mp)->tr_attrrm;
+ total = XFS_ATTRRM_SPACE_RES(mp);
+ break;
+ }
resv = xlog_recover_resv(&resv);
error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index ac2e77ebb54c..a4d9fbc21b83 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -486,13 +486,11 @@ out_unlock:
/*
* Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
+ * blocks.
*/
bool
xfs_can_free_eofblocks(
- struct xfs_inode *ip,
- bool force)
+ struct xfs_inode *ip)
{
struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
@@ -526,11 +524,11 @@ xfs_can_free_eofblocks(
return false;
/*
- * Do not free real preallocated or append-only files unless the file
- * has delalloc blocks and we are forced to remove them.
+ * Only free real extents for inodes with persistent preallocations or
+ * the append-only flag.
*/
if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
- if (!force || ip->i_delayed_blks == 0)
+ if (ip->i_delayed_blks == 0)
return false;
/*
@@ -584,6 +582,22 @@ xfs_free_eofblocks(
/* Wait on dio to ensure i_size has settled. */
inode_dio_wait(VFS_I(ip));
+ /*
+ * For preallocated files only free delayed allocations.
+ *
+ * Note that this means we also leave speculative preallocations in
+ * place for preallocated files.
+ */
+ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
+ if (ip->i_delayed_blks) {
+ xfs_bmap_punch_delalloc_range(ip,
+ round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
+ LLONG_MAX);
+ }
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+ }
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
ASSERT(xfs_is_shutdown(mp));
@@ -891,7 +905,7 @@ xfs_prepare_shift(
* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
* into the accessible region of the file.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
+ if (xfs_can_free_eofblocks(ip)) {
error = xfs_free_eofblocks(ip);
if (error)
return error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 51f84d8ff372..eb0895bfb9da 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
/* EOF block manipulation functions */
-bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+bool xfs_can_free_eofblocks(struct xfs_inode *ip);
int xfs_free_eofblocks(struct xfs_inode *ip);
int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index c8785ed59543..a3f16e9b6fe5 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -773,11 +773,6 @@ xfs_getparents_expand_lastrec(
trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr);
}
-static inline void __user *u64_to_uptr(u64 val)
-{
- return (void __user *)(uintptr_t)val;
-}
-
/* Retrieve the parent pointers for a given inode. */
STATIC int
xfs_getparents(
@@ -862,7 +857,7 @@ xfs_getparents(
ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize);
/* Copy the records to userspace. */
- if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer),
+ if (copy_to_user(u64_to_user_ptr(gpx->gph.gph_request.gp_buffer),
gpx->krecords, gpx->context.firstu))
error = -EFAULT;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0953163a2d84..cf629302d48e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -86,9 +86,8 @@ xfs_inode_alloc(
return NULL;
}
- /* VFS doesn't initialise i_mode or i_state! */
+ /* VFS doesn't initialise i_mode! */
VFS_I(ip)->i_mode = 0;
- VFS_I(ip)->i_state = 0;
mapping_set_large_folios(VFS_I(ip)->i_mapping);
XFS_STATS_INC(mp, vn_active);
@@ -314,6 +313,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
+ unsigned long state = inode->i_state;
error = inode_init_always(mp->m_super, inode);
@@ -324,6 +324,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
+ inode->i_state = state;
mapping_set_large_folios(inode->i_mapping);
return error;
}
@@ -1155,7 +1156,7 @@ xfs_inode_free_eofblocks(
}
*lockflags |= XFS_IOLOCK_EXCL;
- if (xfs_can_free_eofblocks(ip, false))
+ if (xfs_can_free_eofblocks(ip))
return xfs_free_eofblocks(ip);
/* inode could be preallocated or append-only */
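This icache change appears to pair with a VFS-side patch in this series that has inode_init_always() clear i_state, so xfs_reinit_inode() must now snapshot and restore it just like the other fields that survive reinitialisation. The save/restore shape, shown standalone (example_reinit is illustrative):

static int example_reinit(struct super_block *sb, struct inode *inode)
{
        unsigned long state = inode->i_state;   /* now cleared by init */
        dev_t dev = inode->i_rdev;
        int error;

        error = inode_init_always(sb, inode);
        if (error)
                return error;

        /* put back everything reinitialisation clobbered */
        inode->i_rdev = dev;
        inode->i_state = state;
        return 0;
}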
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 58fb7a5062e1..a4e3cd8971fc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,6 +42,7 @@
#include "xfs_pnfs.h"
#include "xfs_parent.h"
#include "xfs_xattr.h"
+#include "xfs_sb.h"
struct kmem_cache *xfs_inode_cache;
@@ -870,9 +871,16 @@ xfs_init_new_inode(
* this saves us from needing to run a separate transaction to set the
* fork offset in the immediate future.
*/
- if (init_xattrs && xfs_has_attr(mp)) {
+ if (init_xattrs) {
ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+
+ if (!xfs_has_attr(mp)) {
+ spin_lock(&mp->m_sb_lock);
+ xfs_add_attr(mp);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
+ }
}
/*
@@ -1595,7 +1603,7 @@ xfs_release(
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
return 0;
- if (xfs_can_free_eofblocks(ip, false)) {
+ if (xfs_can_free_eofblocks(ip)) {
/*
* Check if the inode is being opened, written and closed
* frequently and we have delayed allocation blocks outstanding
@@ -1856,15 +1864,13 @@ xfs_inode_needs_inactive(
/*
* This file isn't being freed, so check if there are post-eof blocks
- * to free. @force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with broken
- * free space accounting.
+ * to free.
*
* Note: don't bother with iolock here since lockdep complains about
* acquiring it in reclaim context. We have the only reference to the
* inode at this point anyways.
*/
- return xfs_can_free_eofblocks(ip, true);
+ return xfs_can_free_eofblocks(ip);
}
/*
@@ -1947,15 +1953,11 @@ xfs_inactive(
if (VFS_I(ip)->i_nlink != 0) {
/*
- * force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with
- * broken free space accounting.
- *
* Note: don't bother with iolock here since lockdep complains
* about acquiring it in reclaim context. We have the only
* reference to the inode at this point anyways.
*/
- if (xfs_can_free_eofblocks(ip, true))
+ if (xfs_can_free_eofblocks(ip))
error = xfs_free_eofblocks(ip);
goto out;
@@ -2548,11 +2550,26 @@ xfs_ifree_cluster(
* This buffer may not have been correctly initialised as we
* didn't read it from disk. That's not important because we are
* only using it to mark the buffer as stale in the log, and to
- * attach stale cached inodes on it. That means it will never be
- * dispatched for IO. If it is, we want to know about it, and we
- * want it to fail. We can acheive this by adding a write
- * verifier to the buffer.
+ * attach stale cached inodes on it.
+ *
+ * For the inode that triggered the cluster freeing, this
+ * attachment may occur in xfs_inode_item_precommit() after we
+ * have marked this buffer stale. If this buffer was not in
+ * memory before xfs_ifree_cluster() started, it will not be
+ * marked XBF_DONE and this will cause problems later in
+ * xfs_inode_item_precommit() when we trip over a (stale, !done)
+ * buffer being attached to the transaction.
+ *
+ * Hence we have to mark the buffer as XBF_DONE here. This is
+ * safe because we are also marking the buffer as XBF_STALE and
+ * XFS_BLI_STALE. That means it will never be dispatched for
+ * IO and it won't be unlocked until the cluster freeing has
+ * been committed to the journal and the buffer unpinned. If it
+ * is written, we want to know about it, and we want it to
+ * fail. We can achieve this by adding a write verifier to the
+ * buffer.
*/
+ bp->b_flags |= XBF_DONE;
bp->b_ops = &xfs_inode_buf_ops;
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 378342673925..414903885ab9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1148,33 +1148,23 @@ xfs_buffered_write_iomap_begin(
}
}
-retry:
- error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks,
- allocfork == XFS_DATA_FORK ? &imap : &cmap,
- allocfork == XFS_DATA_FORK ? &icur : &ccur,
- allocfork == XFS_DATA_FORK ? eof : cow_eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
- trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc_blocks) {
- prealloc_blocks = 0;
- goto retry;
- }
- fallthrough;
- default:
- goto out_unlock;
- }
-
if (allocfork == XFS_COW_FORK) {
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &cmap,
+ &ccur, cow_eof);
+ if (error)
+ goto out_unlock;
+
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
goto found_cow;
}
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &imap, &icur,
+ eof);
+ if (error)
+ goto out_unlock;
+
/*
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ff222827e550..a00dcbc77e12 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,6 +17,8 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_btree.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
@@ -811,6 +813,7 @@ xfs_setattr_size(
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
+ uint resblks = 0;
bool did_zeroing = false;
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
@@ -917,7 +920,17 @@ xfs_setattr_size(
return error;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ /*
+ * For a realtime inode whose rtextsize spans more than one block, we
+ * need a block reservation for the bmap btree allocations/splits that
+ * can happen, since truncation could split the written extent at the
+ * tail and convert the part beyond EOF to unwritten.
+ */
+ if (xfs_inode_has_bigrtalloc(ip))
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+ 0, 0, &tp);
if (error)
return error;
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 730c8d48da28..86f14ec7c31f 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -351,7 +351,6 @@ xfs_iwalk_run_callbacks(
int *has_more)
{
struct xfs_mount *mp = iwag->mp;
- struct xfs_inobt_rec_incore *irec;
xfs_agino_t next_agino;
int error;
@@ -361,8 +360,8 @@ xfs_iwalk_run_callbacks(
/* Delete cursor but remember the last record we cached... */
xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
- irec = &iwag->recs[iwag->nr_recs - 1];
- ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
+ ASSERT(next_agino >= iwag->recs[iwag->nr_recs - 1].ir_startino +
+ XFS_INODES_PER_CHUNK);
if (iwag->drop_trans) {
xfs_trans_cancel(iwag->tp);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 063a2e00d169..265a2a418bc7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1387,6 +1387,7 @@ xfs_reflink_remap_blocks(
destoff += imap.br_blockcount;
len -= imap.br_blockcount;
remapped_len += imap.br_blockcount;
+ cond_resched();
}
if (error)
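Both xfs_bunmapi_range() (above) and xfs_reflink_remap_blocks() can loop over very large mappings entirely in kernel context, so a cond_resched() per iteration keeps them from monopolising a CPU on non-preemptible kernels. The general pattern, as a standalone sketch:

#include <linux/sched.h>

static void example_process_all(unsigned long nr_items)
{
        unsigned long i;

        for (i = 0; i < nr_items; i++) {
                /* ... one bounded unit of work per iteration ... */
                cond_resched();         /* let other tasks run on !PREEMPT */
        }
}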