summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig1
-rw-r--r--fs/9p/acl.c11
-rw-r--r--fs/9p/acl.h27
-rw-r--r--fs/9p/cache.c143
-rw-r--r--fs/9p/cache.h97
-rw-r--r--fs/9p/fid.c17
-rw-r--r--fs/9p/v9fs.c30
-rw-r--r--fs/9p/v9fs.h17
-rw-r--r--fs/9p/v9fs_vfs.h11
-rw-r--r--fs/9p/vfs_addr.c215
-rw-r--r--fs/9p/vfs_dentry.c4
-rw-r--r--fs/9p/vfs_dir.c6
-rw-r--r--fs/9p/vfs_file.c53
-rw-r--r--fs/9p/vfs_inode.c53
-rw-r--r--fs/9p/vfs_inode_dotl.c22
-rw-r--r--fs/9p/vfs_super.c14
-rw-r--r--fs/9p/xattr.c10
-rw-r--r--fs/9p/xattr.h29
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/callback.c44
-rw-r--r--fs/afs/cell.c2
-rw-r--r--fs/afs/dir.c57
-rw-r--r--fs/afs/dir_edit.c4
-rw-r--r--fs/afs/dir_silly.c4
-rw-r--r--fs/afs/file.c100
-rw-r--r--fs/afs/fs_probe.c8
-rw-r--r--fs/afs/fsclient.c31
-rw-r--r--fs/afs/inode.c104
-rw-r--r--fs/afs/internal.h24
-rw-r--r--fs/afs/protocol_afs.h15
-rw-r--r--fs/afs/protocol_yfs.h6
-rw-r--r--fs/afs/rotate.c1
-rw-r--r--fs/afs/server.c2
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/afs/write.c41
-rw-r--r--fs/afs/yfsclient.c32
-rw-r--r--fs/aio.c9
-rw-r--r--fs/anon_inodes.c29
-rw-r--r--fs/autofs/waitq.c2
-rw-r--r--fs/binfmt_elf.c39
-rw-r--r--fs/binfmt_elf_fdpic.c2
-rw-r--r--fs/btrfs/block-group.c242
-rw-r--r--fs/btrfs/block-group.h8
-rw-r--r--fs/btrfs/btrfs_inode.h46
-rw-r--r--fs/btrfs/check-integrity.c205
-rw-r--r--fs/btrfs/compression.c685
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/ctree.c157
-rw-r--r--fs/btrfs/ctree.h86
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-ref.c17
-rw-r--r--fs/btrfs/delayed-ref.h51
-rw-r--r--fs/btrfs/dev-replace.c19
-rw-r--r--fs/btrfs/dir-item.c48
-rw-r--r--fs/btrfs/disk-io.c53
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/extent-tree.c327
-rw-r--r--fs/btrfs/extent_io.c334
-rw-r--r--fs/btrfs/extent_io.h10
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/file-item.c34
-rw-r--r--fs/btrfs/file.c61
-rw-r--r--fs/btrfs/free-space-cache.c24
-rw-r--r--fs/btrfs/inode.c623
-rw-r--r--fs/btrfs/ioctl.c1013
-rw-r--r--fs/btrfs/locking.h7
-rw-r--r--fs/btrfs/lzo.c301
-rw-r--r--fs/btrfs/raid56.c175
-rw-r--r--fs/btrfs/raid56.h22
-rw-r--r--fs/btrfs/reada.c26
-rw-r--r--fs/btrfs/ref-verify.c4
-rw-r--r--fs/btrfs/reflink.c4
-rw-r--r--fs/btrfs/relocation.c81
-rw-r--r--fs/btrfs/root-tree.c6
-rw-r--r--fs/btrfs/scrub.c139
-rw-r--r--fs/btrfs/send.c38
-rw-r--r--fs/btrfs/send.h7
-rw-r--r--fs/btrfs/space-info.c33
-rw-r--r--fs/btrfs/subpage.c290
-rw-r--r--fs/btrfs/subpage.h56
-rw-r--r--fs/btrfs/super.c28
-rw-r--r--fs/btrfs/sysfs.c93
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c2
-rw-r--r--fs/btrfs/tests/extent-io-tests.c12
-rw-r--r--fs/btrfs/tests/inode-tests.c4
-rw-r--r--fs/btrfs/transaction.c11
-rw-r--r--fs/btrfs/tree-log.c822
-rw-r--r--fs/btrfs/tree-log.h18
-rw-r--r--fs/btrfs/verity.c6
-rw-r--r--fs/btrfs/volumes.c615
-rw-r--r--fs/btrfs/volumes.h119
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c36
-rw-r--r--fs/btrfs/zoned.c531
-rw-r--r--fs/btrfs/zoned.h39
-rw-r--r--fs/btrfs/zstd.c27
-rw-r--r--fs/buffer.c12
-rw-r--r--fs/cachefiles/io.c12
-rw-r--r--fs/cachefiles/rdwr.c16
-rw-r--r--fs/ceph/caps.c16
-rw-r--r--fs/ceph/file.c3
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/locks.c3
-rw-r--r--fs/ceph/mds_client.c17
-rw-r--r--fs/ceph/super.c17
-rw-r--r--fs/ceph/super.h3
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/cifs/cache.c2
-rw-r--r--fs/cifs/cifs_debug.c1
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_ioctl.h1
-rw-r--r--fs/cifs/cifs_spnego.c2
-rw-r--r--fs/cifs/cifs_spnego.h2
-rw-r--r--fs/cifs/cifs_unicode.c1
-rw-r--r--fs/cifs/cifsacl.c1
-rw-r--r--fs/cifs/cifsacl.h1
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/cifsfs.h1
-rw-r--r--fs/cifs/cifsglob.h5
-rw-r--r--fs/cifs/cifspdu.h1
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c1
-rw-r--r--fs/cifs/connect.c44
-rw-r--r--fs/cifs/dir.c1
-rw-r--r--fs/cifs/dns_resolve.c1
-rw-r--r--fs/cifs/dns_resolve.h4
-rw-r--r--fs/cifs/export.c1
-rw-r--r--fs/cifs/file.c11
-rw-r--r--fs/cifs/fs_context.c16
-rw-r--r--fs/cifs/fs_context.h2
-rw-r--r--fs/cifs/fscache.c2
-rw-r--r--fs/cifs/fscache.h2
-rw-r--r--fs/cifs/inode.c7
-rw-r--r--fs/cifs/ioctl.c3
-rw-r--r--fs/cifs/link.c1
-rw-r--r--fs/cifs/misc.c61
-rw-r--r--fs/cifs/netmisc.c1
-rw-r--r--fs/cifs/ntlmssp.h1
-rw-r--r--fs/cifs/readdir.c1
-rw-r--r--fs/cifs/rfc1002pdu.h1
-rw-r--r--fs/cifs/sess.c1
-rw-r--r--fs/cifs/smb2file.c1
-rw-r--r--fs/cifs/smb2glob.h1
-rw-r--r--fs/cifs/smb2inode.c1
-rw-r--r--fs/cifs/smb2maperror.c16
-rw-r--r--fs/cifs/smb2misc.c48
-rw-r--r--fs/cifs/smb2ops.c73
-rw-r--r--fs/cifs/smb2pdu.c192
-rw-r--r--fs/cifs/smb2pdu.h920
-rw-r--r--fs/cifs/smb2proto.h3
-rw-r--r--fs/cifs/smb2status.h1
-rw-r--r--fs/cifs/smb2transport.c37
-rw-r--r--fs/cifs/smberr.h1
-rw-r--r--fs/cifs/trace.h71
-rw-r--r--fs/cifs/transport.c1
-rw-r--r--fs/cifs/winucase.c1
-rw-r--r--fs/cifs/xattr.c1
-rw-r--r--fs/coda/cnode.c13
-rw-r--r--fs/coda/coda_linux.c39
-rw-r--r--fs/coda/coda_linux.h6
-rw-r--r--fs/coda/dir.c20
-rw-r--r--fs/coda/file.c12
-rw-r--r--fs/coda/psdev.c14
-rw-r--r--fs/coda/upcall.c3
-rw-r--r--fs/coredump.c88
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/bio.c32
-rw-r--r--fs/crypto/fname.c3
-rw-r--r--fs/crypto/fscrypt_private.h16
-rw-r--r--fs/crypto/hkdf.c11
-rw-r--r--fs/crypto/keysetup.c62
-rw-r--r--fs/d_path.c8
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/direct-io.c16
-rw-r--r--fs/erofs/Kconfig40
-rw-r--r--fs/erofs/Makefile1
-rw-r--r--fs/erofs/compress.h28
-rw-r--r--fs/erofs/data.c75
-rw-r--r--fs/erofs/decompressor.c139
-rw-r--r--fs/erofs/decompressor_lzma.c290
-rw-r--r--fs/erofs/erofs_fs.h73
-rw-r--r--fs/erofs/inode.c4
-rw-r--r--fs/erofs/internal.h105
-rw-r--r--fs/erofs/pcpubuf.c6
-rw-r--r--fs/erofs/super.c231
-rw-r--r--fs/erofs/utils.c19
-rw-r--r--fs/erofs/xattr.c4
-rw-r--r--fs/erofs/zdata.c175
-rw-r--r--fs/erofs/zdata.h7
-rw-r--r--fs/erofs/zmap.c68
-rw-r--r--fs/exec.c14
-rw-r--r--fs/exfat/inode.c2
-rw-r--r--fs/ext2/balloc.c14
-rw-r--r--fs/ext4/dir.c6
-rw-r--r--fs/ext4/ext4.h3
-rw-r--r--fs/ext4/extents.c19
-rw-r--r--fs/ext4/fast_commit.c6
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/inline.c150
-rw-r--r--fs/ext4/inode.c176
-rw-r--r--fs/ext4/super.c32
-rw-r--r--fs/f2fs/compress.c1
-rw-r--r--fs/f2fs/file.c2
-rw-r--r--fs/f2fs/super.c1
-rw-r--r--fs/fat/inode.c11
-rw-r--r--fs/fs-writeback.c11
-rw-r--r--fs/fscache/object.c2
-rw-r--r--fs/fscache/operation.c3
-rw-r--r--fs/fuse/dax.c5
-rw-r--r--fs/fuse/dev.c24
-rw-r--r--fs/fuse/dir.c128
-rw-r--r--fs/fuse/file.c110
-rw-r--r--fs/fuse/fuse_i.h20
-rw-r--r--fs/fuse/inode.c132
-rw-r--r--fs/fuse/ioctl.c4
-rw-r--r--fs/fuse/readdir.c6
-rw-r--r--fs/fuse/virtio_fs.c14
-rw-r--r--fs/fuse/xattr.c10
-rw-r--r--fs/gfs2/bmap.c60
-rw-r--r--fs/gfs2/file.c269
-rw-r--r--fs/gfs2/glock.c471
-rw-r--r--fs/gfs2/glock.h34
-rw-r--r--fs/gfs2/glops.c29
-rw-r--r--fs/gfs2/incore.h10
-rw-r--r--fs/gfs2/inode.c12
-rw-r--r--fs/gfs2/rgrp.c70
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/trace_gfs2.h9
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/hfs/inode.c6
-rw-r--r--fs/hfs/mdb.c2
-rw-r--r--fs/hfsplus/inode.c12
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/hpfs/hpfs.h8
-rw-r--r--fs/hugetlbfs/inode.c23
-rw-r--r--fs/inode.c59
-rw-r--r--fs/internal.h12
-rw-r--r--fs/io-wq.c114
-rw-r--r--fs/io-wq.h59
-rw-r--r--fs/io_uring.c2083
-rw-r--r--fs/iomap/buffered-io.c2
-rw-r--r--fs/iomap/direct-io.c88
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_mount.c51
-rw-r--r--fs/jfs/resize.c5
-rw-r--r--fs/jfs/super.c5
-rw-r--r--fs/kernel_read_file.c2
-rw-r--r--fs/kernfs/dir.c18
-rw-r--r--fs/kernfs/symlink.c3
-rw-r--r--fs/ksmbd/auth.c219
-rw-r--r--fs/ksmbd/connection.c12
-rw-r--r--fs/ksmbd/crypto_ctx.c16
-rw-r--r--fs/ksmbd/crypto_ctx.h8
-rw-r--r--fs/ksmbd/glob.h2
-rw-r--r--fs/ksmbd/ksmbd_netlink.h2
-rw-r--r--fs/ksmbd/mgmt/user_config.c2
-rw-r--r--fs/ksmbd/mgmt/user_config.h1
-rw-r--r--fs/ksmbd/misc.c21
-rw-r--r--fs/ksmbd/misc.h4
-rw-r--r--fs/ksmbd/oplock.c41
-rw-r--r--fs/ksmbd/server.c3
-rw-r--r--fs/ksmbd/smb2misc.c149
-rw-r--r--fs/ksmbd/smb2ops.c8
-rw-r--r--fs/ksmbd/smb2pdu.c770
-rw-r--r--fs/ksmbd/smb2pdu.h12
-rw-r--r--fs/ksmbd/smb_common.c72
-rw-r--r--fs/ksmbd/smb_common.h11
-rw-r--r--fs/ksmbd/smbacl.c21
-rw-r--r--fs/ksmbd/transport_ipc.c3
-rw-r--r--fs/ksmbd/transport_ipc.h2
-rw-r--r--fs/ksmbd/transport_rdma.c22
-rw-r--r--fs/ksmbd/transport_tcp.c4
-rw-r--r--fs/ksmbd/vfs.c174
-rw-r--r--fs/ksmbd/vfs.h11
-rw-r--r--fs/lockd/svcxdr.h13
-rw-r--r--fs/locks.c161
-rw-r--r--fs/namei.c4
-rw-r--r--fs/netfs/read_helper.c2
-rw-r--r--fs/nfs/blocklayout/dev.c4
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/nfs_common/grace.c1
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/blocklayout.c158
-rw-r--r--fs/nfsd/filecache.c5
-rw-r--r--fs/nfsd/nfs4layouts.c5
-rw-r--r--fs/nfsd/nfs4state.c16
-rw-r--r--fs/nfsd/nfs4xdr.c19
-rw-r--r--fs/nfsd/nfsctl.c7
-rw-r--r--fs/nilfs2/alloc.c2
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c2
-rw-r--r--fs/nilfs2/bmap.h2
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/nilfs2/btnode.h2
-rw-r--r--fs/nilfs2/btree.c2
-rw-r--r--fs/nilfs2/btree.h2
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/cpfile.h2
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/dat.h2
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/direct.c2
-rw-r--r--fs/nilfs2/direct.h2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/ifile.c2
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/ioctl.c4
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c2
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/nilfs2/page.h2
-rw-r--r--fs/nilfs2/recovery.c2
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/nilfs2/segbuf.h2
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/nilfs2/segment.h2
-rw-r--r--fs/nilfs2/sufile.c2
-rw-r--r--fs/nilfs2/sufile.h2
-rw-r--r--fs/nilfs2/super.c4
-rw-r--r--fs/nilfs2/sysfs.c78
-rw-r--r--fs/nilfs2/sysfs.h2
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/notify/fanotify/fanotify.c117
-rw-r--r--fs/notify/fanotify/fanotify.h54
-rw-r--r--fs/notify/fanotify/fanotify_user.c157
-rw-r--r--fs/notify/fsnotify.c10
-rw-r--r--fs/notify/group.c2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c5
-rw-r--r--fs/notify/inotify/inotify_user.c6
-rw-r--r--fs/notify/notification.c14
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ntfs/super.c8
-rw-r--r--fs/ntfs3/attrib.c20
-rw-r--r--fs/ntfs3/attrlist.c9
-rw-r--r--fs/ntfs3/bitfunc.c10
-rw-r--r--fs/ntfs3/bitmap.c14
-rw-r--r--fs/ntfs3/debug.h3
-rw-r--r--fs/ntfs3/dir.c30
-rw-r--r--fs/ntfs3/file.c15
-rw-r--r--fs/ntfs3/frecord.c55
-rw-r--r--fs/ntfs3/fslog.c12
-rw-r--r--fs/ntfs3/fsntfs.c77
-rw-r--r--fs/ntfs3/index.c160
-rw-r--r--fs/ntfs3/inode.c161
-rw-r--r--fs/ntfs3/lib/decompress_common.h5
-rw-r--r--fs/ntfs3/lib/lib.h6
-rw-r--r--fs/ntfs3/lznt.c12
-rw-r--r--fs/ntfs3/namei.c24
-rw-r--r--fs/ntfs3/ntfs.h20
-rw-r--r--fs/ntfs3/ntfs_fs.h67
-rw-r--r--fs/ntfs3/record.c3
-rw-r--r--fs/ntfs3/run.c2
-rw-r--r--fs/ntfs3/super.c651
-rw-r--r--fs/ntfs3/upcase.c8
-rw-r--r--fs/ntfs3/xattr.c249
-rw-r--r--fs/ocfs2/alloc.c67
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlmglue.c3
-rw-r--r--fs/ocfs2/file.c8
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/journal.c26
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/suballoc.c22
-rw-r--r--fs/ocfs2/super.c54
-rw-r--r--fs/open.c18
-rw-r--r--fs/orangefs/dcache.c4
-rw-r--r--fs/orangefs/inode.c2
-rw-r--r--fs/orangefs/super.c5
-rw-r--r--fs/overlayfs/copy_up.c23
-rw-r--r--fs/overlayfs/dir.c13
-rw-r--r--fs/overlayfs/file.c35
-rw-r--r--fs/overlayfs/inode.c5
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/super.c12
-rw-r--r--fs/posix_acl.c3
-rw-r--r--fs/proc/array.c13
-rw-r--r--fs/proc/base.c40
-rw-r--r--fs/proc/stat.c4
-rw-r--r--fs/proc/task_mmu.c28
-rw-r--r--fs/proc/uptime.c14
-rw-r--r--fs/proc/vmcore.c109
-rw-r--r--fs/pstore/blk.c8
-rw-r--r--fs/qnx4/dir.c69
-rw-r--r--fs/quota/quota.c1
-rw-r--r--fs/quota/quota_tree.c15
-rw-r--r--fs/ramfs/inode.c12
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/reiserfs/super.c14
-rw-r--r--fs/seq_file.c16
-rw-r--r--fs/smbfs_common/smb2pdu.h989
-rw-r--r--fs/smbfs_common/smbfsctl.h2
-rw-r--r--fs/squashfs/super.c5
-rw-r--r--fs/super.c3
-rw-r--r--fs/sync.c62
-rw-r--r--fs/sysfs/dir.c3
-rw-r--r--fs/sysfs/file.c140
-rw-r--r--fs/sysfs/group.c15
-rw-r--r--fs/sysfs/sysfs.h8
-rw-r--r--fs/sysv/super.c6
-rw-r--r--fs/tracefs/inode.c3
-rw-r--r--fs/ubifs/crypto.c1
-rw-r--r--fs/udf/lowlevel.c5
-rw-r--r--fs/udf/super.c9
-rw-r--r--fs/userfaultfd.c12
-rw-r--r--fs/vboxsf/super.c12
-rw-r--r--fs/verity/enable.c2
-rw-r--r--fs/verity/open.c2
-rw-r--r--fs/xfs/kmem.h4
-rw-r--r--fs/xfs/libxfs/xfs_ag.c2
-rw-r--r--fs/xfs/libxfs/xfs_ag.h36
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c3
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c120
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h38
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c63
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c101
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h35
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c62
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_btree.c333
-rw-r--r--fs/xfs/libxfs/xfs_btree.h99
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c8
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_defer.c241
-rw-r--r--fs/xfs/libxfs/xfs_defer.h41
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_format.h12
-rw-r--r--fs/xfs/libxfs/xfs_fs.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c5
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c90
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c6
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c24
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c46
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h7
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c65
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c21
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c116
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_sb.c4
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c18
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h9
-rw-r--r--fs/xfs/scrub/agheader.c13
-rw-r--r--fs/xfs/scrub/agheader_repair.c8
-rw-r--r--fs/xfs/scrub/bitmap.c22
-rw-r--r--fs/xfs/scrub/bmap.c2
-rw-r--r--fs/xfs/scrub/btree.c121
-rw-r--r--fs/xfs/scrub/btree.h17
-rw-r--r--fs/xfs/scrub/dabtree.c62
-rw-r--r--fs/xfs/scrub/repair.h3
-rw-r--r--fs/xfs/scrub/scrub.c64
-rw-r--r--fs/xfs/scrub/trace.c11
-rw-r--r--fs/xfs/scrub/trace.h10
-rw-r--r--fs/xfs/xfs_aops.c15
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c18
-rw-r--r--fs/xfs/xfs_bmap_item.h6
-rw-r--r--fs/xfs/xfs_buf.c14
-rw-r--r--fs/xfs/xfs_buf_item.c8
-rw-r--r--fs/xfs/xfs_buf_item.h2
-rw-r--r--fs/xfs/xfs_buf_item_recover.c2
-rw-r--r--fs/xfs/xfs_dquot.c28
-rw-r--r--fs/xfs/xfs_extfree_item.c33
-rw-r--r--fs/xfs/xfs_extfree_item.h6
-rw-r--r--fs/xfs/xfs_file.c8
-rw-r--r--fs/xfs/xfs_icache.c10
-rw-r--r--fs/xfs/xfs_icreate_item.c6
-rw-r--r--fs/xfs/xfs_icreate_item.h2
-rw-r--r--fs/xfs/xfs_inode.c12
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c6
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_ioctl.c6
-rw-r--r--fs/xfs/xfs_log.c6
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c12
-rw-r--r--fs/xfs/xfs_mount.c14
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_qm.c2
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_refcount_item.c18
-rw-r--r--fs/xfs/xfs_refcount_item.h6
-rw-r--r--fs/xfs/xfs_reflink.c2
-rw-r--r--fs/xfs/xfs_rmap_item.c18
-rw-r--r--fs/xfs/xfs_rmap_item.h6
-rw-r--r--fs/xfs/xfs_super.c233
-rw-r--r--fs/xfs/xfs_sysfs.c24
-rw-r--r--fs/xfs/xfs_trace.h2
-rw-r--r--fs/xfs/xfs_trans.c16
-rw-r--r--fs/xfs/xfs_trans.h8
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/zonefs/super.c6
508 files changed, 14674 insertions, 10663 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 09fd4a185fd2..d7bc93447c85 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -2,6 +2,7 @@
config 9P_FS
tristate "Plan 9 Resource Sharing Support (9P2000)"
depends on INET && NET_9P
+ select NETFS_SUPPORT
help
If you say Y here, you will get experimental support for
Plan 9 resource sharing via the 9P2000 protocol.
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index c381499f5416..4dac4a0dc5f4 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/module.h>
@@ -123,6 +115,7 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
char *name;
size_t size;
void *buffer;
+
if (!acl)
return 0;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index d43c8949e807..ce5175d463dd 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -1,28 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#ifndef FS_9P_ACL_H
#define FS_9P_ACL_H
#ifdef CONFIG_9P_FS_POSIX_ACL
-extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu);
-extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
-extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
- struct posix_acl *, struct posix_acl *);
-extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
- struct posix_acl **dpacl, struct posix_acl **pacl);
-extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid);
+struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type,
+ bool rcu);
+int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid);
+int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
+ struct posix_acl *dacl, struct posix_acl *acl);
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
+ struct posix_acl **dpacl, struct posix_acl **pacl);
+void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
#define v9fs_iop_get_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index eb2151fb6049..f2ba131cede1 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -19,11 +19,11 @@
#define CACHETAG_LEN 11
struct fscache_netfs v9fs_cache_netfs = {
- .name = "9p",
- .version = 0,
+ .name = "9p",
+ .version = 0,
};
-/**
+/*
* v9fs_random_cachetag - Generate a random tag to be associated
* with a new cache session.
*
@@ -199,140 +199,3 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
mutex_unlock(&v9inode->fscache_lock);
}
-
-int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- struct inode *inode = page->mapping->host;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- BUG_ON(!v9inode->fscache);
-
- return fscache_maybe_release_page(v9inode->fscache, page, gfp);
-}
-
-void __v9fs_fscache_invalidate_page(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- BUG_ON(!v9inode->fscache);
-
- if (PageFsCache(page)) {
- fscache_wait_on_page_write(v9inode->fscache, page);
- BUG_ON(!PageLocked(page));
- fscache_uncache_page(v9inode->fscache, page);
- }
-}
-
-static void v9fs_vfs_readpage_complete(struct page *page, void *data,
- int error)
-{
- if (!error)
- SetPageUptodate(page);
-
- unlock_page(page);
-}
-
-/**
- * __v9fs_readpage_from_fscache - read a page from cache
- *
- * Returns 0 if the pages are in cache and a BIO is submitted,
- * 1 if the pages are not in cache and -error otherwise.
- */
-
-int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- if (!v9inode->fscache)
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_page(v9inode->fscache,
- page,
- v9fs_vfs_readpage_complete,
- NULL,
- GFP_KERNEL);
- switch (ret) {
- case -ENOBUFS:
- case -ENODATA:
- p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
- return 1;
- case 0:
- p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
- return ret;
- default:
- p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
- return ret;
- }
-}
-
-/**
- * __v9fs_readpages_from_fscache - read multiple pages from cache
- *
- * Returns 0 if the pages are in cache and a BIO is submitted,
- * 1 if the pages are not in cache and -error otherwise.
- */
-
-int __v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
- if (!v9inode->fscache)
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_pages(v9inode->fscache,
- mapping, pages, nr_pages,
- v9fs_vfs_readpage_complete,
- NULL,
- mapping_gfp_mask(mapping));
- switch (ret) {
- case -ENOBUFS:
- case -ENODATA:
- p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
- return 1;
- case 0:
- BUG_ON(!list_empty(pages));
- BUG_ON(*nr_pages != 0);
- p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
- return ret;
- default:
- p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
- return ret;
- }
-}
-
-/**
- * __v9fs_readpage_to_fscache - write a page to the cache
- *
- */
-
-void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- ret = fscache_write_page(v9inode->fscache, page,
- i_size_read(&v9inode->vfs_inode), GFP_KERNEL);
- p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
- if (ret != 0)
- v9fs_uncache_page(inode, page);
-}
-
-/*
- * wait for a page to complete writing to the cache
- */
-void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
-{
- const struct v9fs_inode *v9inode = V9FS_I(inode);
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- if (PageFsCache(page))
- fscache_wait_on_page_write(v9inode->fscache, page);
-}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 00f107af443e..7480b4b49fea 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -7,9 +7,10 @@
#ifndef _9P_CACHE_H
#define _9P_CACHE_H
-#ifdef CONFIG_9P_FSCACHE
+#define FSCACHE_USE_NEW_IO_API
#include <linux/fscache.h>
-#include <linux/spinlock.h>
+
+#ifdef CONFIG_9P_FSCACHE
extern struct fscache_netfs v9fs_cache_netfs;
extern const struct fscache_cookie_def v9fs_cache_session_index_def;
@@ -27,64 +28,6 @@ extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
extern int __v9fs_cache_register(void);
extern void __v9fs_cache_unregister(void);
-extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
-extern void __v9fs_fscache_invalidate_page(struct page *page);
-extern int __v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page);
-extern int __v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages);
-extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
-extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page);
-
-static inline int v9fs_fscache_release_page(struct page *page,
- gfp_t gfp)
-{
- return __v9fs_fscache_release_page(page, gfp);
-}
-
-static inline void v9fs_fscache_invalidate_page(struct page *page)
-{
- __v9fs_fscache_invalidate_page(page);
-}
-
-static inline int v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- return __v9fs_readpage_from_fscache(inode, page);
-}
-
-static inline int v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- return __v9fs_readpages_from_fscache(inode, mapping, pages,
- nr_pages);
-}
-
-static inline void v9fs_readpage_to_fscache(struct inode *inode,
- struct page *page)
-{
- if (PageFsCache(page))
- __v9fs_readpage_to_fscache(inode, page);
-}
-
-static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- fscache_uncache_page(v9inode->fscache, page);
- BUG_ON(PageFsCache(page));
-}
-
-static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page)
-{
- return __v9fs_fscache_wait_on_page_write(inode, page);
-}
-
#else /* CONFIG_9P_FSCACHE */
static inline void v9fs_cache_inode_get_cookie(struct inode *inode)
@@ -99,39 +42,5 @@ static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file
{
}
-static inline int v9fs_fscache_release_page(struct page *page,
- gfp_t gfp) {
- return 1;
-}
-
-static inline void v9fs_fscache_invalidate_page(struct page *page) {}
-
-static inline int v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- return -ENOBUFS;
-}
-
-static inline int v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- return -ENOBUFS;
-}
-
-static inline void v9fs_readpage_to_fscache(struct inode *inode,
- struct page *page)
-{}
-
-static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
-{}
-
-static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page)
-{
- return;
-}
-
#endif /* CONFIG_9P_FSCACHE */
#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 9d9de62592be..6aab046c98e2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -19,18 +19,18 @@
#include "v9fs_vfs.h"
#include "fid.h"
+static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
+{
+ hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
+}
+
+
/**
* v9fs_fid_add - add a fid to a dentry
* @dentry: dentry that the fid is being added to
* @fid: fid to add
*
*/
-
-static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
-{
- hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
-}
-
void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
{
spin_lock(&dentry->d_lock);
@@ -67,7 +67,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
/**
* v9fs_open_fid_add - add an open fid to an inode
- * @dentry: inode that the fid is being added to
+ * @inode: inode that the fid is being added to
* @fid: fid to add
*
*/
@@ -103,6 +103,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
/* we'll recheck under lock if there's anything to look in */
if (!ret && dentry->d_fsdata) {
struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
+
spin_lock(&dentry->d_lock);
hlist_for_each_entry(fid, h, dlist) {
if (any || uid_eq(fid->uid, uid)) {
@@ -185,7 +186,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
return ERR_PTR(-EPERM);
if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
- uname = NULL;
+ uname = NULL;
else
uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index cdb99507ef33..e32dd5f7721b 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/v9fs.c
- *
* This file contains functions assisting in mapping VFS to 9P2000
*
* Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -155,6 +153,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
/**
* v9fs_parse_options - parse mount options into session structure
* @v9ses: existing v9fs session information
+ * @opts: The mount option string
*
* Return 0 upon success, -ERRNO upon failure.
*/
@@ -165,7 +164,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
substring_t args[MAX_OPT_ARGS];
char *p;
int option = 0;
- char *s, *e;
+ char *s;
int ret = 0;
/* setup defaults */
@@ -189,8 +188,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
while ((p = strsep(&options, ",")) != NULL) {
int token, r;
+
if (!*p)
continue;
+
token = match_token(p, tokens, args);
switch (token) {
case Opt_debug:
@@ -320,12 +321,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->flags |= V9FS_ACCESS_CLIENT;
} else {
uid_t uid;
+
v9ses->flags |= V9FS_ACCESS_SINGLE;
- uid = simple_strtoul(s, &e, 10);
- if (*e != '\0') {
- ret = -EINVAL;
- pr_info("Unknown access argument %s\n",
- s);
+ r = kstrtouint(s, 10, &uid);
+ if (r) {
+ ret = r;
+ pr_info("Unknown access argument %s: %d\n",
+ s, r);
kfree(s);
continue;
}
@@ -519,7 +521,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
* mark transport as disconnected and cancel all pending requests.
*/
-void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
+void v9fs_session_cancel(struct v9fs_session_info *v9ses)
+{
p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
p9_client_disconnect(v9ses->clnt);
}
@@ -542,12 +545,9 @@ extern int v9fs_error_init(void);
static struct kobject *v9fs_kobj;
#ifdef CONFIG_9P_FSCACHE
-/**
- * caches_show - list caches associated with a session
- *
- * Returns the size of buffer written.
+/*
+ * List caches associated with a session
*/
-
static ssize_t caches_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
@@ -661,6 +661,7 @@ static void v9fs_destroy_inode_cache(void)
static int v9fs_cache_register(void)
{
int ret;
+
ret = v9fs_init_inode_cache();
if (ret < 0)
return ret;
@@ -688,6 +689,7 @@ static void v9fs_cache_unregister(void)
static int __init init_v9fs(void)
{
int err;
+
pr_info("Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4ca56c5dd637..1647a8e63671 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -124,15 +124,24 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
return container_of(inode, struct v9fs_inode, vfs_inode);
}
+static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
+{
+#ifdef CONFIG_9P_FSCACHE
+ return v9inode->fscache;
+#else
+ return NULL;
+#endif
+}
+
extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
-struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
- char *);
+struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
+ const char *dev_name, char *data);
extern void v9fs_session_close(struct v9fs_session_info *v9ses);
extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags);
+ unsigned int flags);
extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct user_namespace *mnt_userns,
@@ -158,7 +167,7 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
{
- return (inode->i_sb->s_fs_info);
+ return inode->i_sb->s_fs_info;
}
static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index d44ade76966a..bc417da7e9c1 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,9 +44,10 @@ extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t);
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
+ dev_t rdev);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, umode_t mode, dev_t);
+ struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
@@ -59,8 +60,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
int v9fs_uflags2omode(int uflags, int extended);
void v9fs_blank_wstat(struct p9_wstat *wstat);
-int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *,
- struct iattr *);
+int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr);
int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
int datasync);
int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
@@ -68,9 +69,9 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
static inline void v9fs_invalidate_inode_attr(struct inode *inode)
{
struct v9fs_inode *v9inode;
+
v9inode = V9FS_I(inode);
v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
- return;
}
int v9fs_open_to_dotl_flags(int flags);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index cce9ace651a2..adafdf86f42f 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_addr.c
- *
* This file contians vfs address (mmap) ops for 9P2000.
*
* Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -19,7 +17,7 @@
#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/uio.h>
-#include <linux/bvec.h>
+#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -29,93 +27,103 @@
#include "fid.h"
/**
- * v9fs_fid_readpage - read an entire page in from 9P
- *
- * @fid: fid being read
- * @page: structure to page
- *
+ * v9fs_req_issue_op - Issue a read from 9P
+ * @subreq: The read to make
*/
-static int v9fs_fid_readpage(void *data, struct page *page)
+static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq)
{
- struct p9_fid *fid = data;
- struct inode *inode = page->mapping->host;
- struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE};
+ struct netfs_read_request *rreq = subreq->rreq;
+ struct p9_fid *fid = rreq->netfs_priv;
struct iov_iter to;
- int retval, err;
+ loff_t pos = subreq->start + subreq->transferred;
+ size_t len = subreq->len - subreq->transferred;
+ int total, err;
- p9_debug(P9_DEBUG_VFS, "\n");
+ iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len);
- BUG_ON(!PageLocked(page));
+ total = p9_client_read(fid, pos, &to, &err);
+ netfs_subreq_terminated(subreq, err ?: total, false);
+}
- retval = v9fs_readpage_from_fscache(inode, page);
- if (retval == 0)
- return retval;
+/**
+ * v9fs_init_rreq - Initialise a read request
+ * @rreq: The read request
+ * @file: The file being read from
+ */
+static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file)
+{
+ struct p9_fid *fid = file->private_data;
- iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE);
+ refcount_inc(&fid->count);
+ rreq->netfs_priv = fid;
+}
- retval = p9_client_read(fid, page_offset(page), &to, &err);
- if (err) {
- v9fs_uncache_page(inode, page);
- retval = err;
- goto done;
- }
+/**
+ * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq
+ * @mapping: unused mapping of request to cleanup
+ * @priv: private data to cleanup, a fid, guaranted non-null.
+ */
+static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
+{
+ struct p9_fid *fid = priv;
- zero_user(page, retval, PAGE_SIZE - retval);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ p9_client_clunk(fid);
+}
- v9fs_readpage_to_fscache(inode, page);
- retval = 0;
+/**
+ * v9fs_is_cache_enabled - Determine if caching is enabled for an inode
+ * @inode: The inode to check
+ */
+static bool v9fs_is_cache_enabled(struct inode *inode)
+{
+ struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode));
-done:
- unlock_page(page);
- return retval;
+ return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects);
}
/**
+ * v9fs_begin_cache_operation - Begin a cache operation for a read
+ * @rreq: The read request
+ */
+static int v9fs_begin_cache_operation(struct netfs_read_request *rreq)
+{
+ struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
+
+ return fscache_begin_read_operation(rreq, cookie);
+}
+
+static const struct netfs_read_request_ops v9fs_req_ops = {
+ .init_rreq = v9fs_init_rreq,
+ .is_cache_enabled = v9fs_is_cache_enabled,
+ .begin_cache_operation = v9fs_begin_cache_operation,
+ .issue_op = v9fs_req_issue_op,
+ .cleanup = v9fs_req_cleanup,
+};
+
+/**
* v9fs_vfs_readpage - read an entire page in from 9P
- *
- * @filp: file being read
+ * @file: file being read
* @page: structure to page
*
*/
-
-static int v9fs_vfs_readpage(struct file *filp, struct page *page)
+static int v9fs_vfs_readpage(struct file *file, struct page *page)
{
- return v9fs_fid_readpage(filp->private_data, page);
+ return netfs_readpage(file, page, &v9fs_req_ops, NULL);
}
/**
- * v9fs_vfs_readpages - read a set of pages from 9P
- *
- * @filp: file being read
- * @mapping: the address space
- * @pages: list of pages to read
- * @nr_pages: count of pages to read
- *
+ * v9fs_vfs_readahead - read a set of pages from 9P
+ * @ractl: The readahead parameters
*/
-
-static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void v9fs_vfs_readahead(struct readahead_control *ractl)
{
- int ret = 0;
- struct inode *inode;
-
- inode = mapping->host;
- p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
-
- ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
- if (ret == 0)
- return ret;
-
- ret = read_cache_pages(mapping, pages, v9fs_fid_readpage,
- filp->private_data);
- p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
- return ret;
+ netfs_readahead(ractl, &v9fs_req_ops, NULL);
}
/**
* v9fs_release_page - release the private state associated with a page
+ * @page: The page to be released
+ * @gfp: The caller's allocation restrictions
*
* Returns 1 if the page can be released, false otherwise.
*/
@@ -124,34 +132,36 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
{
if (PagePrivate(page))
return 0;
- return v9fs_fscache_release_page(page, gfp);
+#ifdef CONFIG_9P_FSCACHE
+ if (PageFsCache(page)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
+ return 0;
+ wait_on_page_fscache(page);
+ }
+#endif
+ return 1;
}
/**
* v9fs_invalidate_page - Invalidate a page completely or partially
- *
- * @page: structure to page
- * @offset: offset in the page
+ * @page: The page to be invalidated
+ * @offset: offset of the invalidated region
+ * @length: length of the invalidated region
*/
static void v9fs_invalidate_page(struct page *page, unsigned int offset,
unsigned int length)
{
- /*
- * If called with zero offset, we should release
- * the private state assocated with the page
- */
- if (offset == 0 && length == PAGE_SIZE)
- v9fs_fscache_invalidate_page(page);
+ wait_on_page_fscache(page);
}
static int v9fs_vfs_writepage_locked(struct page *page)
{
struct inode *inode = page->mapping->host;
struct v9fs_inode *v9inode = V9FS_I(inode);
+ loff_t start = page_offset(page);
loff_t size = i_size_read(inode);
struct iov_iter from;
- struct bio_vec bvec;
int err, len;
if (page->index == size >> PAGE_SHIFT)
@@ -159,17 +169,14 @@ static int v9fs_vfs_writepage_locked(struct page *page)
else
len = PAGE_SIZE;
- bvec.bv_page = page;
- bvec.bv_offset = 0;
- bvec.bv_len = len;
- iov_iter_bvec(&from, WRITE, &bvec, 1, len);
+ iov_iter_xarray(&from, WRITE, &page->mapping->i_pages, start, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
set_page_writeback(page);
- p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err);
+ p9_client_write(v9inode->writeback_fid, start, &from, &err);
end_page_writeback(page);
return err;
@@ -199,26 +206,28 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
/**
* v9fs_launder_page - Writeback a dirty page
+ * @page: The page to be cleaned up
+ *
* Returns 0 on success.
*/
static int v9fs_launder_page(struct page *page)
{
int retval;
- struct inode *inode = page->mapping->host;
- v9fs_fscache_wait_on_page_write(inode, page);
if (clear_page_dirty_for_io(page)) {
retval = v9fs_vfs_writepage_locked(page);
if (retval)
return retval;
}
+ wait_on_page_fscache(page);
return 0;
}
/**
* v9fs_direct_IO - 9P address space operation for direct I/O
* @iocb: target I/O control block
+ * @iter: The data/buffer to use
*
* The presence of v9fs_direct_IO() in the address space ops vector
* allowes open() O_DIRECT flags which would have failed otherwise.
@@ -238,11 +247,13 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t pos = iocb->ki_pos;
ssize_t n;
int err = 0;
+
if (iov_iter_rw(iter) == WRITE) {
n = p9_client_write(file->private_data, pos, iter, &err);
if (n) {
struct inode *inode = file_inode(file);
loff_t i_size = i_size_read(inode);
+
if (pos + n > i_size)
inode_add_bytes(inode, pos + n - i_size);
}
@@ -253,43 +264,32 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned int len, unsigned int flags,
struct page **pagep, void **fsdata)
{
- int retval = 0;
+ int retval;
struct page *page;
- struct v9fs_inode *v9inode;
- pgoff_t index = pos >> PAGE_SHIFT;
- struct inode *inode = mapping->host;
-
+ struct v9fs_inode *v9inode = V9FS_I(mapping->host);
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- v9inode = V9FS_I(inode);
-start:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
- retval = -ENOMEM;
- goto out;
- }
BUG_ON(!v9inode->writeback_fid);
- if (PageUptodate(page))
- goto out;
- if (len == PAGE_SIZE)
- goto out;
+ /* Prefetch area to be written into the cache if we're caching this
+ * file. We need to do this before we get a lock on the page in case
+ * there's more than one writer competing for the same cache block.
+ */
+ retval = netfs_write_begin(filp, mapping, pos, len, flags, &page, fsdata,
+ &v9fs_req_ops, NULL);
+ if (retval < 0)
+ return retval;
- retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
- put_page(page);
- if (!retval)
- goto start;
-out:
- *pagep = page;
+ *pagep = find_subpage(page, pos / PAGE_SIZE);
return retval;
}
static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
+ loff_t pos, unsigned int len, unsigned int copied,
struct page *page, void *fsdata)
{
loff_t last_pos = pos + copied;
@@ -301,10 +301,11 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
if (unlikely(copied < len)) {
copied = 0;
goto out;
- } else if (len == PAGE_SIZE) {
- SetPageUptodate(page);
}
+
+ SetPageUptodate(page);
}
+
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold the i_mutex.
@@ -324,7 +325,7 @@ out:
const struct address_space_operations v9fs_addr_operations = {
.readpage = v9fs_vfs_readpage,
- .readpages = v9fs_vfs_readpages,
+ .readahead = v9fs_vfs_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 4b4292123b3d..1c609e99d280 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_dentry.c
- *
* This file contians vfs dentry ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -52,6 +50,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
struct hlist_node *p, *n;
+
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
@@ -76,6 +75,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
int retval;
struct v9fs_session_info *v9ses;
+
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b6a5a0be444d..8c854d8cb0cd 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_dir.c
- *
* This file contains vfs directory ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -71,6 +69,7 @@ static inline int dt_type(struct p9_wstat *mistat)
static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
{
struct p9_fid *fid = filp->private_data;
+
if (!fid->rdir)
fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
return fid->rdir;
@@ -108,6 +107,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (rdir->tail == rdir->head) {
struct iov_iter to;
int n;
+
iov_iter_kvec(&to, READ, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
@@ -233,5 +233,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
.iterate_shared = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
- .fsync = v9fs_file_fsync_dotl,
+ .fsync = v9fs_file_fsync_dotl,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index aab5e6538660..4244d48398ef 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_file.c
- *
* This file contians vfs file ops for 9P2000.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -359,14 +357,11 @@ out_err:
}
/**
- * v9fs_file_read - read from a file
- * @filp: file pointer to read
- * @udata: user data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
+ * v9fs_file_read_iter - read from a file
+ * @iocb: The operation parameters
+ * @to: The buffer to read into
*
*/
-
static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -388,11 +383,9 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
/**
- * v9fs_file_write - write to a file
- * @filp: file pointer to write
- * @data: data buffer to write data from
- * @count: size of buffer
- * @offset: offset at which to write data
+ * v9fs_file_write_iter - write to a file
+ * @iocb: The operation parameters
+ * @from: The data to write
*
*/
static ssize_t
@@ -413,6 +406,7 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
loff_t i_size;
unsigned long pg_start, pg_end;
+
pg_start = origin >> PAGE_SHIFT;
pg_end = (origin + retval - 1) >> PAGE_SHIFT;
if (inode->i_mapping && inode->i_mapping->nrpages)
@@ -542,14 +536,23 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
page, (unsigned long)filp->private_data);
+ v9inode = V9FS_I(inode);
+
+ /* Wait for the page to be written to the cache before we allow it to
+ * be modified. We then assume the entire page will need writing back.
+ */
+#ifdef CONFIG_9P_FSCACHE
+ if (PageFsCache(page) &&
+ wait_on_page_fscache_killable(page) < 0)
+ return VM_FAULT_RETRY;
+#endif
+
/* Update file times before taking page lock */
file_update_time(filp);
- v9inode = V9FS_I(inode);
- /* make sure the cache has finished storing the page */
- v9fs_fscache_wait_on_page_write(inode, page);
BUG_ON(!v9inode->writeback_fid);
- lock_page(page);
+ if (lock_page_killable(page) < 0)
+ return VM_FAULT_RETRY;
if (page->mapping != inode->i_mapping)
goto out_unlock;
wait_for_stable_page(page);
@@ -561,11 +564,9 @@ out_unlock:
}
/**
- * v9fs_mmap_file_read - read from a file
- * @filp: file pointer to read
- * @data: user data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
+ * v9fs_mmap_file_read_iter - read from a file
+ * @iocb: The operation parameters
+ * @to: The buffer to read into
*
*/
static ssize_t
@@ -576,11 +577,9 @@ v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
/**
- * v9fs_mmap_file_write - write to a file
- * @filp: file pointer to write
- * @data: data buffer to write data from
- * @count: size of buffer
- * @offset: offset at which to write data
+ * v9fs_mmap_file_write_iter - write to a file
+ * @iocb: The operation parameters
+ * @from: The data to write
*
*/
static ssize_t
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 795706520b5e..328c338ff304 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_inode.c
- *
* This file contains vfs inode ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -49,6 +47,7 @@ static const struct inode_operations v9fs_symlink_inode_operations;
static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
{
int res;
+
res = mode & 0777;
if (S_ISDIR(mode))
res |= P9_DMDIR;
@@ -110,7 +109,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses,
static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
struct p9_wstat *stat, dev_t *rdev)
{
- int res;
+ int res, r;
u32 mode = stat->mode;
*rdev = 0;
@@ -128,11 +127,16 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
res |= S_IFIFO;
else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0)) {
- char type = 0, ext[32];
+ char type = 0;
int major = -1, minor = -1;
- strlcpy(ext, stat->extension, sizeof(ext));
- sscanf(ext, "%c %i %i", &type, &major, &minor);
+ r = sscanf(stat->extension, "%c %i %i", &type, &major, &minor);
+ if (r != 3) {
+ p9_debug(P9_DEBUG_ERROR,
+ "invalid device string, umode will be bogus: %s\n",
+ stat->extension);
+ return res;
+ }
switch (type) {
case 'c':
res |= S_IFCHR;
@@ -218,11 +222,12 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
/**
* v9fs_alloc_inode - helper function to allocate an inode
- *
+ * @sb: The superblock to allocate the inode from
*/
struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
+
v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
@@ -238,7 +243,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
/**
* v9fs_free_inode - destroy an inode
- *
+ * @inode: The inode to be freed
*/
void v9fs_free_inode(struct inode *inode)
@@ -251,7 +256,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
{
int err = 0;
- inode_init_owner(&init_user_ns,inode, NULL, mode);
+ inode_init_owner(&init_user_ns, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -343,7 +348,7 @@ error:
* v9fs_get_inode - helper function to setup an inode
* @sb: superblock
* @mode: mode to setup inode with
- *
+ * @rdev: The device numbers to set
*/
struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
@@ -369,7 +374,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
}
/**
- * v9fs_clear_inode - release an inode
+ * v9fs_evict_inode - Remove an inode from the inode cache
* @inode: inode to release
*
*/
@@ -440,7 +445,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *, void *);
+ int (*test)(struct inode *inode, void *data);
if (new)
test = v9fs_test_new_inode;
@@ -499,8 +504,10 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
static int v9fs_at_to_dotl_flags(int flags)
{
int rflags = 0;
+
if (flags & AT_REMOVEDIR)
rflags |= P9_DOTL_AT_REMOVEDIR;
+
return rflags;
}
@@ -665,14 +672,15 @@ error:
/**
* v9fs_vfs_create - VFS hook to create a regular file
+ * @mnt_userns: The user namespace of the mount
+ * @dir: The parent directory
+ * @dentry: The name of file to be created
+ * @mode: The UNIX file mode to set
+ * @excl: True if the file must not yet exist
*
* open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called
* for mknod(2).
*
- * @dir: directory inode that is being created
- * @dentry: dentry that is being deleted
- * @mode: create permissions
- *
*/
static int
@@ -696,6 +704,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
/**
* v9fs_vfs_mkdir - VFS mkdir hook to create a directory
+ * @mnt_userns: The user namespace of the mount
* @dir: inode that is being unlinked
* @dentry: dentry that is being unlinked
* @mode: mode for new directory
@@ -795,7 +804,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t mode)
+ struct file *file, unsigned int flags, umode_t mode)
{
int err;
u32 perm;
@@ -900,10 +909,12 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
/**
* v9fs_vfs_rename - VFS hook to rename an inode
+ * @mnt_userns: The user namespace of the mount
* @old_dir: old dir inode
* @old_dentry: old dentry
* @new_dir: new dir inode
* @new_dentry: new dentry
+ * @flags: RENAME_* flags
*
*/
@@ -1009,6 +1020,7 @@ done:
/**
* v9fs_vfs_getattr - retrieve file metadata
+ * @mnt_userns: The user namespace of the mount
* @path: Object to query
* @stat: metadata structure to populate
* @request_mask: Mask of STATX_xxx flags indicating the caller's interests
@@ -1050,6 +1062,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
/**
* v9fs_vfs_setattr - set file metadata
+ * @mnt_userns: The user namespace of the mount
* @dentry: file whose metadata to set
* @iattr: metadata assignment structure
*
@@ -1078,7 +1091,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
fid = v9fs_fid_lookup(dentry);
use_dentry = 1;
}
- if(IS_ERR(fid))
+ if (IS_ERR(fid))
return PTR_ERR(fid);
v9fs_blank_wstat(&wstat);
@@ -1285,6 +1298,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
/**
* v9fs_vfs_symlink - helper function to create symlinks
+ * @mnt_userns: The user namespace of the mount
* @dir: directory inode containing symlink
* @dentry: dentry for symlink
* @symname: symlink data
@@ -1340,6 +1354,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
/**
* v9fs_vfs_mknod - create a special file
+ * @mnt_userns: The user namespace of the mount
* @dir: inode destination for new link
* @dentry: dentry for file
* @mode: mode for creation
@@ -1356,7 +1371,7 @@ v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1];
u32 perm;
- p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n",
dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e1c0240b51c0..7dee89ba32e7 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_inode_dotl.c
- *
* This file contains vfs inode ops for the 9P2000.L protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -37,7 +35,10 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t omode, dev_t rdev);
/**
- * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object
+ * @dir_inode: The directory inode
+ *
+ * Helper function to get the gid for creating a
* new file system object. This checks the S_ISGID to determine the owning
* group of the new file system object.
*/
@@ -104,7 +105,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *, void *);
+ int (*test)(struct inode *inode, void *data);
if (new)
test = v9fs_test_new_inode_dotl;
@@ -211,12 +212,13 @@ int v9fs_open_to_dotl_flags(int flags)
/**
* v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @mnt_userns: The user namespace of the mount
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
* @omode: create permissions
+ * @excl: True if the file must not yet exist
*
*/
-
static int
v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t omode, bool excl)
@@ -226,7 +228,7 @@ v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir,
static int
v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t omode)
+ struct file *file, unsigned int flags, umode_t omode)
{
int err = 0;
kgid_t gid;
@@ -257,7 +259,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
v9ses = v9fs_inode2v9ses(dir);
name = dentry->d_name.name;
- p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
+ p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%x\n",
name, flags, omode);
dfid = v9fs_parent_fid(dentry);
@@ -361,6 +363,7 @@ err_clunk_old_fid:
/**
* v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @mnt_userns: The user namespace of the mount
* @dir: inode that is being unlinked
* @dentry: dentry that is being unlinked
* @omode: mode for new directory
@@ -537,6 +540,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
/**
* v9fs_vfs_setattr_dotl - set file metadata
+ * @mnt_userns: The user namespace of the mount
* @dentry: file whose metadata to set
* @iattr: metadata assignment structure
*
@@ -801,6 +805,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
/* Get the latest stat info from server. */
struct p9_fid *fid;
+
fid = v9fs_fid_lookup(old_dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -816,6 +821,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
/**
* v9fs_vfs_mknod_dotl - create a special file
+ * @mnt_userns: The user namespace of the mount
* @dir: inode destination for new link
* @dentry: dentry for file
* @omode: mode for creation
@@ -836,7 +842,7 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
struct p9_qid qid;
struct posix_acl *dacl = NULL, *pacl = NULL;
- p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n",
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 5fce6e30bc5a..b739e02f5ef7 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -1,9 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_super.c
- *
- * This file contians superblock ops for 9P2000. It is intended that
- * you mount this file system on directories.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
@@ -83,6 +79,9 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (!v9ses->cache) {
sb->s_bdi->ra_pages = 0;
sb->s_bdi->io_pages = 0;
+ } else {
+ sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT;
+ sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT;
}
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
@@ -113,7 +112,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- umode_t mode = S_IRWXUGO | S_ISVTX;
+ umode_t mode = 0777 | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
@@ -157,6 +156,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
sb->s_root = root;
if (v9fs_proto_dotl(v9ses)) {
struct p9_stat_dotl *st = NULL;
+
st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
if (IS_ERR(st)) {
retval = PTR_ERR(st);
@@ -167,6 +167,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
kfree(st);
} else {
struct p9_wstat *st = NULL;
+
st = p9_client_stat(fid);
if (IS_ERR(st)) {
retval = PTR_ERR(st);
@@ -275,12 +276,13 @@ done:
static int v9fs_drop_inode(struct inode *inode)
{
struct v9fs_session_info *v9ses;
+
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
return generic_drop_inode(inode);
/*
* in case of non cached mode always drop the
- * the inode because we want the inode attribute
+ * inode because we want the inode attribute
* to always match that on the server.
*/
return 1;
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index ee331845e2c7..a824441b95a2 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/module.h>
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index c63c3bea5de5..3e11fc3331eb 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#ifndef FS_9P_XATTR_H
#define FS_9P_XATTR_H
@@ -22,13 +14,14 @@ extern const struct xattr_handler *v9fs_xattr_handlers[];
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
-extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
- void *, size_t);
-extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
- void *, size_t);
-extern int v9fs_fid_xattr_set(struct p9_fid *, const char *,
- const void *, size_t, int);
-extern int v9fs_xattr_set(struct dentry *, const char *,
- const void *, size_t, int);
-extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
+ void *buffer, size_t buffer_size);
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t buffer_size);
+int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
+ const void *value, size_t value_len, int flags);
+int v9fs_xattr_set(struct dentry *dentry, const char *name,
+ const void *value, size_t value_len, int flags);
+ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
#endif /* FS_9P_XATTR_H */
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c6c2a513ec92..c609005a9eaa 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ size = bdev_nr_sectors(sb->s_bdev);
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 7d9b23d981bf..1b4d5809808d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -21,6 +21,37 @@
#include "internal.h"
/*
+ * Handle invalidation of an mmap'd file. We invalidate all the PTEs referring
+ * to the pages in this file's pagecache, forcing the kernel to go through
+ * ->fault() or ->page_mkwrite() - at which point we can handle invalidation
+ * more fully.
+ */
+void afs_invalidate_mmap_work(struct work_struct *work)
+{
+ struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work);
+
+ unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false);
+}
+
+void afs_server_init_callback_work(struct work_struct *work)
+{
+ struct afs_server *server = container_of(work, struct afs_server, initcb_work);
+ struct afs_vnode *vnode;
+ struct afs_cell *cell = server->cell;
+
+ down_read(&cell->fs_open_mmaps_lock);
+
+ list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) {
+ if (vnode->cb_server == server) {
+ clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ queue_work(system_unbound_wq, &vnode->cb_work);
+ }
+ }
+
+ up_read(&cell->fs_open_mmaps_lock);
+}
+
+/*
* Allow the fileserver to request callback state (re-)initialisation.
* Unfortunately, UUIDs are not guaranteed unique.
*/
@@ -29,8 +60,11 @@ void afs_init_callback_state(struct afs_server *server)
rcu_read_lock();
do {
server->cb_s_break++;
- server = rcu_dereference(server->uuid_next);
- } while (0);
+ atomic_inc(&server->cell->fs_s_break);
+ if (!list_empty(&server->cell->fs_open_mmaps))
+ queue_work(system_unbound_wq, &server->initcb_work);
+
+ } while ((server = rcu_dereference(server->uuid_next)));
rcu_read_unlock();
}
@@ -44,11 +78,17 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
vnode->cb_break++;
+ vnode->cb_v_break = vnode->volume->cb_v_break;
afs_clear_permits(vnode);
if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
afs_lock_may_be_available(vnode);
+ if (reason != afs_cb_break_for_deleted &&
+ vnode->status.type == AFS_FTYPE_FILE &&
+ atomic_read(&vnode->cb_nr_mmap))
+ queue_work(system_unbound_wq, &vnode->cb_work);
+
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
} else {
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, false);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 887b673f6223..d88407fb9bc0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -166,6 +166,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
seqlock_init(&cell->volume_lock);
cell->fs_servers = RB_ROOT;
seqlock_init(&cell->fs_lock);
+ INIT_LIST_HEAD(&cell->fs_open_mmaps);
+ init_rwsem(&cell->fs_open_mmaps_lock);
rwlock_init(&cell->vl_servers_lock);
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ac829e63c570..4579bbda4634 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1077,9 +1077,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
*/
static int afs_d_revalidate_rcu(struct dentry *dentry)
{
- struct afs_vnode *dvnode, *vnode;
+ struct afs_vnode *dvnode;
struct dentry *parent;
- struct inode *dir, *inode;
+ struct inode *dir;
long dir_version, de_version;
_enter("%p", dentry);
@@ -1109,18 +1109,6 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
return -ECHILD;
}
- /* Check to see if the vnode referred to by the dentry still
- * has a callback.
- */
- if (d_really_is_positive(dentry)) {
- inode = d_inode_rcu(dentry);
- if (inode) {
- vnode = AFS_FS_I(inode);
- if (!afs_check_validity(vnode))
- return -ECHILD;
- }
- }
-
return 1; /* Still valid */
}
@@ -1156,17 +1144,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (IS_ERR(key))
key = NULL;
- if (d_really_is_positive(dentry)) {
- inode = d_inode(dentry);
- if (inode) {
- vnode = AFS_FS_I(inode);
- afs_validate(vnode, key);
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
- goto out_bad;
- }
- }
-
- /* lock down the parent dentry so we can peer at it */
+ /* Hold the parent dentry so we can peer at it */
parent = dget_parent(dentry);
dir = AFS_FS_I(d_inode(parent));
@@ -1175,7 +1153,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
_debug("%pd: parent dir deleted", dentry);
- goto out_bad_parent;
+ goto not_found;
}
/* We only need to invalidate a dentry if the server's copy changed
@@ -1201,12 +1179,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
case 0:
/* the filename maps to something */
if (d_really_is_negative(dentry))
- goto out_bad_parent;
+ goto not_found;
inode = d_inode(dentry);
if (is_bad_inode(inode)) {
printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
dentry);
- goto out_bad_parent;
+ goto not_found;
}
vnode = AFS_FS_I(inode);
@@ -1228,9 +1206,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
dentry, fid.unique,
vnode->fid.unique,
vnode->vfs_inode.i_generation);
- write_seqlock(&vnode->cb_lock);
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- write_sequnlock(&vnode->cb_lock);
goto not_found;
}
goto out_valid;
@@ -1245,7 +1220,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
default:
_debug("failed to iterate dir %pd: %d",
parent, ret);
- goto out_bad_parent;
+ goto not_found;
}
out_valid:
@@ -1256,16 +1231,9 @@ out_valid_noupdate:
_leave(" = 1 [valid]");
return 1;
- /* the dirent, if it exists, now points to a different vnode */
not_found:
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_NFSFS_RENAMED;
- spin_unlock(&dentry->d_lock);
-
-out_bad_parent:
_debug("dropping dentry %pd2", dentry);
dput(parent);
-out_bad:
key_put(key);
_leave(" = 0 [bad]");
@@ -1792,6 +1760,10 @@ static int afs_link(struct dentry *from, struct inode *dir,
goto error;
}
+ ret = afs_validate(vnode, op->key);
+ if (ret < 0)
+ goto error_op;
+
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
op->file[0].dv_delta = 1;
@@ -1805,6 +1777,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
op->create.reason = afs_edit_dir_for_link;
return afs_do_sync_operation(op);
+error_op:
+ afs_put_operation(op);
error:
d_drop(dentry);
_leave(" = %d", ret);
@@ -1989,6 +1963,11 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
if (IS_ERR(op))
return PTR_ERR(op);
+ ret = afs_validate(vnode, op->key);
+ op->error = ret;
+ if (ret < 0)
+ goto error;
+
afs_op_set_vnode(op, 0, orig_dvnode);
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index f4600c1353ad..540b9fc96824 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -263,7 +263,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
if (b == nr_blocks) {
_debug("init %u", b);
afs_edit_init_block(meta, block, b);
- i_size_write(&vnode->vfs_inode, (b + 1) * AFS_DIR_BLOCK_SIZE);
+ afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE);
}
/* Only lower dir pages have a counter in the header. */
@@ -296,7 +296,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
new_directory:
afs_edit_init_block(meta, meta, 0);
i_size = AFS_DIR_BLOCK_SIZE;
- i_size_write(&vnode->vfs_inode, i_size);
+ afs_set_i_size(vnode, i_size);
slot = AFS_DIR_RESV_BLOCKS0;
page = page0;
block = meta;
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index dae9a57d7ec0..45cfd50a9521 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -86,8 +86,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
return afs_do_sync_operation(op);
}
-/**
- * afs_sillyrename - Perform a silly-rename of a dentry
+/*
+ * Perform silly-rename of a dentry.
*
* AFS is stateless and the server doesn't know when the client is holding a
* file open. To prevent application problems when a file is unlinked while
diff --git a/fs/afs/file.c b/fs/afs/file.c
index db035ae2a134..eb11d047c0ae 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,17 +19,22 @@
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_readpage(struct file *file, struct page *page);
+static int afs_symlink_readpage(struct file *file, struct page *page);
static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static void afs_readahead(struct readahead_control *ractl);
+static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+static void afs_vm_open(struct vm_area_struct *area);
+static void afs_vm_close(struct vm_area_struct *area);
+static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff);
const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
.llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = afs_file_read_iter,
.write_iter = afs_file_write,
.mmap = afs_file_mmap,
.splice_read = generic_file_splice_read,
@@ -45,7 +50,7 @@ const struct inode_operations afs_file_inode_operations = {
.permission = afs_permission,
};
-const struct address_space_operations afs_fs_aops = {
+const struct address_space_operations afs_file_aops = {
.readpage = afs_readpage,
.readahead = afs_readahead,
.set_page_dirty = afs_set_page_dirty,
@@ -58,9 +63,17 @@ const struct address_space_operations afs_fs_aops = {
.writepages = afs_writepages,
};
+const struct address_space_operations afs_symlink_aops = {
+ .readpage = afs_symlink_readpage,
+ .releasepage = afs_releasepage,
+ .invalidatepage = afs_invalidatepage,
+};
+
static const struct vm_operations_struct afs_vm_ops = {
+ .open = afs_vm_open,
+ .close = afs_vm_close,
.fault = filemap_fault,
- .map_pages = filemap_map_pages,
+ .map_pages = afs_vm_map_pages,
.page_mkwrite = afs_page_mkwrite,
};
@@ -295,7 +308,7 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
fsreq->subreq = subreq;
fsreq->pos = subreq->start + subreq->transferred;
fsreq->len = subreq->len - subreq->transferred;
- fsreq->key = subreq->rreq->netfs_priv;
+ fsreq->key = key_get(subreq->rreq->netfs_priv);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
@@ -304,9 +317,10 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
fsreq->pos, fsreq->len);
afs_fetch_data(fsreq->vnode, fsreq);
+ afs_put_read(fsreq);
}
-static int afs_symlink_readpage(struct page *page)
+static int afs_symlink_readpage(struct file *file, struct page *page)
{
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
struct afs_read *fsreq;
@@ -371,9 +385,6 @@ const struct netfs_read_request_ops afs_req_ops = {
static int afs_readpage(struct file *file, struct page *page)
{
- if (!file)
- return afs_symlink_readpage(page);
-
return netfs_readpage(file, page, &afs_req_ops, NULL);
}
@@ -490,15 +501,88 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
return 1;
}
+static void afs_add_open_mmap(struct afs_vnode *vnode)
+{
+ if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
+ down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+
+ list_add_tail(&vnode->cb_mmap_link,
+ &vnode->volume->cell->fs_open_mmaps);
+
+ up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ }
+}
+
+static void afs_drop_open_mmap(struct afs_vnode *vnode)
+{
+ if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
+ return;
+
+ down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+
+ if (atomic_read(&vnode->cb_nr_mmap) == 0)
+ list_del_init(&vnode->cb_mmap_link);
+
+ up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ flush_work(&vnode->cb_work);
+}
+
/*
* Handle setting up a memory mapping on an AFS file.
*/
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
int ret;
+ afs_add_open_mmap(vnode);
+
ret = generic_file_mmap(file, vma);
if (ret == 0)
vma->vm_ops = &afs_vm_ops;
+ else
+ afs_drop_open_mmap(vnode);
return ret;
}
+
+static void afs_vm_open(struct vm_area_struct *vma)
+{
+ afs_add_open_mmap(AFS_FS_I(file_inode(vma->vm_file)));
+}
+
+static void afs_vm_close(struct vm_area_struct *vma)
+{
+ afs_drop_open_mmap(AFS_FS_I(file_inode(vma->vm_file)));
+}
+
+static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
+ struct afs_file *af = vmf->vma->vm_file->private_data;
+
+ switch (afs_validate(vnode, af->key)) {
+ case 0:
+ return filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ case -ENOMEM:
+ return VM_FAULT_OOM;
+ case -EINTR:
+ case -ERESTARTSYS:
+ return VM_FAULT_RETRY;
+ case -ESTALE:
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+}
+
+static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+ struct afs_file *af = iocb->ki_filp->private_data;
+ int ret;
+
+ ret = afs_validate(vnode, af->key);
+ if (ret < 0)
+ return ret;
+
+ return generic_file_read_iter(iocb, iter);
+}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index e7e98ad63a91..c0031a3ab42f 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -9,6 +9,7 @@
#include <linux/slab.h>
#include "afs_fs.h"
#include "internal.h"
+#include "protocol_afs.h"
#include "protocol_yfs.h"
static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
@@ -102,7 +103,7 @@ void afs_fileserver_probe_result(struct afs_call *call)
struct afs_addr_list *alist = call->alist;
struct afs_server *server = call->server;
unsigned int index = call->addr_ix;
- unsigned int rtt_us = 0;
+ unsigned int rtt_us = 0, cap0;
int ret = call->error;
_enter("%pU,%u", &server->uuid, index);
@@ -159,6 +160,11 @@ responded:
clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
}
+ cap0 = ntohl(call->tmp);
+ if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
+ set_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
+ else
+ clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
}
if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index dd3f45d906d2..4943413d9c5f 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -456,9 +456,7 @@ void afs_fs_fetch_data(struct afs_operation *op)
struct afs_read *req = op->fetch.req;
__be32 *bp;
- if (upper_32_bits(req->pos) ||
- upper_32_bits(req->len) ||
- upper_32_bits(req->pos + req->len))
+ if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
return afs_fs_fetch_data64(op);
_enter("");
@@ -1113,9 +1111,7 @@ void afs_fs_store_data(struct afs_operation *op)
(unsigned long long)op->store.pos,
(unsigned long long)op->store.i_size);
- if (upper_32_bits(op->store.pos) ||
- upper_32_bits(op->store.size) ||
- upper_32_bits(op->store.i_size))
+ if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
return afs_fs_store_data64(op);
call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData,
@@ -1229,7 +1225,7 @@ static void afs_fs_setattr_size(struct afs_operation *op)
key_serial(op->key), vp->fid.vid, vp->fid.vnode);
ASSERT(attr->ia_valid & ATTR_SIZE);
- if (upper_32_bits(attr->ia_size))
+ if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
return afs_fs_setattr_size64(op);
call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData_as_Status,
@@ -1657,20 +1653,33 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
return ret;
count = ntohl(call->tmp);
-
call->count = count;
call->count2 = count;
- afs_extract_discard(call, count * sizeof(__be32));
+ if (count == 0) {
+ call->unmarshall = 4;
+ call->tmp = 0;
+ break;
+ }
+
+ /* Extract the first word of the capabilities to call->tmp */
+ afs_extract_to_tmp(call);
call->unmarshall++;
fallthrough;
- /* Extract capabilities words */
case 2:
ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
- /* TODO: Examine capabilities */
+ afs_extract_discard(call, (count - 1) * sizeof(__be32));
+ call->unmarshall++;
+ fallthrough;
+
+ /* Extract remaining capabilities words */
+ case 3:
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
call->unmarshall++;
break;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 80b6c8d967d5..16906eb592d9 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -54,16 +54,6 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
}
/*
- * Set the file size and block count. Estimate the number of 512 bytes blocks
- * used, rounded up to nearest 1K for consistency with other AFS clients.
- */
-static void afs_set_i_size(struct afs_vnode *vnode, u64 size)
-{
- i_size_write(&vnode->vfs_inode, size);
- vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
-}
-
-/*
* Initialise an inode from the vnode status.
*/
static int afs_inode_init_from_status(struct afs_operation *op,
@@ -105,7 +95,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFREG | (status->mode & S_IALLUGO);
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_file_aops;
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
@@ -123,11 +113,11 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFDIR | 0555;
inode->i_op = &afs_mntpt_inode_operations;
inode->i_fop = &afs_mntpt_file_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_symlink_aops;
} else {
inode->i_mode = S_IFLNK | status->mode;
inode->i_op = &afs_symlink_inode_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_symlink_aops;
}
inode_nohighmem(inode);
break;
@@ -587,22 +577,32 @@ static void afs_zap_data(struct afs_vnode *vnode)
}
/*
- * Get the server reinit counter for a vnode's current server.
+ * Check to see if we have a server currently serving this volume and that it
+ * hasn't been reinitialised or dropped from the list.
*/
-static bool afs_get_s_break_rcu(struct afs_vnode *vnode, unsigned int *_s_break)
+static bool afs_check_server_good(struct afs_vnode *vnode)
{
- struct afs_server_list *slist = rcu_dereference(vnode->volume->servers);
+ struct afs_server_list *slist;
struct afs_server *server;
+ bool good;
int i;
+ if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
+ return true;
+
+ rcu_read_lock();
+
+ slist = rcu_dereference(vnode->volume->servers);
for (i = 0; i < slist->nr_servers; i++) {
server = slist->servers[i].server;
if (server == vnode->cb_server) {
- *_s_break = READ_ONCE(server->cb_s_break);
- return true;
+ good = (vnode->cb_s_break == server->cb_s_break);
+ rcu_read_unlock();
+ return good;
}
}
+ rcu_read_unlock();
return false;
}
@@ -611,57 +611,46 @@ static bool afs_get_s_break_rcu(struct afs_vnode *vnode, unsigned int *_s_break)
*/
bool afs_check_validity(struct afs_vnode *vnode)
{
- struct afs_volume *volume = vnode->volume;
enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
time64_t now = ktime_get_real_seconds();
- bool valid;
- unsigned int cb_break, cb_s_break, cb_v_break;
+ unsigned int cb_break;
int seq = 0;
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
- cb_v_break = READ_ONCE(volume->cb_v_break);
cb_break = vnode->cb_break;
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_get_s_break_rcu(vnode, &cb_s_break)) {
- if (vnode->cb_s_break != cb_s_break ||
- vnode->cb_v_break != cb_v_break) {
- vnode->cb_s_break = cb_s_break;
- vnode->cb_v_break = cb_v_break;
- need_clear = afs_cb_break_for_vsbreak;
- valid = false;
- } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ if (vnode->cb_v_break != vnode->volume->cb_v_break)
+ need_clear = afs_cb_break_for_v_break;
+ else if (!afs_check_server_good(vnode))
+ need_clear = afs_cb_break_for_s_reinit;
+ else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
need_clear = afs_cb_break_for_zap;
- valid = false;
- } else if (vnode->cb_expires_at - 10 <= now) {
+ else if (vnode->cb_expires_at - 10 <= now)
need_clear = afs_cb_break_for_lapsed;
- valid = false;
- } else {
- valid = true;
- }
} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- valid = true;
+ ;
} else {
- vnode->cb_v_break = cb_v_break;
- valid = false;
+ need_clear = afs_cb_break_no_promise;
}
} while (need_seqretry(&vnode->cb_lock, seq));
done_seqretry(&vnode->cb_lock, seq);
- if (need_clear != afs_cb_break_no_break) {
- write_seqlock(&vnode->cb_lock);
- if (cb_break == vnode->cb_break)
- __afs_break_callback(vnode, need_clear);
- else
- trace_afs_cb_miss(&vnode->fid, need_clear);
- write_sequnlock(&vnode->cb_lock);
- valid = false;
- }
+ if (need_clear == afs_cb_break_no_break)
+ return true;
- return valid;
+ write_seqlock(&vnode->cb_lock);
+ if (need_clear == afs_cb_break_no_promise)
+ vnode->cb_v_break = vnode->volume->cb_v_break;
+ else if (cb_break == vnode->cb_break)
+ __afs_break_callback(vnode, need_clear);
+ else
+ trace_afs_cb_miss(&vnode->fid, need_clear);
+ write_sequnlock(&vnode->cb_lock);
+ return false;
}
/*
@@ -675,21 +664,20 @@ bool afs_check_validity(struct afs_vnode *vnode)
*/
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
- bool valid;
int ret;
_enter("{v={%llx:%llu} fl=%lx},%x",
vnode->fid.vid, vnode->fid.vnode, vnode->flags,
key_serial(key));
- rcu_read_lock();
- valid = afs_check_validity(vnode);
- rcu_read_unlock();
-
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
- clear_nlink(&vnode->vfs_inode);
+ if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
+ if (vnode->vfs_inode.i_nlink)
+ clear_nlink(&vnode->vfs_inode);
+ goto valid;
+ }
- if (valid)
+ if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
+ afs_check_validity(vnode))
goto valid;
down_write(&vnode->validate_lock);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5ed416f4ff33..9357c53faa69 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -390,6 +390,9 @@ struct afs_cell {
/* Active fileserver interaction state. */
struct rb_root fs_servers; /* afs_server (by server UUID) */
seqlock_t fs_lock; /* For fs_servers */
+ struct rw_semaphore fs_open_mmaps_lock;
+ struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */
+ atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */
/* VL server list. */
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -503,6 +506,7 @@ struct afs_server {
struct hlist_node addr4_link; /* Link in net->fs_addresses4 */
struct hlist_node addr6_link; /* Link in net->fs_addresses6 */
struct hlist_node proc_link; /* Link in net->fs_proc */
+ struct work_struct initcb_work; /* Work for CB.InitCallBackState* */
struct afs_server *gc_next; /* Next server in manager's list */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
@@ -516,6 +520,7 @@ struct afs_server {
#define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
+#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
atomic_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
@@ -657,7 +662,11 @@ struct afs_vnode {
afs_lock_type_t lock_type : 8;
/* outstanding callback notification on this file */
+ struct work_struct cb_work; /* Work for mmap'd files */
+ struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */
void *cb_server; /* Server with callback/filelock */
+ atomic_t cb_nr_mmap; /* Number of mmaps */
+ unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */
unsigned int cb_s_break; /* Mass break counter on ->server */
unsigned int cb_v_break; /* Mass break counter on ->volume */
unsigned int cb_break; /* Break counter on vnode */
@@ -965,6 +974,8 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
/*
* callback.c
*/
+extern void afs_invalidate_mmap_work(struct work_struct *);
+extern void afs_server_init_callback_work(struct work_struct *work);
extern void afs_init_callback_state(struct afs_server *);
extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
@@ -1044,7 +1055,8 @@ extern void afs_dynroot_depopulate(struct super_block *);
/*
* file.c
*/
-extern const struct address_space_operations afs_fs_aops;
+extern const struct address_space_operations afs_file_aops;
+extern const struct address_space_operations afs_symlink_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
extern const struct netfs_read_request_ops afs_req_ops;
@@ -1586,6 +1598,16 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
}
/*
+ * Set the file size and block count. Estimate the number of 512 bytes blocks
+ * used, rounded up to nearest 1K for consistency with other AFS clients.
+ */
+static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)
+{
+ i_size_write(&vnode->vfs_inode, size);
+ vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+}
+
+/*
* Check for a conflicting operation on a directory that we just unlinked from.
* If someone managed to sneak a link or an unlink in on the file we just
* unlinked, we won't be able to trust nlink on an AFS file (but not YFS).
diff --git a/fs/afs/protocol_afs.h b/fs/afs/protocol_afs.h
new file mode 100644
index 000000000000..0c39358c8b70
--- /dev/null
+++ b/fs/afs/protocol_afs.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* AFS protocol bits
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+
+#define AFSCAPABILITIESMAX 196 /* Maximum number of words in a capability set */
+
+/* AFS3 Fileserver capabilities word 0 */
+#define AFS3_VICED_CAPABILITY_ERRORTRANS 0x0001 /* Uses UAE errors */
+#define AFS3_VICED_CAPABILITY_64BITFILES 0x0002 /* FetchData64 & StoreData64 supported */
+#define AFS3_VICED_CAPABILITY_WRITELOCKACL 0x0004 /* Can lock a file even without lock perm */
+#define AFS3_VICED_CAPABILITY_SANEACLS 0x0008 /* ACLs reviewed for sanity - don't use */
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index b5bd03b1d3c7..e4cd89c44c46 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -168,3 +168,9 @@ enum yfs_lock_type {
yfs_LockMandatoryWrite = 0x101,
yfs_LockMandatoryExtend = 0x102,
};
+
+/* RXYFS Viced Capability Flags */
+#define YFS_VICED_CAPABILITY_ERRORTRANS 0x0001 /* Deprecated v0.195 */
+#define YFS_VICED_CAPABILITY_64BITFILES 0x0002 /* Deprecated v0.195 */
+#define YFS_VICED_CAPABILITY_WRITELOCKACL 0x0004 /* Can lock a file even without lock perm */
+#define YFS_VICED_CAPABILITY_SANEACLS 0x0008 /* Deprecated v0.195 */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index d83f13c44b92..79e1a5f6701b 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -374,6 +374,7 @@ selected_server:
if (vnode->cb_server != server) {
vnode->cb_server = server;
vnode->cb_s_break = server->cb_s_break;
+ vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
vnode->cb_v_break = vnode->volume->cb_v_break;
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 684a2b02b9ff..6e5b9a19b234 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -235,6 +235,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
server->addr_version = alist->version;
server->uuid = *uuid;
rwlock_init(&server->fs_lock);
+ INIT_WORK(&server->initcb_work, afs_server_init_callback_work);
init_waitqueue_head(&server->probe_wq);
INIT_LIST_HEAD(&server->probe_link);
spin_lock_init(&server->probe_lock);
@@ -467,6 +468,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
afs_give_up_callbacks(net, server);
+ flush_work(&server->initcb_work);
afs_put_server(net, server, afs_server_trace_destroy);
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e38bb1e7a4d2..d110def8aa8e 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -698,6 +698,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
vnode->lock_state = AFS_VNODE_LOCK_NONE;
init_rwsem(&vnode->rmdir_lock);
+ INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work);
_leave(" = %p", &vnode->vfs_inode);
return &vnode->vfs_inode;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c0534697268e..8b1d9c2f6bec 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -137,7 +137,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_seqlock(&vnode->cb_lock);
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size)
- i_size_write(&vnode->vfs_inode, maybe_i_size);
+ afs_set_i_size(vnode, maybe_i_size);
write_sequnlock(&vnode->cb_lock);
}
@@ -471,13 +471,18 @@ static void afs_extend_writeback(struct address_space *mapping,
}
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(page != xas_reload(&xas))) {
+ put_page(page);
break;
+ }
- if (!trylock_page(page))
+ if (!trylock_page(page)) {
+ put_page(page);
break;
+ }
if (!PageDirty(page) || PageWriteback(page)) {
unlock_page(page);
+ put_page(page);
break;
}
@@ -487,6 +492,7 @@ static void afs_extend_writeback(struct address_space *mapping,
t = afs_page_dirty_to(page, priv);
if (f != 0 && !new_content) {
unlock_page(page);
+ put_page(page);
break;
}
@@ -801,6 +807,7 @@ int afs_writepages(struct address_space *mapping,
ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+ struct afs_file *af = iocb->ki_filp->private_data;
ssize_t result;
size_t count = iov_iter_count(from);
@@ -816,6 +823,10 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (!count)
return 0;
+ result = afs_validate(vnode, af->key);
+ if (result < 0)
+ return result;
+
result = generic_file_write_iter(iocb, from);
_leave(" = %zd", result);
@@ -829,13 +840,18 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
*/
int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct inode *inode = file_inode(file);
- struct afs_vnode *vnode = AFS_FS_I(inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+ struct afs_file *af = file->private_data;
+ int ret;
_enter("{%llx:%llu},{n=%pD},%d",
vnode->fid.vid, vnode->fid.vnode, file,
datasync);
+ ret = afs_validate(vnode, af->key);
+ if (ret < 0)
+ return ret;
+
return file_write_and_wait_range(file, start, end);
}
@@ -845,15 +861,19 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = thp_head(vmf->page);
+ struct folio *folio = page_folio(vmf->page);
+ struct page *page = &folio->page;
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
+ struct afs_file *af = file->private_data;
unsigned long priv;
vm_fault_t ret = VM_FAULT_RETRY;
_enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index);
+ afs_validate(vnode, af->key);
+
sb_start_pagefault(inode->i_sb);
/* Wait for the page to be written to the cache before we allow it to
@@ -865,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
goto out;
#endif
- if (wait_on_page_writeback_killable(page))
+ if (folio_wait_writeback_killable(folio))
goto out;
if (lock_page_killable(page) < 0)
@@ -875,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
* details the portion of the page we need to write back and we might
* need to redirty the page if there's a problem.
*/
- if (wait_on_page_writeback_killable(page) < 0) {
- unlock_page(page);
+ if (folio_wait_writeback_killable(folio) < 0) {
+ folio_unlock(folio);
goto out;
}
@@ -955,8 +975,7 @@ int afs_launder_page(struct page *page)
iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
trace_afs_page_dirty(vnode, tracepoint_string("launder"), page);
- ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE,
- true);
+ ret = afs_store_data(vnode, &iter, page_offset(page) + f, true);
}
trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page);
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 2b35cba8ad62..fdc7d675b4b0 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -83,25 +83,18 @@ static s64 linux_to_yfs_time(const struct timespec64 *t)
return (u64)t->tv_sec * 10000000 + t->tv_nsec/100;
}
-static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode)
-{
- struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
-
- x->mask = htonl(AFS_SET_MODE);
- x->mode = htonl(mode & S_IALLUGO);
- x->mtime_client = u64_to_xdr(0);
- x->owner = u64_to_xdr(0);
- x->group = u64_to_xdr(0);
- return bp + xdr_size(x);
-}
-
-static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t)
+static __be32 *xdr_encode_YFSStoreStatus(__be32 *bp, mode_t *mode,
+ const struct timespec64 *t)
{
struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+ mode_t masked_mode = mode ? *mode & S_IALLUGO : 0;
s64 mtime = linux_to_yfs_time(t);
+ u32 mask = AFS_SET_MTIME;
- x->mask = htonl(AFS_SET_MTIME);
- x->mode = htonl(0);
+ mask |= mode ? AFS_SET_MODE : 0;
+
+ x->mask = htonl(mask);
+ x->mode = htonl(masked_mode);
x->mtime_client = u64_to_xdr(mtime);
x->owner = u64_to_xdr(0);
x->group = u64_to_xdr(0);
@@ -576,7 +569,7 @@ void yfs_fs_create_file(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
- bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode);
+ bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
yfs_check_req(call, bp);
@@ -625,7 +618,7 @@ void yfs_fs_make_dir(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
- bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode);
+ bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
yfs_check_req(call, bp);
trace_afs_make_fs_call1(call, &dvp->fid, name);
@@ -946,6 +939,7 @@ void yfs_fs_symlink(struct afs_operation *op)
struct afs_vnode_param *dvp = &op->file[0];
struct afs_call *call;
size_t contents_sz;
+ mode_t mode = 0777;
__be32 *bp;
_enter("");
@@ -972,7 +966,7 @@ void yfs_fs_symlink(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
bp = xdr_encode_string(bp, op->create.symlink, contents_sz);
- bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO);
+ bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);
yfs_check_req(call, bp);
trace_afs_make_fs_call1(call, &dvp->fid, name);
@@ -1103,7 +1097,7 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u32(bp, YFSSTOREDATA64);
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vp->fid);
- bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime);
+ bp = xdr_encode_YFSStoreStatus(bp, NULL, &op->mtime);
bp = xdr_encode_u64(bp, op->store.pos);
bp = xdr_encode_u64(bp, op->store.size);
bp = xdr_encode_u64(bp, op->store.i_size);
diff --git a/fs/aio.c b/fs/aio.c
index 51b08ab01dff..9c81cf611d65 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -659,8 +659,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
new_nr = (table ? table->nr : 1) * 4;
spin_unlock(&mm->ioctx_lock);
- table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
- new_nr, GFP_KERNEL);
+ table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -1417,7 +1416,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long res)
{
struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
@@ -1437,7 +1436,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
}
iocb->ki_res.res = res;
- iocb->ki_res.res2 = res2;
+ iocb->ki_res.res2 = 0;
iocb_put(iocb);
}
@@ -1508,7 +1507,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- req->ki_complete(req, ret, 0);
+ req->ki_complete(req, ret);
}
}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index a280156138ed..e0c3e33c4177 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -148,6 +148,35 @@ struct file *anon_inode_getfile(const char *name,
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);
+/**
+ * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * !S_PRIVATE anon inode rather than reuse the
+ * singleton anon inode and calls the
+ * inode_init_security_anon() LSM hook. This
+ * allows for both the inode to have its own
+ * security context and for the LSM to enforce
+ * policy on the inode's creation.
+ *
+ * @name: [in] name of the "class" of the new file
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
+ * @context_inode:
+ * [in] the logical relationship with the new inode (optional)
+ *
+ * The LSM may use @context_inode in inode_init_security_anon(), but a
+ * reference to it is not held. Returns the newly created file* or an error
+ * pointer. See the anon_inode_getfile() documentation for more information.
+ */
+struct file *anon_inode_getfile_secure(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags,
+ const struct inode *context_inode)
+{
+ return __anon_inode_getfile(name, fops, priv, flags,
+ context_inode, true);
+}
+
static int __anon_inode_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 16b5fca0626e..54c1f8b8b075 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -358,7 +358,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
qstr.len = strlen(p);
offset = p - name;
}
- qstr.hash = full_name_hash(dentry, name, qstr.len);
+ qstr.hash = full_name_hash(dentry, qstr.name, qstr.len);
if (mutex_lock_interruptible(&sbi->wq_mutex)) {
kfree(name);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 69d900a8473d..f8c7f26f1fbb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -156,7 +156,7 @@ static int padzero(unsigned long elf_bss)
#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
#define STACK_ROUND(sp, items) \
(((unsigned long) (sp - items)) &~ 15UL)
-#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
+#define STACK_ALLOC(sp, len) (sp -= len)
#endif
#ifndef ELF_BASE_PLATFORM
@@ -630,7 +630,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
vaddr = eppnt->p_vaddr;
if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
- elf_type |= MAP_FIXED_NOREPLACE;
+ elf_type |= MAP_FIXED;
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
@@ -1074,20 +1074,26 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * If we are loading ET_EXEC or we have already performed
- * the ET_DYN load_addr calculations, proceed normally.
+ * The first time through the loop, load_addr_set is false:
+ * layout will be calculated. Once set, use MAP_FIXED since
+ * we know we've already safely mapped the entire region with
+ * MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (elf_ex->e_type == ET_EXEC || load_addr_set) {
+ if (load_addr_set) {
elf_flags |= MAP_FIXED;
+ } else if (elf_ex->e_type == ET_EXEC) {
+ /*
+ * This logic is run once for the first LOAD Program
+ * Header for ET_EXEC binaries. No special handling
+ * is needed.
+ */
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else if (elf_ex->e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
- * Program Headers, and to calculate the entire
- * size of the ELF mapping (total_size). (Note that
- * load_addr_set is set to true later once the
- * initial mapping is performed.)
+ * Program Headers.
*
* There are effectively two types of ET_DYN
* binaries: programs (i.e. PIE: ET_DYN with INTERP)
@@ -1108,7 +1114,7 @@ out_free_interp:
* Therefore, programs are loaded offset from
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
- * without MAP_FIXED).
+ * without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
@@ -1117,7 +1123,7 @@ out_free_interp:
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
- elf_flags |= MAP_FIXED;
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else
load_bias = 0;
@@ -1129,7 +1135,14 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
+ }
+ /*
+ * Calculate the entire size of the ELF mapping (total_size).
+ * (Note that load_addr_set is set to true later once the
+ * initial mapping is performed.)
+ */
+ if (!load_addr_set) {
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
@@ -1834,7 +1847,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
/*
* Allocate a structure for each thread.
*/
- for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+ for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
@@ -2024,7 +2037,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
if (!elf_note_info_init(info))
return 0;
- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
ets = kzalloc(sizeof(*ets), GFP_KERNEL);
if (!ets)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d8fd6030cbb..c6f588dc4a9d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
goto end_coredump;
- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
ct->task, &thread_status_size);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a3b830b8410a..444e9c89ff3e 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
@@ -144,6 +145,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
+ kfree(cache->physical_map);
kfree(cache);
}
}
@@ -902,6 +904,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cluster->refill_lock);
btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path();
if (!path) {
@@ -1484,6 +1487,21 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+/*
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ return bg1->used > bg2->used;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1508,6 +1526,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
int ret = 0;
@@ -1895,6 +1919,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
+ INIT_LIST_HEAD(&cache->active_bg_list);
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
@@ -2035,6 +2060,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
*/
if (btrfs_is_zoned(info)) {
btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
@@ -2062,15 +2089,18 @@ static int read_one_block_group(struct btrfs_fs_info *info,
link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->start)) {
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
inc_block_group_ro(cache, 1);
- } else if (cache->used == 0) {
- ASSERT(list_empty(&cache->bg_list));
- if (btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_discard_queue_work(&info->discard_ctl, cache);
- else
- btrfs_mark_bg_unused(cache);
}
+
return 0;
error:
btrfs_put_block_group(cache);
@@ -2438,6 +2468,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
+ /*
+ * New block group is likely to be used soon. Try to activate it now.
+ * Failure is OK for now.
+ */
+ btrfs_zone_activate(cache);
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2479,7 +2515,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, 0, &cache->space_info);
+ cache->bytes_super, cache->zone_unusable,
+ &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2594,7 +2631,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
if (!--cache->ro) {
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
- cache->zone_unusable = cache->alloc_offset - cache->used;
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
@@ -3143,7 +3182,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc)
+ u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_block_group *cache = NULL;
@@ -3380,36 +3419,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
*/
check_system_chunk(trans, flags);
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
}
- /*
- * If this is a system chunk allocation then stop right here and do not
- * add the chunk item to the chunk btree. This is to prevent a deadlock
- * because this system chunk allocation can be triggered while COWing
- * some extent buffer of the chunk btree and while holding a lock on a
- * parent extent buffer, in which case attempting to insert the chunk
- * item (or update the device item) would result in a deadlock on that
- * parent extent buffer. In this case defer the chunk btree updates to
- * the second phase of chunk allocation and keep our reservation until
- * the second phase completes.
- *
- * This is a rare case and can only be triggered by the very few cases
- * we have where we need to touch the chunk btree outside chunk allocation
- * and chunk removal. These cases are basically adding a device, removing
- * a device or resizing a device.
- */
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- return 0;
-
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
* previously reserved space in the system space_info and allocated one
- * new system chunk if necessary. However there are two exceptions:
+ * new system chunk if necessary. However there are three exceptions:
*
* 1) We may have enough free space in the system space_info but all the
* existing system block groups have a profile which can not be used
@@ -3435,13 +3455,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
* with enough free space got turned into RO mode by a running scrub,
* and in this case we have to allocate a new one and retry. We only
* need do this allocate and retry once, since we have a transaction
- * handle and scrub uses the commit root to search for block groups.
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
*/
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3519,7 +3546,15 @@ out:
* properly, either intentionally or as a bug. One example where this is
* done intentionally is fsync, as it does not reserve any transaction units
* and ends up allocating a variable number of metadata extents for log
- * tree extent buffers.
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
*
* We also need this 2 phases setup when adding a device to a filesystem with
* a seed device - we must create new metadata and system chunks without adding
@@ -3537,14 +3572,14 @@ out:
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the later is used before doing
+ * a modification to the chunk btree - use cases for the later are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
@@ -3581,11 +3616,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (trans->allocating_chunk)
return -ENOSPC;
/*
- * If we are removing a chunk, don't re-enter or we would deadlock.
- * System space reservation and system chunk allocation is done by the
- * chunk remove operation (btrfs_remove_chunk()).
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
*/
- if (trans->removing_chunk)
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
@@ -3684,17 +3735,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3707,19 +3755,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
@@ -3728,21 +3770,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
- *
- * Also, if our caller is allocating a system chunk, do not
- * attempt to insert the chunk item in the chunk btree, as we
- * could deadlock on an extent buffer since our caller may be
- * COWing an extent buffer from the chunk btree.
*/
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ } else {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
- * any error here.
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
@@ -3751,12 +3792,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
@@ -3833,6 +3923,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c72a71efcb18..5878b7ce3b78 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -98,6 +98,7 @@ struct btrfs_block_group {
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
+ unsigned int zone_is_active:1;
int disk_cache_state;
@@ -202,7 +203,10 @@ struct btrfs_block_group {
*/
u64 alloc_offset;
u64 zone_unusable;
+ u64 zone_capacity;
u64 meta_write_pointer;
+ struct map_lookup *physical_map;
+ struct list_head active_bg_list;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc);
+ u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
@@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 76ee1452c57b..ab2a4a52e0bb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,17 +138,34 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- /* total number of bytes pending delalloc, used by stat to calc the
- * real block usage of the file
- */
- u64 delalloc_bytes;
-
- /*
- * Total number of bytes pending delalloc that fall within a file
- * range that is either a hole or beyond EOF (and no prealloc extent
- * exists in the range). This is always <= delalloc_bytes.
- */
- u64 new_delalloc_bytes;
+ union {
+ /*
+ * Total number of bytes pending delalloc, used by stat to
+ * calculate the real block usage of the file. This is used
+ * only for files.
+ */
+ u64 delalloc_bytes;
+ /*
+ * The offset of the last dir item key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_item_offset;
+ };
+
+ union {
+ /*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes and this
+ * is used only for files.
+ */
+ u64 new_delalloc_bytes;
+ /*
+ * The offset of the last dir index key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_index_offset;
+ };
/*
* total number of bytes pending defrag, used by stat to check whether
@@ -339,7 +356,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
struct btrfs_dio_private {
struct inode *inode;
- u64 logical_offset;
+
+ /*
+ * Since DIO can use anonymous page, we cannot use page_offset() to
+ * grab the file offset, thus need a dedicated member for file offset.
+ */
+ u64 file_offset;
u64 disk_bytenr;
/* Used for bio::bi_size */
u32 bytes;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 86816088927f..7e9f90fa0388 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -186,7 +186,6 @@ struct btrfsic_dev_state {
struct list_head collision_resolving_node; /* list node */
struct btrfsic_block dummy_block_for_bio_bh_flush;
u64 last_flush_gen;
- char name[BDEVNAME_SIZE];
};
struct btrfsic_block_hashtable {
@@ -403,7 +402,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
ds->bdev = NULL;
ds->state = NULL;
- ds->name[0] = '\0';
INIT_LIST_HEAD(&ds->collision_resolving_node);
ds->last_flush_gen = 0;
btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
@@ -756,10 +754,10 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
+ "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
- dev_state->name, dev_bytenr,
+ dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
@@ -938,9 +936,10 @@ continue_with_current_leaf_stack_frame:
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
leaf_item_out_of_bounce_error:
- pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(sf->block_ctx,
@@ -1058,9 +1057,10 @@ continue_with_current_node_stack_frame:
(uintptr_t)nodehdr;
if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
sf->block_ctx->len) {
- pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(
@@ -1228,15 +1228,17 @@ static int btrfsic_create_link_to_next_block(
if (next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr))
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block),
next_block->logical_bytenr);
else
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+ "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block));
@@ -1324,8 +1326,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset +
offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
@@ -1344,8 +1346,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
btrfsic_read_from_block_data(block_ctx, &file_extent_item,
@@ -1421,9 +1423,10 @@ static int btrfsic_handle_extent_data(
next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n",
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
next_bytenr,
- next_block_ctx.dev->name,
+ next_block_ctx.dev->bdev,
next_block_ctx.dev_bytenr,
mirror_num,
next_block->logical_bytenr);
@@ -1455,7 +1458,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
- struct btrfs_bio *multi = NULL;
+ struct btrfs_io_context *multi = NULL;
struct btrfs_device *device;
length = len;
@@ -1561,7 +1564,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(num_pages - i);
+ bio = btrfs_bio_alloc(num_pages - i);
bio_set_dev(bio, block_ctx->dev->bdev);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio->bi_opf = REQ_OP_READ;
@@ -1577,8 +1580,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %s!\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+ block_ctx->start, block_ctx->dev->bdev);
bio_put(bio);
return -1;
}
@@ -1602,33 +1605,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
const struct btrfsic_block_link *l;
- pr_info("%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("%c-block @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
}
@@ -1743,16 +1748,18 @@ again:
if (block->logical_bytenr != bytenr &&
!(!block->is_metadata &&
block->logical_bytenr == 0))
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- bytenr, dev_state->name,
+ pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ bytenr, dev_state->bdev,
dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state,
block),
block->logical_bytenr);
else
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev,
dev_bytenr, block->mirror_num,
btrfsic_get_block_type(state,
block));
@@ -1767,8 +1774,9 @@ again:
processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev, dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state, block));
}
@@ -1778,9 +1786,10 @@ again:
list_empty(&block->ref_to_list) ? ' ' : '!',
list_empty(&block->ref_from_list) ? ' ' : '!');
if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_disk_key_objectid(&block->disk_key),
block->disk_key.type,
@@ -1792,9 +1801,10 @@ again:
}
if (!block->is_iodone && !block->never_written) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_stack_header_generation(
(struct btrfs_header *)
@@ -1921,8 +1931,9 @@ again:
if (!is_metadata) {
processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block (%s/%llu/?) !found in hash table, D.\n",
- dev_state->name, dev_bytenr);
+ pr_info(
+ "written block (%pg/%llu/?) !found in hash table, D\n",
+ dev_state->bdev, dev_bytenr);
if (!state->include_extent_data) {
/* ignore that written D block */
goto continue_loop;
@@ -1939,8 +1950,9 @@ again:
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+ "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+ bytenr, dev_state->bdev, dev_bytenr);
}
block_ctx.dev = dev_state;
@@ -1995,9 +2007,9 @@ again:
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New written %c-block @%llu (%s/%llu/%d)\n",
+ pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2041,10 +2053,10 @@ static void btrfsic_bio_end_io(struct bio *bp)
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, block->mirror_num);
next_block = block->next_in_same_bio;
block->iodone_w_error = iodone_w_error;
@@ -2052,8 +2064,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %s flush_gen=%llu\n",
- dev_state->name,
+ pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+ dev_state->bdev,
dev_state->last_flush_gen);
}
if (block->submit_bio_bh_rw & REQ_FUA)
@@ -2078,17 +2090,19 @@ static int btrfsic_process_written_superblock(
if (!(superblock->generation > state->max_superblock_generation ||
0 == state->max_superblock_generation)) {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n",
+ pr_info(
+ "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
} else {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n",
+ pr_info(
+ "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
@@ -2232,38 +2246,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
*/
list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
if (l->block_ref_to->never_written) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (!l->block_ref_to->is_iodone) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (l->block_ref_to->iodone_w_error) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
@@ -2273,10 +2291,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
l->parent_generation &&
BTRFSIC_GENERATION_UNKNOWN !=
l->block_ref_to->generation) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num,
l->block_ref_to->generation,
@@ -2284,10 +2303,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
ret = -1;
} else if (l->block_ref_to->flush_gen >
l->block_ref_to->dev_state->last_flush_gen) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num, block->flush_gen,
l->block_ref_to->dev_state->last_flush_gen);
@@ -2324,15 +2344,16 @@ static int btrfsic_is_block_ref_by_superblock(
*/
list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
if (l->block_ref_from->is_superblock &&
@@ -2354,30 +2375,30 @@ static int btrfsic_is_block_ref_by_superblock(
static void btrfsic_print_add_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
static void btrfsic_print_rem_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
@@ -2419,9 +2440,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
+ indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
printk("[...]\n");
@@ -2542,10 +2563,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
block->never_written = never_written;
block->mirror_num = mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
additional_string,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2592,8 +2613,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
}
if (WARN_ON(!match)) {
- pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+ bytenr, dev_state->bdev, dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
ret = btrfsic_map_block(state, bytenr,
state->metablock_size,
@@ -2601,8 +2623,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
if (ret)
continue;
- pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n",
- bytenr, block_ctx.dev->name,
+ pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+ bytenr, block_ctx.dev->bdev,
block_ctx.dev_bytenr, mirror_num);
}
}
@@ -2675,8 +2697,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
} else {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
@@ -2751,7 +2774,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
list_for_each_entry(device, dev_head, dev_list) {
struct btrfsic_dev_state *ds;
- const char *p;
if (!device->bdev || !device->name)
continue;
@@ -2763,10 +2785,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
}
ds->bdev = device->bdev;
ds->state = state;
- bdevname(ds->bdev, ds->name);
- ds->name[BDEVNAME_SIZE - 1] = '\0';
- p = kbasename(ds->name);
- strlcpy(ds->name, p, sizeof(ds->name));
btrfsic_dev_state_hashtable_add(ds,
&btrfsic_dev_state_hashtable);
}
@@ -2844,9 +2862,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
- pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7869ad12bc6e..32da97c3c19d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -28,6 +29,7 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "subpage.h"
#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -172,16 +174,17 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
- kaddr = page_address(page);
+ kaddr = kmap_atomic(page);
crypto_shash_digest(shash, kaddr + pg_offset,
sectorsize, csum);
+ kunmap_atomic(kaddr);
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- if (btrfs_io_bio(bio)->device)
+ if (btrfs_bio(bio)->device)
btrfs_dev_stat_inc_and_print(
- btrfs_io_bio(bio)->device,
+ btrfs_bio(bio)->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
return -EIO;
}
@@ -192,6 +195,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
return 0;
}
+/*
+ * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+ *
+ * Return true if there is no pending bio nor io.
+ * Return false otherwise.
+ */
+static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ unsigned int bi_size = 0;
+ bool last_io = false;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+ * Thus here we have to iterate through all segments to grab correct
+ * bio size.
+ */
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bi_size += bvec->bv_len;
+
+ if (bio->bi_status)
+ cb->errors = 1;
+
+ ASSERT(bi_size && bi_size <= cb->compressed_len);
+ last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+ &cb->pending_sectors);
+ /*
+ * Here we must wake up the possible error handler after all other
+ * operations on @cb finished, or we can race with
+ * finish_compressed_bio_*() which may free @cb.
+ */
+ wake_up_var(cb);
+
+ return last_io;
+}
+
+static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+{
+ unsigned int index;
+ struct page *page;
+
+ /* Release the compressed pages */
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ put_page(page);
+ }
+
+ /* Do io completion on the original bio */
+ if (cb->errors) {
+ bio_io_error(cb->orig_bio);
+ } else {
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(bio);
+ ASSERT(!bio->bi_status);
+ /*
+ * We have verified the checksum already, set page checked so
+ * the end_io handlers know about it
+ */
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+ u64 bvec_start = page_offset(bvec->bv_page) +
+ bvec->bv_offset;
+
+ btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+ bvec->bv_page, bvec_start,
+ bvec->bv_len);
+ }
+
+ bio_endio(cb->orig_bio);
+ }
+
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
+
/* when we finish reading compressed pages from the disk, we
* decompress them and then run the bio end_io routines on the
* decompressed pages (in the inode address space).
@@ -206,25 +290,17 @@ static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
- struct page *page;
- unsigned int index;
- unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+ unsigned int mirror = btrfs_bio(bio)->mirror_num;
int ret = 0;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
+ if (!dec_and_test_compressed_bio(cb, bio))
goto out;
/*
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+ btrfs_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
/*
@@ -248,36 +324,7 @@ static void end_compressed_bio_read(struct bio *bio)
csum_failed:
if (ret)
cb->errors = 1;
-
- /* release the compressed pages */
- index = 0;
- for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
- page->mapping = NULL;
- put_page(page);
- }
-
- /* do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * we have verified the checksum already, set page
- * checked so the end_io handlers know about it
- */
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
- SetPageChecked(bvec->bv_page);
-
- bio_endio(cb->orig_bio);
- }
-
- /* finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
+ finish_compressed_bio_read(cb, bio);
out:
bio_put(bio);
}
@@ -289,6 +336,7 @@ out:
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
@@ -311,7 +359,8 @@ static noinline void end_compressed_writeback(struct inode *inode,
for (i = 0; i < ret; i++) {
if (cb->errors)
SetPageError(pages[i]);
- end_page_writeback(pages[i]);
+ btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ cb->start, cb->len);
put_page(pages[i]);
}
nr_pages -= ret;
@@ -320,60 +369,127 @@ static noinline void end_compressed_writeback(struct inode *inode,
/* the inode may be gone now */
}
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- struct page *page;
+ struct inode *inode = cb->inode;
unsigned int index;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
- goto out;
-
- /* ok, we're the last bio for this extent, step one is to
- * call back into the FS and do all the end_io operations
+ /*
+ * Ok, we're the last bio for this extent, step one is to call back
+ * into the FS and do all the end_io operations.
*/
- inode = cb->inode;
- btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
!cb->errors);
end_compressed_writeback(inode, cb);
- /* note, our inode could be gone now */
+ /* Note, our inode could be gone now */
/*
- * release the compressed pages, these came from alloc_page and
+ * Release the compressed pages, these came from alloc_page and
* are not attached to the inode at all
*/
- index = 0;
for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
+ struct page *page = cb->compressed_pages[index];
+
page->mapping = NULL;
put_page(page);
}
- /* finally free the cb struct */
+ /* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk. This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio)
+{
+ struct compressed_bio *cb = bio->bi_private;
+
+ if (!dec_and_test_compressed_bio(cb, bio))
+ goto out;
+
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+
+ finish_compressed_bio_write(cb);
out:
bio_put(bio);
}
+static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+ struct compressed_bio *cb,
+ struct bio *bio, int mirror_num)
+{
+ blk_status_t ret;
+
+ ASSERT(bio->bi_iter.bi_size);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ return ret;
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ return ret;
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) * data.
+ *
+ * @cb: The compressed_bio structure, which records all the needed
+ * information to bind the compressed data to the uncompressed
+ * page cache.
+ * @disk_byten: The logical bytenr where the compressed data will be read
+ * from or written to.
+ * @endio_func: The endio function to call after the IO for compressed data
+ * is finished.
+ * @next_stripe_start: Return value of logical bytenr of where next stripe starts.
+ * Let the caller know to only fill the bio up to the stripe
+ * boundary.
+ */
+
+
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+ unsigned int opf, bio_end_io_t endio_func,
+ u64 *next_stripe_start)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct btrfs_io_geometry geom;
+ struct extent_map *em;
+ struct bio *bio;
+ int ret;
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bio->bi_opf = opf;
+ bio->bi_private = cb;
+ bio->bi_end_io = endio_func;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+ if (IS_ERR(em)) {
+ bio_put(bio);
+ return ERR_CAST(em);
+ }
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ bio_put(bio);
+ return ERR_PTR(ret);
+ }
+ *next_stripe_start = disk_bytenr + geom.len;
+
+ return bio;
+}
+
/*
* worker function to build and submit bios for previously compressed pages.
* The corresponding pages in the inode should be marked for writeback
@@ -394,20 +510,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
- unsigned long bytes_left;
- int pg_index = 0;
- struct page *page;
- u64 first_byte = disk_start;
+ u64 cur_disk_bytenr = disk_start;
+ u64 next_stripe_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
- WARN_ON(!PAGE_ALIGNED(start));
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = &inode->vfs_inode;
cb->start = start;
@@ -418,118 +533,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
-
- if (use_append) {
- struct btrfs_device *device;
-
- device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
- if (IS_ERR(device)) {
- kfree(cb);
- bio_put(bio);
- return BLK_STS_NOTSUPP;
+ while (cur_disk_bytenr < disk_start + compressed_len) {
+ u64 offset = cur_disk_bytenr - disk_start;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!bio) {
+ bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+ bio_op | write_flags, end_compressed_bio_write,
+ &next_stripe_start);
+ if (IS_ERR(bio)) {
+ ret = errno_to_blk_status(PTR_ERR(bio));
+ bio = NULL;
+ goto finish_cb;
+ }
}
-
- bio_set_dev(bio, device->bdev);
- }
-
- if (blkcg_css) {
- bio->bi_opf |= REQ_CGROUP_PUNT;
- kthread_associate_blkcg(blkcg_css);
- }
- refcount_set(&cb->pending_bios, 1);
-
- /* create and submit bios for the compressed pages */
- bytes_left = compressed_len;
- for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
- int submit = 0;
- int len = 0;
-
- page = compressed_pages[pg_index];
- page->mapping = inode->vfs_inode.i_mapping;
- if (bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
- 0);
-
/*
- * Page can only be added to bio if the current bio fits in
- * stripe.
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
*/
- if (!submit) {
- if (pg_index == 0 && use_append)
- len = bio_add_zone_append_page(bio, page,
- PAGE_SIZE, 0);
- else
- len = bio_add_page(bio, page, PAGE_SIZE, 0);
- }
-
- page->mapping = NULL;
- if (submit || len < PAGE_SIZE) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ ASSERT(cur_disk_bytenr != next_stripe_start);
+ /*
+ * We have various limits on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ if (use_append)
+ added = bio_add_zone_append_page(bio, page, real_size,
+ offset_in_page(offset));
+ else
+ added = bio_add_page(bio, page, real_size,
+ offset_in_page(offset));
+ /* Reached zoned boundary */
+ if (added == 0)
+ submit = true;
+
+ cur_disk_bytenr += added;
+ /* Reached stripe boundary */
+ if (cur_disk_bytenr == next_stripe_start)
+ submit = true;
+
+ /* Finished the range */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ submit = true;
+
+ if (submit) {
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
-
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ if (ret)
+ goto finish_cb;
}
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
- if (blkcg_css)
- bio->bi_opf |= REQ_CGROUP_PUNT;
- /*
- * Use bio_add_page() to ensure the bio has at least one
- * page.
- */
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ if (ret)
+ goto finish_cb;
+ bio = NULL;
}
- if (bytes_left < PAGE_SIZE) {
- btrfs_info(fs_info,
- "bytes left %lu compress len %u nr %u",
- bytes_left, cb->compressed_len, cb->nr_pages);
- }
- bytes_left -= PAGE_SIZE;
- first_byte += PAGE_SIZE;
cond_resched();
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
+ return 0;
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
+finish_cb:
+ if (bio) {
bio->bi_status = ret;
bio_endio(bio);
}
+ /* Last byte of @cb is submitted, endio will free @cb */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ return ret;
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
-
- return 0;
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_start + compressed_len - cur_disk_bytenr) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with previous bio ended, we should still have io not yet
+ * submitted, thus need to finish manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring @cb, can finish it safely. */
+ finish_compressed_bio_write(cb);
+ return ret;
}
static u64 bio_end_offset(struct bio *bio)
@@ -539,25 +636,33 @@ static u64 bio_end_offset(struct bio *bio)
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for subpage read, we lock the
+ * full page then submit bio for each compressed/regular extents.
+ *
+ * This means, if we have several sectors in the same page points to the same
+ * on-disk compressed data, we will re-read the same extent many times and
+ * this function can only help for the next page.
+ */
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
- unsigned long pg_index;
- u64 last_offset;
+ u64 cur = bio_end_offset(cb->orig_bio);
u64 isize = i_size_read(inode);
int ret;
struct page *page;
- unsigned long nr_pages = 0;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
- u64 end;
- int misses = 0;
+ int sectors_missed = 0;
- last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
@@ -576,18 +681,29 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_SHIFT;
+ while (cur < compressed_end) {
+ u64 page_end;
+ u64 pg_index = cur >> PAGE_SHIFT;
+ u32 add_size;
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
- misses++;
- if (misses > 4)
+ sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ fs_info->sectorsize_bits;
+
+ /* Beyond threshold, no need to continue */
+ if (sectors_missed > 4)
break;
- goto next;
+
+ /*
+ * Jump to next page start as we already have page for
+ * current offset.
+ */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@ -597,14 +713,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
- goto next;
+ /* There is already a page, skip to page end */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
- /*
- * at this point, we have a locked page in the page cache
- * for these bytes in the file. But, we have to make
- * sure they map to this compressed extent on disk.
- */
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -612,18 +725,22 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
}
- end = last_offset + PAGE_SIZE - 1;
- lock_extent(tree, last_offset, end);
+ page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ lock_extent(tree, cur, page_end);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_SIZE);
+ em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
- if (!em || last_offset < em->start ||
- (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+ /*
+ * At this point, we have a locked page in the page cache for
+ * these bytes in the file. But, we have to make sure they map
+ * to this compressed extent on disk.
+ */
+ if (!em || cur < em->start ||
+ (cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end);
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
@@ -641,20 +758,23 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
}
- ret = bio_add_page(cb->orig_bio, page,
- PAGE_SIZE, 0);
-
- if (ret == PAGE_SIZE) {
- nr_pages++;
- put_page(page);
- } else {
- unlock_extent(tree, last_offset, end);
+ add_size = min(em->start + em->len, page_end + 1) - cur;
+ ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ if (ret != add_size) {
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
}
-next:
- last_offset += PAGE_SIZE;
+ /*
+ * If it's subpage, we also need to increase its
+ * subpage::readers number, as at endio we will decrease
+ * subpage::readers and to unlock the page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ put_page(page);
+ cur += add_size;
}
return 0;
}
@@ -679,9 +799,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned int compressed_len;
unsigned int nr_pages;
unsigned int pg_index;
- struct page *page;
- struct bio *comp_bio;
- u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+ struct bio *comp_bio = NULL;
+ const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_byte = disk_bytenr;
+ u64 next_stripe_start;
u64 file_offset;
u64 em_len;
u64 em_start;
@@ -708,7 +829,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb)
goto out;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
@@ -748,86 +869,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
- refcount_set(&cb->pending_bios, 1);
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- u32 pg_len = PAGE_SIZE;
- int submit = 0;
+ while (cur_disk_byte < disk_bytenr + compressed_len) {
+ u64 offset = cur_disk_byte - disk_bytenr;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = cb->compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!comp_bio) {
+ comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+ REQ_OP_READ, end_compressed_bio_read,
+ &next_stripe_start);
+ if (IS_ERR(comp_bio)) {
+ ret = errno_to_blk_status(PTR_ERR(comp_bio));
+ comp_bio = NULL;
+ goto finish_cb;
+ }
+ }
+ /*
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
+ */
+ ASSERT(cur_disk_byte != next_stripe_start);
+ /*
+ * We have various limit on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+ added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
/*
- * To handle subpage case, we need to make sure the bio only
- * covers the range we need.
- *
- * If we're at the last page, truncate the length to only cover
- * the remaining part.
+ * Maximum compressed extent is smaller than bio size limit,
+ * thus bio_add_page() should always success.
*/
- if (pg_index == nr_pages - 1)
- pg_len = min_t(u32, PAGE_SIZE,
- compressed_len - pg_index * PAGE_SIZE);
+ ASSERT(added == real_size);
+ cur_disk_byte += added;
- page = cb->compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_SHIFT;
+ /* Reached stripe boundary, need to submit */
+ if (cur_disk_byte == next_stripe_start)
+ submit = true;
- if (comp_bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, pg_len,
- comp_bio, 0);
+ /* Has finished the range, need to submit */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ submit = true;
- page->mapping = NULL;
- if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+ if (submit) {
unsigned int nr_sectors;
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
-
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto finish_cb;
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
-
- bio_add_page(comp_bio, page, pg_len, 0);
+ ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ if (ret)
+ goto finish_cb;
+ comp_bio = NULL;
}
- cur_disk_byte += pg_len;
}
-
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
return 0;
fail2:
@@ -842,6 +951,26 @@ fail1:
out:
free_extent_map(em);
return ret;
+finish_cb:
+ if (comp_bio) {
+ comp_bio->bi_status = ret;
+ bio_endio(comp_bio);
+ }
+ /* All bytes of @cb is submitted, endio will free @cb */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ return ret;
+
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_bytenr + compressed_len - cur_disk_byte) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with previous bio ended, we should still have io not yet
+ * submitted, thus need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring @cb, can finish it safely. */
+ finish_compressed_bio_read(cb, NULL);
+ return ret;
}
/*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 399be0b435bf..56eef0821e3e 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -28,8 +28,8 @@ struct btrfs_inode;
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* number of bios pending for this compressed extent */
- refcount_t pending_bios;
+ /* Number of sectors with unfinished IO (unsubmitted or unfinished) */
+ refcount_t pending_sectors;
/* Number of compressed pages in the array */
unsigned int nr_pages;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84627cbd5b5b..c3983bdaf4b8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -395,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (*cow_ret == buf)
unlock_orig = 1;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
@@ -2487,7 +2488,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
int ret;
BUG_ON(!path->nodes[level]);
- btrfs_assert_tree_locked(path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
@@ -2827,7 +2828,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
@@ -3065,7 +3066,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (right_nritems == 0)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
@@ -3581,40 +3582,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
}
/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct btrfs_key *new_key)
-{
- struct extent_buffer *leaf;
- int ret;
- u32 item_size;
-
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ret = setup_leaf_for_split(trans, root, path,
- item_size + sizeof(struct btrfs_item));
- if (ret)
- return ret;
-
- path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size, 1);
- leaf = path->nodes[0];
- memcpy_extent_buffer(leaf,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
- item_size);
- return 0;
-}
-
-/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
@@ -3785,13 +3752,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
- * @cpu_key: array of keys for items to be inserted
- * @data_size: size of the body of each item we are going to insert
- * @nr: size of @cpu_key/@data_size arrays
+ * @batch: information about the batch of items to insert
*/
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -3803,14 +3767,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
u32 total_size;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
- total_size = total_data + (nr * sizeof(struct btrfs_item));
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
if (path->slots[0] == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -3820,6 +3784,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
@@ -3849,31 +3814,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item,
- ioff - total_data);
+ ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- data_end -= data_size[i];
+ data_end -= batch->data_sizes[i];
btrfs_set_token_item_offset(&token, item, data_end);
- btrfs_set_token_item_size(&token, item, data_size[i]);
+ btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
}
- btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3883,26 +3849,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+ const struct btrfs_item_batch *batch)
{
int ret = 0;
int slot;
- int i;
- u32 total_size = 0;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
+ u32 total_size;
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
@@ -3911,7 +3894,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size, nr);
+ setup_items_for_insert(root, path, batch);
return 0;
}
@@ -3943,6 +3926,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dff2c8a3e059..7553e9dc5f93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
+struct btrfs_bio;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -217,6 +218,9 @@ struct btrfs_root_backup {
u8 unused_8[10];
} __attribute__ ((__packed__));
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+
/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
@@ -269,7 +273,11 @@ struct btrfs_super_block {
__le64 reserved[28];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ u8 padding[565];
} __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
* Compat flags that we support. If any incompat flags are set other than the
@@ -899,6 +907,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -1017,6 +1026,16 @@ struct btrfs_fs_info {
spinlock_t treelog_bg_lock;
u64 treelog_bg;
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -2885,16 +2904,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+ /*
+ * Pointer to an array containing the keys of the items to insert (in
+ * sorted order).
+ */
+ const struct btrfs_key *keys;
+ /* Pointer to an array containing the data size for each item to insert. */
+ const u32 *data_sizes;
+ /*
+ * The sum of data sizes for all items. The caller can compute this while
+ * setting up the data_sizes array, so it ends up being more efficient
+ * than having btrfs_insert_empty_items() or setup_item_for_insert()
+ * doing it, as it would avoid an extra loop over a potentially large
+ * array, and in the case of setup_item_for_insert(), we would be doing
+ * it while holding a write lock on a leaf and often on upper level nodes
+ * too, unnecessarily increasing the size of a critical section.
+ */
+ u32 total_data_size;
+ /* Size of the keys and data_sizes arrays (number of items in the batch). */
+ int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+ const struct btrfs_item_batch *batch);
static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2902,7 +2947,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
const struct btrfs_key *key,
u32 data_size)
{
- return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ return btrfs_insert_empty_items(trans, root, path, &batch);
}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
@@ -3030,7 +3082,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod);
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
@@ -3129,8 +3181,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3142,7 +3195,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len);
int btrfs_add_link(struct btrfs_trans_handle *trans,
@@ -3174,8 +3226,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
@@ -3242,9 +3292,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_pages);
+ u64 newer_than, unsigned long max_to_defrag);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -3563,6 +3613,9 @@ do { \
(errno), fmt, ##args); \
} while (0)
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
@@ -3842,6 +3895,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
return fs_info->zoned != 0;
}
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+ return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
/*
* We use page status Private2 to indicate there is an ordered extent with
* unfinished IO.
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1e08eb2b27f0..e164766dcc38 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
- LIST_HEAD(batch);
+ LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ struct btrfs_item_batch batch;
int total_size;
- int nitems;
char *ins_data = NULL;
- struct btrfs_key *ins_keys;
- u32 *ins_sizes;
int ret;
- list_add_tail(&first_item->tree_list, &batch);
- nitems = 1;
+ list_add_tail(&first_item->tree_list, &item_list);
+ batch.total_data_size = first_item->data_len;
+ batch.nr = 1;
total_size = first_item->data_len + sizeof(struct btrfs_item);
curr = first_item;
@@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
if (total_size + next_size > max_size)
break;
- list_add_tail(&next->tree_list, &batch);
- nitems++;
+ list_add_tail(&next->tree_list, &item_list);
+ batch.nr++;
total_size += next_size;
+ batch.total_data_size += next->data_len;
curr = next;
}
- if (nitems == 1) {
- ins_keys = &first_item->key;
- ins_sizes = &first_item->data_len;
+ if (batch.nr == 1) {
+ batch.keys = &first_item->key;
+ batch.data_sizes = &first_item->data_len;
} else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(nitems * sizeof(u32) +
- nitems * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc(batch.nr * sizeof(u32) +
+ batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
}
ins_sizes = (u32 *)ins_data;
- ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
- list_for_each_entry(curr, &batch, tree_list) {
+ ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ list_for_each_entry(curr, &item_list, tree_list) {
ins_keys[i] = curr->key;
ins_sizes[i] = curr->data_len;
i++;
}
}
- ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
- nitems);
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret)
goto out;
- list_for_each_entry(curr, &batch, tree_list) {
+ list_for_each_entry(curr, &item_list, tree_list) {
char *data_ptr;
data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
@@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
- list_for_each_entry_safe(curr, next, &batch, tree_list) {
+ list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ca848b183474..cca7e85e32dd 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -906,7 +906,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data);
@@ -921,8 +921,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(generic_ref->real_root) &&
- is_fstree(generic_ref->tree_ref.root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
@@ -938,14 +936,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.root, action, ref_type);
- ref->root = generic_ref->tree_ref.root;
+ generic_ref->tree_ref.owning_root, action,
+ ref_type);
+ ref->root = generic_ref->tree_ref.owning_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.root, 0, action, false,
- is_system);
+ generic_ref->tree_ref.owning_root, 0, action,
+ false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -997,7 +996,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
+ u64 ref_root = generic_ref->data_ref.owning_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1026,8 +1025,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root) &&
- is_fstree(generic_ref->real_root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e22fba272e4f..91a3aabad150 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -186,8 +186,8 @@ enum btrfs_ref_type {
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Root which refers to this data extent */
- u64 ref_root;
+ /* Original root this data extent belongs to */
+ u64 owning_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -210,11 +210,11 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which refers to this tree block.
+ * Root which owns this tree block.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 root;
+ u64 owning_root;
/* For non-skinny metadata, no special member needed */
};
@@ -231,17 +231,10 @@ struct btrfs_ref {
*/
bool skip_qgroup;
- /*
- * Optional. For which root is this modification.
- * Mostly used for qgroup optimization.
- *
- * When unset, data/tree ref init code will populate it.
- * In certain cases, we're modifying reference for a different root.
- * E.g. COW fs tree blocks for balance.
- * In that case, tree_ref::root will be fs tree, but we're doing this
- * for reloc tree, then we should set @real_root to reloc tree.
- */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* Through which root is this modification. */
u64 real_root;
+#endif
u64 bytenr;
u64 len;
@@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
}
static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root)
+ int level, u64 root, u64 mod_root, bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = root;
+ generic_ref->real_root = mod_root ?: root;
+#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.root = root;
+ generic_ref->tree_ref.owning_root = root;
generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
}
static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset)
+ u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+ bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = ref_root;
- generic_ref->data_ref.ref_root = ref_root;
+ generic_ref->real_root = mod_root ?: ref_root;
+#endif
+ generic_ref->data_ref.owning_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
static inline struct btrfs_delayed_extent_op *
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d029be40ea6f..c85a7d44da79 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
@@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
@@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+ args.devid = src_devid;
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -283,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
+ if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f1274d5c3805..7721ce0c0604 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -190,9 +190,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
}
/*
- * lookup a directory item based on name. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory item by name.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir item does not exists, an error pointer if an error
+ * happened, or a pointer to a dir item if a dir item exists for the given name.
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -273,27 +284,42 @@ out:
}
/*
- * lookup a directory item based on index. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory index item by name and index number.
*
- * The name is used to make sure the index really points to the name you were
- * looking for.
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @index: The index number.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir index item does not exists, an error pointer if an
+ * error happened, or a pointer to a dir item if the dir index item exists and
+ * matches the criteria (name and index number).
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod)
{
+ struct btrfs_dir_item *di;
struct btrfs_key key;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = objectid;
+ key.offset = index;
- return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (di == ERR_PTR(-ENOENT))
+ return NULL;
+
+ return di;
}
struct btrfs_dir_item *
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 355ea88d5c5f..59c3be8c1f4c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -683,7 +683,7 @@ err:
return ret;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror)
{
@@ -1036,7 +1036,7 @@ static int btree_set_page_dirty(struct page *page)
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
return __set_page_dirty_nobuffers(page);
}
ASSERT(PagePrivate(page) && page->private);
@@ -1061,7 +1061,7 @@ static int btree_set_page_dirty(struct page *page)
ASSERT(eb);
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
ASSERT(atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
free_extent_buffer(eb);
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
@@ -1125,7 +1125,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
@@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
goto fail;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ !btrfs_is_data_reloc_root(root)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1644,6 +1644,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ kfree(fs_info->subpage_info);
kvfree(fs_info);
}
@@ -1953,8 +1954,7 @@ sleep:
wake_up_process(fs_info->cleaner_kthread);
mutex_unlock(&fs_info->transaction_kthread_mutex);
- if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
- &fs_info->fs_state)))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_cleanup_transaction(fs_info);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
@@ -2592,8 +2592,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
/*
* For 4K page size, we only support 4K sector size.
- * For 64K page size, we support read-write for 64K sector size, and
- * read-only for 4K sector size.
+ * For 64K page size, we support 64K and 4K sector sizes.
*/
if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
(PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
@@ -2883,6 +2882,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
+ spin_lock_init(&fs_info->zone_active_bgs_lock);
+ spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
@@ -2896,6 +2897,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -3228,12 +3230,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
btrfs_init_btree_inode(fs_info);
- invalidate_bdev(fs_devices->latest_bdev);
+ invalidate_bdev(fs_devices->latest_dev->bdev);
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
if (IS_ERR(disk_super)) {
err = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3392,12 +3394,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- if (sectorsize != PAGE_SIZE) {
+ if (sectorsize < PAGE_SIZE) {
+ struct btrfs_subpage_info *subpage_info;
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- }
- if (sectorsize != PAGE_SIZE) {
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
@@ -3406,6 +3408,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
err = -EINVAL;
goto fail_alloc;
}
+ subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+ if (!subpage_info)
+ goto fail_alloc;
+ btrfs_init_subpage_info(subpage_info, sectorsize);
+ fs_info->subpage_info = subpage_info;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3465,7 +3472,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_bdev) {
+ if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
@@ -3556,7 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3740,7 +3748,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
else if (ret)
return ERR_PTR(ret);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
@@ -3881,7 +3889,9 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
- btrfs_advance_sb_log(device, i);
+
+ if (btrfs_advance_sb_log(device, i))
+ errors++;
}
return errors < i ? 0 : -1;
}
@@ -4221,7 +4231,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
if (root->reloc_root) {
btrfs_put_root(root->reloc_root);
@@ -4372,8 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
- test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
@@ -4470,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 0e7e9526b6a8..a2b5db4ba262 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,9 +6,6 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -81,7 +78,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fc3da7585fb7..3fd736a02c1e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1266,7 +1266,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1313,22 +1313,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_bio *bbio = NULL;
-
+ struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
int i;
num_bytes = end - cur;
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bbio, 0);
+ &num_bytes, &bioc, 0);
/*
* Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
* -EOPNOTSUPP. For any such error, @num_bytes is not updated,
@@ -1337,8 +1336,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
if (ret < 0)
goto out;
- stripe = bbio->stripes;
- for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ stripe = bioc->stripes;
+ for (i = 0; i < bioc->num_stripes; i++, stripe++) {
u64 bytes;
struct btrfs_device *device = stripe->dev;
@@ -1361,7 +1360,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
* And since there are two loops, explicitly
* go to out to avoid confusion.
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
@@ -1372,7 +1371,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
*/
ret = 0;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
cur += num_bytes;
}
out:
@@ -1397,7 +1396,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -2376,7 +2375,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
out:
btrfs_free_path(path);
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
}
@@ -2438,10 +2437,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset);
- generic_ref.skip_qgroup = for_reloc;
+ key.offset, root->root_key.objectid,
+ for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -2453,9 +2451,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = fs_info->nodesize;
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
- generic_ref.skip_qgroup = for_reloc;
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+ root->root_key.objectid, for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -3196,7 +3193,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3289,7 +3286,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root->root_key.objectid);
+ root->root_key.objectid, 0, false);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -3373,9 +3370,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
@@ -3386,9 +3383,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -3476,7 +3473,9 @@ enum btrfs_extent_allocation_policy {
*/
struct find_free_extent_ctl {
/* Basic allocation info */
+ u64 ram_bytes;
u64 num_bytes;
+ u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
@@ -3495,6 +3494,9 @@ struct find_free_extent_ctl {
/* Allocation is called for tree-log */
bool for_treelog;
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
/* RAID index, converted from flags */
int index;
@@ -3756,8 +3758,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
u64 avail;
u64 bytenr = block_group->start;
u64 log_bytenr;
+ u64 data_reloc_bytenr;
int ret = 0;
- bool skip;
+ bool skip = false;
ASSERT(btrfs_is_zoned(block_group->fs_info));
@@ -3767,19 +3770,49 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
*/
spin_lock(&fs_info->treelog_bg_lock);
log_bytenr = fs_info->treelog_bg;
- skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
- (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+ skip = true;
spin_unlock(&fs_info->treelog_bg_lock);
if (skip)
return 1;
+ /*
+ * Do not allow non-relocation blocks in the dedicated relocation block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->relocation_bg_lock);
+ data_reloc_bytenr = fs_info->data_reloc_bg;
+ if (data_reloc_bytenr &&
+ ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+ (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro ||
+ block_group->alloc_offset == block_group->zone_capacity) {
+ spin_unlock(&block_group->lock);
+ return 1;
+ }
+ spin_unlock(&block_group->lock);
+
+ if (!btrfs_zone_activate(block_group))
+ return 1;
+
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
ASSERT(!ffe_ctl->for_treelog ||
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
+ ASSERT(!ffe_ctl->for_data_reloc ||
+ block_group->start == fs_info->data_reloc_bg ||
+ fs_info->data_reloc_bg == 0);
if (block_group->ro) {
ret = 1;
@@ -3796,7 +3829,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
goto out;
}
- avail = block_group->length - block_group->alloc_offset;
+ /*
+ * Do not allow currently used block group to be the data relocation
+ * dedicated block group.
+ */
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+ avail = block_group->zone_capacity - block_group->alloc_offset;
if (avail < num_bytes) {
if (ffe_ctl->max_extent_size < avail) {
/*
@@ -3813,6 +3857,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
fs_info->treelog_bg = block_group->start;
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+
ffe_ctl->found_offset = start + block_group->alloc_offset;
block_group->alloc_offset += num_bytes;
spin_lock(&ctl->tree_lock);
@@ -3829,6 +3876,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
+ if (ret && ffe_ctl->for_data_reloc)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
@@ -3932,18 +3982,30 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
ffe_ctl->orig_have_caching_bg = true;
- if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
- ffe_ctl->have_caching_bg)
- return 1;
-
- if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
- return 1;
-
if (ins->objectid) {
found_extent(ffe_ctl, ins);
return 0;
}
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
+ /*
+ * If we have enough free space left in an already active block
+ * group and we can't activate any other zone now, retry the
+ * active ones with a smaller allocation size. Returning early
+ * from here will tell btrfs_reserve_extent() to haven the
+ * size.
+ */
+ return -ENOSPC;
+ }
+
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+ ffe_ctl->index++;
+ if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+ return 1;
+
/*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
@@ -4085,6 +4147,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
}
+ if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ }
return 0;
default:
BUG();
@@ -4117,65 +4185,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_root *root,
- u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte_orig, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
- struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
- WARN_ON(num_bytes < fs_info->sectorsize);
-
- ffe_ctl.num_bytes = num_bytes;
- ffe_ctl.empty_size = empty_size;
- ffe_ctl.flags = flags;
- ffe_ctl.search_start = 0;
- ffe_ctl.delalloc = delalloc;
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
- ffe_ctl.have_caching_bg = false;
- ffe_ctl.orig_have_caching_bg = false;
- ffe_ctl.found_offset = 0;
- ffe_ctl.hint_byte = hint_byte_orig;
- ffe_ctl.for_treelog = for_treelog;
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+ WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
+ ffe_ctl->search_start = 0;
/* For clustered allocation */
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- ffe_ctl.last_ptr = NULL;
- ffe_ctl.use_cluster = true;
+ ffe_ctl->empty_cluster = 0;
+ ffe_ctl->last_ptr = NULL;
+ ffe_ctl->use_cluster = true;
+ ffe_ctl->have_caching_bg = false;
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+ /* For clustered allocation */
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+ ffe_ctl->found_offset = 0;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
if (btrfs_is_zoned(fs_info))
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ ffe_ctl->flags);
- space_info = btrfs_find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", flags);
+ btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
return -ENOSPC;
}
- ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
if (ret < 0)
return ret;
- ffe_ctl.search_start = max(ffe_ctl.search_start,
- first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
- if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
+ ffe_ctl->search_start = max(ffe_ctl->search_start,
+ first_logical_byte(fs_info, 0));
+ ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+ if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
- ffe_ctl.search_start);
+ ffe_ctl->search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -4183,7 +4248,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, flags) &&
+ if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -4197,9 +4262,10 @@ static noinline int find_free_extent(struct btrfs_root *root,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(
- block_group->flags);
- btrfs_lock_block_group(block_group, delalloc);
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
+ btrfs_lock_block_group(block_group,
+ ffe_ctl->delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -4207,31 +4273,33 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
- ffe_ctl.have_caching_bg = false;
- if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
- ffe_ctl.index == 0)
+ ffe_ctl->have_caching_bg = false;
+ if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+ ffe_ctl->index == 0)
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
- &space_info->block_groups[ffe_ctl.index], list) {
+ &space_info->block_groups[ffe_ctl->index], list) {
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro)) {
- if (for_treelog)
+ if (ffe_ctl->for_treelog)
btrfs_clear_treelog_bg(block_group);
+ if (ffe_ctl->for_data_reloc)
+ btrfs_clear_data_reloc_bg(block_group);
continue;
}
- btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->start;
+ btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+ ffe_ctl->search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, flags)) {
+ if (!block_group_bits(block_group, ffe_ctl->flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
@@ -4242,7 +4310,7 @@ search:
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((flags & extra) && !(block_group->flags & extra))
+ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
goto loop;
/*
@@ -4250,14 +4318,14 @@ search:
* It's possible that we have MIXED_GROUP flag but no
* block group is mixed. Just skip such block group.
*/
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
continue;
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_done(block_group);
- if (unlikely(!ffe_ctl.cached)) {
- ffe_ctl.have_caching_bg = true;
+ ffe_ctl->cached = btrfs_block_group_done(block_group);
+ if (unlikely(!ffe_ctl->cached)) {
+ ffe_ctl->have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
/*
@@ -4280,10 +4348,11 @@ have_block_group:
goto loop;
bg_ret = NULL;
- ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
if (ret == 0) {
if (bg_ret && bg_ret != block_group) {
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group,
+ ffe_ctl->delalloc);
block_group = bg_ret;
}
} else if (ret == -EAGAIN) {
@@ -4293,46 +4362,49 @@ have_block_group:
}
/* Checks */
- ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
- fs_info->stripesize);
+ ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (ffe_ctl.search_start + num_bytes >
+ if (ffe_ctl->search_start + ffe_ctl->num_bytes >
block_group->start + block_group->length) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
- if (ffe_ctl.found_offset < ffe_ctl.search_start)
+ if (ffe_ctl->found_offset < ffe_ctl->search_start)
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ ffe_ctl->found_offset,
+ ffe_ctl->search_start - ffe_ctl->found_offset);
- ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
- num_bytes, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+ ffe_ctl->num_bytes,
+ ffe_ctl->delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = ffe_ctl.search_start;
- ins->offset = num_bytes;
+ ins->objectid = ffe_ctl->search_start;
+ ins->offset = ffe_ctl->num_bytes;
- trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
- num_bytes);
- btrfs_release_block_group(block_group, delalloc);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
- release_block_group(block_group, &ffe_ctl, delalloc);
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4341,12 +4413,12 @@ loop:
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
*/
- if (!ffe_ctl.max_extent_size)
- ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
+ if (!ffe_ctl->max_extent_size)
+ ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = ffe_ctl.max_extent_size;
+ space_info->max_extent_size = ffe_ctl->max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = ffe_ctl.max_extent_size;
+ ins->offset = ffe_ctl->max_extent_size;
} else if (ret == -ENOSPC) {
ret = cache_block_group_error;
}
@@ -4404,16 +4476,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
struct btrfs_key *ins, int is_data, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct find_free_extent_ctl ffe_ctl = {};
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
- hint_byte, ins, flags, delalloc);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.min_alloc_size = min_alloc_size;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.hint_byte = hint_byte;
+ ffe_ctl.for_treelog = for_treelog;
+ ffe_ctl.for_data_reloc = for_data_reloc;
+
+ ret = find_free_extent(root, ins, &ffe_ctl);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
} else if (ret == -ENOSPC) {
@@ -4431,8 +4515,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu tree-log %d",
- flags, num_bytes, for_treelog);
+ "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+ flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4543,7 +4627,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -4632,7 +4716,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, 1);
+ fs_info->nodesize, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
extent_key.objectid, extent_key.offset);
@@ -4655,7 +4739,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -4847,8 +4932,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins.objectid, ins.offset, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+ root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
@@ -4859,6 +4944,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
+ btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
@@ -5264,7 +5350,8 @@ skip:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
fs_info->nodesize, parent);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
@@ -5749,13 +5836,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- btrfs_assert_tree_locked(parent);
+ btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
- btrfs_assert_tree_locked(node);
+ btrfs_assert_tree_write_locked(node);
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index aaddd7225348..4e03a6d3aa32 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -241,7 +241,7 @@ int __init extent_io_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
+ offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
goto free_buffer_cache;
@@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
/*
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
- * more than @max_bytes. @Start and @end are used to return the range,
+ * more than @max_bytes.
*
- * Return: true if we find something
- * false if nothing was in the tree
+ * @start: The original start bytenr to search.
+ * Will store the extent range start bytenr.
+ * @end: The original end bytenr of the search range
+ * Will store the extent range end bytenr.
+ *
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
+ *
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
@@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
u64 *end)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
@@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
int ret;
int loops = 0;
+ /* Caller should pass a valid @end to indicate the search range end */
+ ASSERT(orig_end > orig_start);
+
+ /* The range should at least cover part of the page */
+ ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+ orig_end <= page_offset(locked_page)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
- if (!found || delalloc_end <= *start) {
+ if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
*start = delalloc_start;
- *end = delalloc_end;
+
+ /* @delalloc_end can be -1, never go beyond @orig_end */
+ *end = min(delalloc_end, orig_end);
free_extent_state(cached_state);
return false;
}
@@ -2282,15 +2300,15 @@ int free_io_failure(struct extent_io_tree *failure_tree,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -2299,12 +2317,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_is_zoned(fs_info))
return btrfs_repair_one_zone(fs_info, logical);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
@@ -2317,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bbio, 0);
+ &map_length, &bioc, 0);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- ASSERT(bbio->mirror_num == 1);
+ ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
+ &map_length, &bioc, mirror_num);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- BUG_ON(mirror_num != bbio->mirror_num);
+ BUG_ON(mirror_num != bioc->mirror_num);
}
- sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[bbio->mirror_num - 1].dev;
- btrfs_put_bbio(bbio);
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
@@ -2618,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
- struct btrfs_io_bio *repair_io_bio;
+ struct btrfs_bio *repair_bbio;
blk_status_t status;
btrfs_debug(fs_info,
@@ -2639,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode,
return -EIO;
}
- repair_bio = btrfs_io_bio_alloc(1);
- repair_io_bio = btrfs_io_bio(repair_bio);
+ repair_bio = btrfs_bio_alloc(1);
+ repair_bbio = btrfs_bio(repair_bio);
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
- if (failed_io_bio->csum) {
+ if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
- repair_io_bio->csum = repair_io_bio->csum_inline;
- memcpy(repair_io_bio->csum,
- failed_io_bio->csum + csum_size * icsum, csum_size);
+ repair_bbio->csum = repair_bbio->csum_inline;
+ memcpy(repair_bbio->csum,
+ failed_bbio->csum + csum_size * icsum, csum_size);
}
bio_add_page(repair_bio, page, failrec->len, pgoff);
- repair_io_bio->logical = failrec->start;
- repair_io_bio->iter = repair_bio->bi_iter;
+ repair_bbio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
"repair read error: submitting new read to mirror %d",
@@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage(
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
@@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio)
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
- io_bio->mirror_num);
+ bbio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio)
end = start + bvec->bv_len - 1;
len = bvec->bv_len;
- mirror = io_bio->mirror_num;
+ mirror = bbio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode)) {
- error_bitmap = btrfs_verify_data_csum(io_bio,
+ error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
ret = error_bitmap;
} else {
- ret = btrfs_validate_metadata_buffer(io_bio,
+ ret = btrfs_validate_metadata_buffer(bbio,
page, start, end, mirror);
}
if (ret)
@@ -3106,7 +3123,7 @@ readpage_ok:
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
- btrfs_io_bio_free_csum(io_bio);
+ btrfs_bio_free_csum(bbio);
bio_put(bio);
}
@@ -3115,53 +3132,43 @@ readpage_ok:
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
* 'bio' because use of __GFP_ZERO is not supported.
*/
-static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
+static inline void btrfs_bio_init(struct btrfs_bio *bbio)
{
- memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
}
/*
- * The following helpers allocate a bio. As it's backed by a bioset, it'll
- * never fail. We're returning a bio right now but you can call btrfs_io_bio
- * for the appropriate container_of magic
+ * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
+ *
+ * The bio allocation is backed by bioset and does not fail.
*/
-struct bio *btrfs_bio_alloc(u64 first_byte)
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
- bio->bi_iter.bi_sector = first_byte >> 9;
- btrfs_io_bio_init(btrfs_io_bio(bio));
+ ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio));
return bio;
}
struct bio *btrfs_bio_clone(struct bio *bio)
{
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
- btrfs_bio = btrfs_io_bio(new);
- btrfs_io_bio_init(btrfs_bio);
- btrfs_bio->iter = bio->bi_iter;
+ bbio = btrfs_bio(new);
+ btrfs_bio_init(bbio);
+ bbio->iter = bio->bi_iter;
return new;
}
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- /* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
- btrfs_io_bio_init(btrfs_io_bio(bio));
- return bio;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
@@ -3169,11 +3176,11 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_io_bio_init(btrfs_bio);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio);
bio_trim(bio, offset >> 9, size >> 9);
- btrfs_bio->iter = bio->bi_iter;
+ bbio->iter = bio->bi_iter;
return bio;
}
@@ -3307,14 +3314,15 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct bio *bio;
int ret;
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
- bio = btrfs_bio_alloc(disk_bytenr);
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
- bio = btrfs_bio_alloc(disk_bytenr + offset);
+ bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
@@ -3327,7 +3335,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
if (wbc) {
struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_dev->bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
@@ -3341,7 +3349,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
goto error;
}
- btrfs_io_bio(bio)->device = device;
+ btrfs_bio(bio)->device = device;
}
return 0;
error:
@@ -3599,6 +3607,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
bool force_bio_submit = false;
u64 disk_bytenr;
+ ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
struct extent_state *cached = NULL;
@@ -3777,17 +3786,18 @@ static void update_nr_written(struct writeback_control *wbc,
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
- u64 delalloc_start, unsigned long *nr_written)
+ unsigned long *nr_written)
{
- u64 page_end = delalloc_start + PAGE_SIZE - 1;
- bool found;
+ const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 delalloc_start = page_offset(page);
u64 delalloc_to_write = 0;
- u64 delalloc_end = 0;
int ret;
int page_started = 0;
+ while (delalloc_start < page_end) {
+ u64 delalloc_end = page_end;
+ bool found;
- while (delalloc_end < page_end) {
found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
@@ -3854,12 +3864,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
- unsigned long dirty_bitmap;
unsigned long flags;
- int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
- int range_start_bit = nbits;
+ int range_start_bit;
int range_end_bit;
/*
@@ -3872,13 +3881,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
return;
}
+ range_start_bit = spi->dirty_offset +
+ (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
/* We should have the page locked, but just in case */
spin_lock_irqsave(&subpage->lock, flags);
- dirty_bitmap = subpage->dirty_bitmap;
+ bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+ spi->dirty_offset + spi->bitmap_nr_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
- BTRFS_SUBPAGE_BITMAP_SIZE);
+ range_start_bit -= spi->dirty_offset;
+ range_end_bit -= spi->dirty_offset;
+
*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
@@ -4054,8 +4068,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct inode *inode = page->mapping->host;
- u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
@@ -4090,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (!epd->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
- &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
if (ret == 1)
return 0;
if (ret)
@@ -4141,8 +4155,20 @@ done:
* capable of that.
*/
if (PageError(page))
- end_extent_writepage(page, ret, start, page_end);
- unlock_page(page);
+ end_extent_writepage(page, ret, page_start, page_end);
+ if (epd->extent_locked) {
+ /*
+ * If epd->extent_locked, it's from extent_write_locked_range(),
+ * the page can either be locked by lock_page() or
+ * process_one_page().
+ * Let btrfs_page_unlock_writer() handle both cases.
+ */
+ ASSERT(wbc);
+ btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+ wbc->range_end + 1 - wbc->range_start);
+ } else {
+ unlock_page(page);
+ }
ASSERT(ret <= 0);
return ret;
}
@@ -4155,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
+ if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+ btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4602,12 +4631,11 @@ static int submit_eb_subpage(struct page *page,
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
- const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
int ret;
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < nbits) {
+ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
struct extent_buffer *eb;
unsigned long flags;
@@ -4623,7 +4651,8 @@ static int submit_eb_subpage(struct page *page,
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+ if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
bit_start++;
@@ -4756,8 +4785,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
free_extent_buffer(eb);
return ret;
}
- if (cache)
+ if (cache) {
+ /* Impiles write in zoned mode */
btrfs_put_block_group(cache);
+ /* Mark the last eb in a block group */
+ if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
+ set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ }
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
@@ -4873,7 +4907,7 @@ retry:
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*/
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (!BTRFS_FS_ERROR(fs_info)) {
ret = flush_write_bio(&epd);
} else {
ret = -EROFS;
@@ -5069,23 +5103,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
return ret;
}
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode)
+/*
+ * Submit the pages in the range to bio for call sites which delalloc range has
+ * already been ran (aka, ordered extent inserted) and all pages are still
+ * locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
{
+ bool found_error = false;
+ int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_SIZE) >>
- PAGE_SHIFT;
-
+ u64 cur = start;
+ unsigned long nr_pages;
+ const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 1,
- .sync_io = mode == WB_SYNC_ALL,
+ .sync_io = 1,
};
struct writeback_control wbc_writepages = {
- .sync_mode = mode,
- .nr_to_write = nr_pages * 2,
+ .sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
/* We're called from an async helper function */
@@ -5093,33 +5132,51 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
.no_cgroup_owner = 1,
};
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+ nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+ PAGE_SHIFT;
+ wbc_writepages.nr_to_write = nr_pages * 2;
+
wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
- while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_SHIFT);
- if (clear_page_dirty_for_io(page))
- ret = __extent_writepage(page, &wbc_writepages, &epd);
- else {
- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
- page, start, start + PAGE_SIZE - 1, true);
- unlock_page(page);
+ while (cur <= end) {
+ u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+ page = find_get_page(mapping, cur >> PAGE_SHIFT);
+ /*
+ * All pages in the range are locked since
+ * btrfs_run_delalloc_range(), thus there is no way to clear
+ * the page dirty flag.
+ */
+ ASSERT(PageLocked(page));
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ found_error = true;
+ first_error = ret;
}
put_page(page);
- start += PAGE_SIZE;
+ cur = cur_end + 1;
}
- ASSERT(ret <= 0);
- if (ret == 0)
+ if (!found_error)
ret = flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
wbc_detach_inode(&wbc_writepages);
+ if (found_error)
+ return first_error;
return ret;
}
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct inode *inode = mapping->host;
+ const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
+ const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
int ret = 0;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
@@ -5127,7 +5184,15 @@ int extent_writepages(struct address_space *mapping,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ /*
+ * Allow only a single thread to do the reloc work in zoned mode to
+ * protect the write pointer updates.
+ */
+ if (data_reloc && zoned)
+ btrfs_inode_lock(inode, 0);
ret = extent_write_cache_pages(mapping, wbc, &epd);
+ if (data_reloc && zoned)
+ btrfs_inode_unlock(inode, 0);
ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
@@ -6137,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- ret = btrfs_alloc_subpage(fs_info, &prealloc,
- BTRFS_SUBPAGE_METADATA);
- if (ret < 0) {
- unlock_page(p);
- put_page(p);
- exists = ERR_PTR(ret);
- goto free_eb;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ ret = PTR_ERR(prealloc);
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
}
spin_lock(&mapping->private_lock);
@@ -7167,32 +7234,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
- int ret;
- int i;
+ u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
- ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
lockdep_assert_held(&fs_info->buffer_lock);
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
- bytenr >> fs_info->sectorsize_bits,
- PAGE_SIZE / fs_info->nodesize);
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
- break;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- break;
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
}
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
}
+out:
return found;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 53abdc280451..0399cf8e3c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
+ EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
@@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a8e02f7b6c7..5a36add21305 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
@@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 2673c6ba7a4e..d1cbb64a78f3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -358,7 +358,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
- * btrfs_io_bio(bio)->csum instead.
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
@@ -397,19 +397,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return BLK_STS_RESOURCE;
if (!dst) {
- struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
- GFP_NOFS);
- if (!btrfs_bio->csum) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
} else {
- btrfs_bio->csum = btrfs_bio->csum_inline;
+ bbio->csum = bbio->csum_inline;
}
- csum = btrfs_bio->csum;
+ csum = bbio->csum;
} else {
csum = dst;
}
@@ -665,7 +664,18 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
if (!ordered) {
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered); /* Logic error */
+ /*
+ * The bio range is not covered by any ordered extent,
+ * must be a code logic error.
+ */
+ if (unlikely(!ordered)) {
+ WARN(1, KERN_WARNING
+ "no ordered extent for root %llu ino %llu offset %llu\n",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), offset);
+ kvfree(sums);
+ return BLK_STS_IOERR;
+ }
}
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
@@ -698,12 +708,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
index = 0;
}
- data = kmap_atomic(bvec.bv_page);
- crypto_shash_digest(shash, data + bvec.bv_offset
- + (i * fs_info->sectorsize),
+ data = bvec_kmap_local(&bvec);
+ crypto_shash_digest(shash,
+ data + (i * fs_info->sectorsize),
fs_info->sectorsize,
sums->sums + index);
- kunmap_atomic(data);
+ kunmap_local(data);
index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7ff577005d0f..581662d16b72 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+ struct page **pages, size_t num_pages,
+ u64 pos, u64 copied)
{
size_t i;
+ u64 block_start = round_down(pos, fs_info->sectorsize);
+ u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+ ASSERT(block_len <= U32_MAX);
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
@@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- ClearPageChecked(pages[i]);
+ btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+ block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct page *p = pages[i];
btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- ClearPageChecked(p);
+ btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
@@ -734,8 +741,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
- update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- root == fs_info->tree_root);
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -870,7 +876,8 @@ next_slot:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- args->start - extent_offset);
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
@@ -956,7 +963,8 @@ delete_extent_item:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
- key.offset - extent_offset);
+ key.offset - extent_offset, 0,
+ false);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
args->bytes_found += extent_end - key.offset;
@@ -1021,8 +1029,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &args->extent_item_size, 1);
+ btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
args->extent_inserted = true;
}
@@ -1233,7 +1240,7 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset);
+ orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1258,7 +1265,8 @@ again:
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+ 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -1710,7 +1718,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
*/
- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
ret = -EFAULT;
break;
}
@@ -1845,7 +1853,7 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
break;
}
@@ -1853,7 +1861,7 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
cond_resched();
@@ -1957,7 +1965,7 @@ relock:
}
dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- 0);
+ 0, 0);
btrfs_inode_unlock(inode, ilock_flags);
@@ -2013,7 +2021,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+ if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2621,7 +2629,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset);
+ btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2704,14 +2712,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
- * When cloning we want to avoid transaction aborts when
- * nothing was done and we are attempting to clone parts
- * of inline extents, in such cases -EOPNOTSUPP is
- * returned by __btrfs_drop_extents() without having
- * changed anything in the file.
+ * The only time we don't want to abort is if we are
+ * attempting to clone a partial inline extent, in which
+ * case we'll get EOPNOTSUPP. However if we aren't
+ * clone we need to abort no matter what, because if we
+ * got EOPNOTSUPP via prealloc then we messed up and
+ * need to abort.
*/
- if (extent_info && !extent_info->is_new_extent &&
- ret && ret != -EOPNOTSUPP)
+ if (ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent)))
btrfs_abort_transaction(trans, ret);
break;
}
@@ -3658,7 +3668,8 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
return 0;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ 0, 0);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index da0eee7c9e5f..f3fee88c8ee0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "subpage.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -411,7 +412,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- ClearPageChecked(io_ctl->pages[i]);
+ btrfs_page_clear_checked(io_ctl->fs_info,
+ io_ctl->pages[i],
+ page_offset(io_ctl->pages[i]),
+ PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
@@ -2539,10 +2543,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
+ else if (initial)
+ to_free = block_group->zone_capacity;
else if (offset >= block_group->alloc_offset)
to_free = size;
else if (offset + size <= block_group->alloc_offset)
@@ -2565,12 +2575,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
spin_unlock(&block_group->lock);
}
+ reclaimable_unusable = block_group->zone_unusable -
+ (block_group->length - block_group->zone_capacity);
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
- block_group->zone_unusable >=
- div_factor_fine(block_group->length, bg_reclaim_threshold)) {
+ reclaimable_unusable >=
+ div_factor_fine(block_group->zone_capacity,
+ bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@@ -2754,8 +2767,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
* out the free space after the allocation offset.
*/
if (btrfs_is_zoned(fs_info)) {
- btrfs_info(fs_info, "free space %llu",
- block_group->length - block_group->alloc_offset);
+ btrfs_info(fs_info, "free space %llu active %d",
+ block_group->zone_capacity - block_group->alloc_offset,
+ block_group->zone_is_active);
return;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 487533c35ddb..b8c911a4a320 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,6 +6,7 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = page_address(cpage);
+ kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -455,11 +457,10 @@ struct async_chunk {
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
- atomic_t *pending;
+ struct async_cow *async_cow;
};
struct async_cow {
- /* Number of chunks in flight; must be first in the structure */
atomic_t num_chunks;
struct async_chunk chunks[];
};
@@ -490,9 +491,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
- /* Subpage doesn't support compression yet */
- if (inode->root->fs_info->sectorsize < PAGE_SIZE)
- return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
@@ -514,6 +512,38 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
btrfs_ino(inode));
return 0;
}
+ /*
+ * Special check for subpage.
+ *
+ * We lock the full page then run each delalloc range in the page, thus
+ * for the following case, we will hit some subpage specific corner case:
+ *
+ * 0 32K 64K
+ * | |///////| |///////|
+ * \- A \- B
+ *
+ * In above case, both range A and range B will try to unlock the full
+ * page [0, 64K), causing the one finished later will have page
+ * unlocked already, triggering various page lock requirement BUG_ON()s.
+ *
+ * So here we add an artificial limit that subpage compression can only
+ * if the range is fully page aligned.
+ *
+ * In theory we only need to ensure the first page is fully covered, but
+ * the tailing partial page will be locked until the full compression
+ * finishes, delaying the write of other range.
+ *
+ * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
+ * first to prevent any submitted async extent to unlock the full page.
+ * By this, we can ensure for subpage case that only the last async_cow
+ * will unlock the full page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(end + 1, PAGE_SIZE))
+ return 0;
+ }
+
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
@@ -615,13 +645,24 @@ again:
total_compressed = actual_end - start;
/*
- * skip compression for a small file range(<=blocksize) that
+ * Skip compression for a small file range(<=blocksize) that
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
goto cleanup_and_bail_uncompressed;
+ /*
+ * For subpage case, we require full page alignment for the sector
+ * aligned range.
+ * Thus we must also check against @actual_end, not just @end.
+ */
+ if (blocksize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ goto cleanup_and_bail_uncompressed;
+ }
+
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -759,7 +800,7 @@ cont:
* win, compare the page count read with the blocks on disk,
* compression must free at least one sector size
*/
- total_in = ALIGN(total_in, PAGE_SIZE);
+ total_in = round_up(total_in, fs_info->sectorsize);
if (total_compressed + blocksize <= total_in) {
compressed_extents++;
@@ -840,166 +881,148 @@ static void free_async_extent_pages(struct async_extent *async_extent)
async_extent->pages = NULL;
}
-/*
- * phase two of compressed writeback. This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued. We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
{
- struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct async_extent *async_extent;
- u64 alloc_hint = 0;
- struct btrfs_key ins;
- struct extent_map *em;
- struct btrfs_root *root = inode->root;
- struct extent_io_tree *io_tree = &inode->io_tree;
- int ret = 0;
-
-again:
- while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
- list_del(&async_extent->list);
-
-retry:
- lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1);
- /* did the compression code fall back to uncompressed IO? */
- if (!async_extent->pages) {
- int page_started = 0;
- unsigned long nr_written = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+ unsigned long nr_written = 0;
+ int page_started = 0;
+ int ret;
- /* allocate blocks */
- ret = cow_file_range(inode, async_chunk->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ /*
+ * Call cow_file_range() to run the delalloc range directly, since we
+ * won't go to NOCOW or async path again.
+ *
+ * Also we call cow_file_range() with @unlock_page == 0, so that we
+ * can directly submit them without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+ &nr_written, 0);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ if (locked_page)
+ unlock_page(locked_page);
+ goto out;
+ }
- /* JDM XXX */
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+ /* All pages will be unlocked, including @locked_page */
+out:
+ kfree(async_extent);
+ return ret;
+}
- /*
- * if page_started, cow_file_range inserted an
- * inline extent and took care of all the unlocking
- * and IO for us. Otherwise, we need to submit
- * all those pages down to the drive.
- */
- if (!page_started && !ret)
- extent_write_locked_range(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- WB_SYNC_ALL);
- else if (ret && async_chunk->locked_page)
- unlock_page(async_chunk->locked_page);
- kfree(async_extent);
- cond_resched();
- continue;
- }
+static int submit_one_async_extent(struct btrfs_inode *inode,
+ struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
+{
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct page *locked_page = NULL;
+ struct extent_map *em;
+ int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
- ret = btrfs_reserve_extent(root, async_extent->ram_size,
- async_extent->compressed_size,
- async_extent->compressed_size,
- 0, alloc_hint, &ins, 1, 1);
- if (ret) {
- free_async_extent_pages(async_extent);
+ /*
+ * If async_chunk->locked_page is in the async_extent range, we need to
+ * handle it.
+ */
+ if (async_chunk->locked_page) {
+ u64 locked_page_start = page_offset(async_chunk->locked_page);
+ u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
- if (ret == -ENOSPC) {
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ if (!(start >= locked_page_end || end <= locked_page_start))
+ locked_page = async_chunk->locked_page;
+ }
+ lock_extent(io_tree, start, end);
- /*
- * we need to redirty the pages if we decide to
- * fallback to uncompressed IO, otherwise we
- * will not submit these pages down to lower
- * layers.
- */
- extent_range_redirty_for_io(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ /* We have fall back to uncompressed write */
+ if (!async_extent->pages)
+ return submit_uncompressed_range(inode, async_extent, locked_page);
- goto retry;
- }
- goto out_free;
- }
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, *alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
/*
- * here we're doing allocation and writeback of the
- * compressed pages
+ * Here we used to try again by going back to non-compressed
+ * path for ENOSPC. But we can't reserve space even for
+ * compressed size, how could it work for uncompressed size
+ * which requires larger size? So here we directly go error
+ * path.
*/
- em = create_io_em(inode, async_extent->start,
- async_extent->ram_size, /* len */
- async_extent->start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
- if (IS_ERR(em))
- /* ret value is not necessary due to void function */
- goto out_free_reserve;
- free_extent_map(em);
-
- ret = btrfs_add_ordered_extent_compress(inode,
- async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- async_extent->compress_type);
- if (ret) {
- btrfs_drop_extent_cache(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, 0);
- goto out_free_reserve;
- }
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ goto out_free;
+ }
+
+ /* Here we're doing allocation and writeback of the compressed pages */
+ em = create_io_em(inode, start,
+ async_extent->ram_size, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ async_extent->ram_size, /* ram_bytes */
+ async_extent->compress_type,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserve;
+ }
+ free_extent_map(em);
- /*
- * clear dirty, set writeback and unlock the pages.
- */
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_START_WRITEBACK);
- if (btrfs_submit_compressed_write(inode, async_extent->start,
- async_extent->ram_size,
- ins.objectid,
- ins.offset, async_extent->pages,
- async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css)) {
- struct page *p = async_extent->pages[0];
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, false);
-
- p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
- PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
- alloc_hint = ins.objectid + ins.offset;
- kfree(async_extent);
- cond_resched();
+ ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */
+ ins.objectid, /* disk_bytenr */
+ async_extent->ram_size, /* num_bytes */
+ ins.offset, /* disk_num_bytes */
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserve;
}
- return;
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Clear dirty, set writeback and unlock the pages. */
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
+ if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* compressed_len */
+ async_extent->pages, /* compressed_pages */
+ async_extent->nr_pages,
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ *alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ return ret;
+
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@ -1007,7 +1030,39 @@ out_free:
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
- goto again;
+ return ret;
+}
+
+/*
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ int ret = 0;
+
+ while (!list_empty(&async_chunk->extents)) {
+ u64 extent_start;
+ u64 ram_size;
+
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ extent_start = async_extent->start;
+ ram_size = async_extent->ram_size;
+
+ ret = submit_one_async_extent(inode, async_chunk, async_extent,
+ &alloc_hint);
+ btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), extent_start, ram_size, ret);
+ }
}
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
@@ -1150,7 +1205,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* fails during the stage where it updates the bytenr of file extent
* items.
*/
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes;
else
min_alloc_size = fs_info->sectorsize;
@@ -1186,8 +1241,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_drop_extent_cache;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
/*
@@ -1325,18 +1379,17 @@ static noinline void async_cow_submit(struct btrfs_work *work)
static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
- /*
- * Since the pointer to 'pending' is at the beginning of the array of
- * async_chunk's, freeing it ensures the whole array has been freed.
- */
- if (atomic_dec_and_test(async_chunk->pending))
- kvfree(async_chunk->pending);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
}
static int cow_file_range_async(struct btrfs_inode *inode,
@@ -1397,7 +1450,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
* lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
- async_chunk[i].pending = &ctx->num_chunks;
+ async_chunk[i].async_cow = ctx;
async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
@@ -1470,7 +1523,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
__set_page_dirty_nobuffers(locked_page);
account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
@@ -1503,8 +1556,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
- const bool is_reloc_ino = (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID);
+ const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
@@ -1866,8 +1918,7 @@ out_check:
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
nocow = false;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
* extent_clear_unlock_delalloc() in error handler
@@ -1946,8 +1997,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
int ret;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
+ /*
+ * The range must cover part of the @locked_page, or the returned
+ * @page_started can confuse the caller.
+ */
+ ASSERT(!(end <= page_offset(locked_page) ||
+ start >= page_offset(locked_page) + PAGE_SIZE));
+
if (should_nocow(inode, start, end)) {
- ASSERT(!zoned);
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * in case of relocation on a zoned filesystem we have taken
+ * precaution, that we're only writing sequentially. It's safe
+ * to use run_delalloc_nocow() here, like for regular
+ * preallocated inodes.
+ */
+ ASSERT(!zoned ||
+ (zoned && btrfs_is_data_reloc_root(inode->root)));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
} else if (!inode_can_compress(inode) ||
@@ -2206,7 +2272,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2234,48 +2300,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
/*
- * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
- * in a chunk's stripe. This function ensures that bios do not span a
- * stripe/chunk
- *
- * @page - The page we are about to add to the bio
- * @size - size we want to add to the bio
- * @bio - bio we want to ensure is smaller than a stripe
- * @bio_flags - flags of the bio
- *
- * return 1 if page cannot be added to the bio
- * return 0 if page can be added to the bio
- * return error otherwise
- */
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = bio->bi_iter.bi_sector << 9;
- u32 bio_len = bio->bi_iter.bi_size;
- struct extent_map *em;
- int ret = 0;
- struct btrfs_io_geometry geom;
-
- if (bio_flags & EXTENT_BIO_COMPRESSED)
- return 0;
-
- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
- if (IS_ERR(em))
- return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
- if (ret < 0)
- goto out;
-
- if (geom.len < bio_len + size)
- ret = 1;
-out:
- free_extent_map(em);
- return ret;
-}
-
-/*
* in order to insert checksums into the metadata in large chunks,
* we wait until bio submission time. All the pages in the bio are
* checksummed and sums are attached onto the ordered extent record.
@@ -2531,7 +2555,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto mapit;
} else if (async && !skip_sum) {
/* csum items have already been cloned */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
@@ -2764,7 +2788,7 @@ out_page:
clear_page_dirty_for_io(page);
SetPageError(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2819,7 +2843,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- SetPageChecked(page);
+ btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
@@ -3010,8 +3034,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->bdev)
+ /* A valid bdev implies a write on a sequential zone */
+ if (ordered_extent->bdev) {
btrfs_rewrite_logical_zoned(ordered_extent);
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
btrfs_free_io_failure_record(inode, start, end);
@@ -3208,7 +3236,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
*
* The length of such check is always one sector size.
*/
-static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff,
u64 start)
{
@@ -3224,7 +3252,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
ASSERT(pgoff + len <= PAGE_SIZE);
offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
+ csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
@@ -3238,9 +3266,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
- if (io_bio->device)
- btrfs_dev_stat_inc_and_print(io_bio->device,
+ bbio->mirror_num);
+ if (bbio->device)
+ btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -3260,33 +3288,29 @@ zeroit:
* Return a bitmap where bit set means a csum mismatch, and bit not set means
* csum match.
*/
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
- if (PageChecked(page)) {
- ClearPageChecked(page);
+ if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
+ btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
return 0;
}
/*
- * For subpage case, above PageChecked is not safe as it's not subpage
- * compatible.
- * But for now only cow fixup and compressed read utilize PageChecked
- * flag, while in this context we can easily use io_bio->csum to
- * determine if we really need to do csum verification.
- *
- * So for now, just exit if io_bio->csum is NULL, as it means it's
- * compressed read, and its compressed data csum has already been
- * verified.
+ * This only happens for NODATASUM or compressed read.
+ * Normally this should be covered by above check for compressed read
+ * or the next check for NODATASUM. Just do a quicker exit here.
*/
- if (io_bio->csum == NULL)
+ if (bbio->csum == NULL)
return 0;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
@@ -3303,7 +3327,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
u64 file_offset = pg_off + page_offset(page);
int ret;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (btrfs_is_data_reloc_root(root) &&
test_range_bit(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM, 1, NULL)) {
@@ -3313,7 +3337,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+ ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
@@ -4004,7 +4028,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !btrfs_is_data_reloc_root(root)
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
@@ -4034,11 +4058,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
* also drops the back refs in the inode to the directory
*/
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const char *name, int name_len)
{
+ struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
@@ -4098,19 +4122,9 @@ skip_backref:
goto err;
}
- ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto err;
- }
-
- ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
- if (ret == -ENOENT)
- ret = 0;
- else if (ret)
- btrfs_abort_transaction(trans, ret);
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
/*
* If we have a pending delayed iput we could end up with the final iput
@@ -4138,15 +4152,14 @@ out:
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode->root, inode);
}
return ret;
}
@@ -4175,7 +4188,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
@@ -4187,7 +4199,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (ret)
@@ -4201,7 +4213,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return ret;
}
@@ -4368,7 +4380,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
@@ -4552,7 +4564,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int err = 0;
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
@@ -4577,7 +4588,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (!err) {
@@ -4598,7 +4609,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return err;
}
@@ -4907,9 +4918,9 @@ delete:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
extent_start, extent_num_bytes, 0);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- ino, extent_offset);
+ ino, extent_offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -5105,7 +5116,8 @@ again:
len);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, block_start,
+ block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
@@ -6435,7 +6447,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
- int nitems = name ? 2 : 1;
+ struct btrfs_item_batch batch;
unsigned long ptr;
unsigned int nofs_flag;
int ret;
@@ -6527,7 +6539,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+ batch.keys = &key[0];
+ batch.data_sizes = &sizes[0];
+ batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
+ batch.nr = name ? 2 : 1;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret != 0)
goto fail_unlock;
@@ -7961,7 +7977,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
@@ -8038,13 +8054,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->logical_offset,
+ dip->file_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
- dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
+ dip->file_offset,
+ dip->file_offset + dip->bytes - 1);
}
bio_endio(dip->dio_bio);
@@ -8072,10 +8088,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
return ret;
}
-static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
- struct btrfs_io_bio *io_bio,
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+ struct btrfs_bio *bbio,
const bool uptodate)
{
+ struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8083,11 +8100,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- u64 start = io_bio->logical;
+ const u64 orig_file_offset = dip->file_offset;
+ u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
- __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+ __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
unsigned int i, nr_sectors, pgoff;
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
@@ -8095,7 +8113,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
- (!csum || !check_data_csum(inode, io_bio,
+ (!csum || !check_data_csum(inode, bbio,
bio_offset, bvec.bv_page,
pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
@@ -8105,12 +8123,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
} else {
int ret;
- ASSERT((start - io_bio->logical) < UINT_MAX);
+ ASSERT((start - orig_file_offset) < UINT_MAX);
ret = btrfs_repair_one_sector(inode,
- &io_bio->bio,
- start - io_bio->logical,
+ &bbio->bio,
+ start - orig_file_offset,
bvec.bv_page, pgoff,
- start, io_bio->mirror_num,
+ start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
@@ -8151,15 +8169,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (bio_op(bio) == REQ_OP_READ) {
- err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
- !err);
- }
+ if (bio_op(bio) == REQ_OP_READ)
+ err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
@@ -8201,10 +8217,10 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
} else {
u64 csum_offset;
- csum_offset = file_offset - dip->logical_offset;
+ csum_offset = file_offset - dip->file_offset;
csum_offset >>= fs_info->sectorsize_bits;
csum_offset *= fs_info->csum_size;
- btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = dip->csums + csum_offset;
}
map:
ret = btrfs_map_bio(fs_info, bio, 0);
@@ -8239,7 +8255,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return NULL;
dip->inode = inode;
- dip->logical_offset = file_offset;
+ dip->file_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
@@ -8247,7 +8263,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
struct inode *inode = iter->inode;
@@ -8277,7 +8293,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
}
dio_bio->bi_status = BLK_STS_RESOURCE;
bio_endio(dio_bio);
- return BLK_QC_T_NONE;
+ return;
}
if (!write) {
@@ -8320,7 +8336,6 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
@@ -8371,15 +8386,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
free_extent_map(em);
} while (submit_len > 0);
- return BLK_QC_T_NONE;
+ return;
out_err_em:
free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
-
- return BLK_QC_T_NONE;
}
const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -8696,9 +8709,9 @@ next:
* did something wrong.
*/
ASSERT(!PageOrdered(page));
+ btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- ClearPageChecked(page);
clear_page_extent_mapped(page);
}
@@ -8842,7 +8855,7 @@ again:
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
@@ -9152,8 +9165,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
WARN_ON(inode->block_rsv.reserved);
WARN_ON(inode->block_rsv.size);
WARN_ON(inode->outstanding_extents);
- WARN_ON(inode->delalloc_bytes);
- WARN_ON(inode->new_delalloc_bytes);
+ if (!S_ISDIR(vfs_inode->i_mode)) {
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ }
WARN_ON(inode->csum_bytes);
WARN_ON(inode->defrag_bytes);
@@ -9450,7 +9465,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -9466,7 +9481,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
- ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9741,7 +9756,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
*/
btrfs_pin_log_trans(root);
log_pinned = true;
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -9761,7 +9776,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
- ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9979,7 +9994,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte
};
struct btrfs_fs_info *fs_info = root->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
@@ -9998,7 +10013,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
struct list_head splice;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
INIT_LIST_HEAD(&splice);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc61813213d8..fb8cc9642ac4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "subpage.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 {
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
@@ -985,129 +987,32 @@ out:
return ret;
}
-/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
- */
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
-{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 end;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
- read_unlock(&em_tree->lock);
-
- if (em) {
- end = extent_map_end(em);
- free_extent_map(em);
- if (end - offset > thresh)
- return 0;
- }
- /* if we already have a nice delalloc here, just stop */
- thresh /= 2;
- end = count_range_bits(io_tree, &offset, offset + thresh,
- thresh, EXTENT_DELALLOC, 1);
- if (end >= thresh)
- return 0;
- return 1;
-}
-
-/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
- *
- * This is used by the defragging code to find new and small
- * extents
- */
-static int find_new_extents(struct btrfs_root *root,
- struct inode *inode, u64 newer_than,
- u64 *off, u32 thresh)
-{
- struct btrfs_path *path;
- struct btrfs_key min_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- int type;
- int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- min_key.objectid = ino;
- min_key.type = BTRFS_EXTENT_DATA_KEY;
- min_key.offset = *off;
-
- while (1) {
- ret = btrfs_search_forward(root, &min_key, path, newer_than);
- if (ret != 0)
- goto none;
-process_slot:
- if (min_key.objectid != ino)
- goto none;
- if (min_key.type != BTRFS_EXTENT_DATA_KEY)
- goto none;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG &&
- btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
- check_defrag_in_cache(inode, min_key.offset, thresh)) {
- *off = min_key.offset;
- btrfs_free_path(path);
- return 0;
- }
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
- goto process_slot;
- }
-
- if (min_key.offset == (u64)-1)
- goto none;
-
- min_key.offset++;
- btrfs_release_path(path);
- }
-none:
- btrfs_free_path(path);
- return -ENOENT;
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_SIZE;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ u64 end = start + sectorsize - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- unlock_extent_cached(io_tree, start, end, &cached);
+ if (!locked)
+ lock_extent_bits(io_tree, start, end, &cached);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ if (!locked)
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
return em;
}
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ bool locked)
{
struct extent_map *next;
bool ret = true;
@@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len);
+ next = defrag_lookup_extent(inode, em->start + em->len, locked);
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
@@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end,
- int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared, here we don't do writeback wait for each page.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
{
- struct extent_map *em;
- int ret = 1;
- bool next_mergeable = true;
- bool prev_mergeable = true;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
/*
- * make sure that once we start defragging an extent, we keep on
- * defragging it
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
*/
- if (start < *defrag_end)
- return 1;
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
- *skip = 0;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
- em = defrag_lookup_extent(inode, start);
- if (!em)
- return 0;
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
- /* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- ret = 0;
- goto out;
- }
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
- if (!*defrag_end)
- prev_mergeable = false;
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
- next_mergeable = defrag_check_next_extent(inode, em);
- /*
- * we hit a real extent, if it is big or the next extent is not a
- * real extent, don't bother defragging it
- */
- if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
- ret = 0;
-out:
/*
- * last_len ends up being a counter of how many bytes we've defragged.
- * every time we choose not to defrag an extent, we reset *last_len
- * so that the next tiny extent will force a defrag.
- *
- * The end result of this is that tiny extents before a single big
- * extent will force at least part of that big extent to be defragged.
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
*/
- if (ret) {
- *defrag_end = extent_map_end(em);
- } else {
- *last_len = 0;
- *skip = extent_map_end(em);
- *defrag_end = 0;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
}
-
- free_extent_map(em);
- return ret;
+ return page;
}
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
/*
- * it doesn't do much good to defrag one or two pages
- * at a time. This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
*
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: if the range has already held extent lock
+ * @target_list: list of targets file extents
*/
-static int cluster_pages_for_defrag(struct inode *inode,
- struct page **pages,
- unsigned long start_index,
- unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list)
{
- unsigned long file_end;
- u64 isize = i_size_read(inode);
- u64 page_start;
- u64 page_end;
- u64 page_cnt;
- u64 start = (u64)start_index << PAGE_SHIFT;
- u64 search_start;
- int ret;
- int i;
- int i_done;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_io_tree *tree;
- struct extent_changeset *data_reserved = NULL;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ u64 cur = start;
+ int ret = 0;
- file_end = (isize - 1) >> PAGE_SHIFT;
- if (!isize || start_index > file_end)
- return 0;
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
- page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ if (!em)
+ break;
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start, page_cnt << PAGE_SHIFT);
- if (ret)
- return ret;
- i_done = 0;
- tree = &BTRFS_I(inode)->io_tree;
+ /* Skip hole/inline/preallocated extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
- /* step one, lock all the pages */
- for (i = 0; i < page_cnt; i++) {
- struct page *page;
-again:
- page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
- if (!page)
- break;
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- break;
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (em->len >= extent_thresh)
+ goto next;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
}
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- while (1) {
- lock_extent_bits(tree, page_start, page_end,
- &cached_state);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
- page_start);
- unlock_extent_cached(tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * we unlocked the page above, so we need check if
- * it was released or not.
- */
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+add:
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
}
+ /* Fall through to allocate a new entry */
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- ret = -EIO;
- break;
- }
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
}
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
+ }
+ return ret;
+}
+
+#define CLUSTER_SIZE (SZ_256K)
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
- pages[i] = page;
- i_done++;
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
}
- if (!i_done || ret)
- goto out;
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- goto out;
+ return ret;
+}
- /*
- * so now we have a nice long stream of locked
- * and up to date pages, lets wait on them
- */
- for (i = 0; i < i_done; i++)
- wait_on_page_writeback(pages[i]);
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
- page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
- lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+ /* Lock the pages range */
+ lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
- * When defragmenting we skip ranges that have holes or inline extents,
- * (check should_defrag_range()), to avoid unnecessary IO and wasting
- * space. At btrfs_defrag_file(), we check if a range should be defragged
- * before locking the inode and then, if it should, we trigger a sync
- * page cache readahead - we lock the inode only after that to avoid
- * blocking for too long other tasks that possibly want to operate on
- * other file ranges. But before we were able to get the inode lock,
- * some other task may have punched a hole in the range, or we may have
- * now an inline extent, in which case we should not defrag. So check
- * for that here, where we have the inode and the range locked, and bail
- * out if that happened.
+ * Now we have a consistent view about the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
*/
- search_start = page_start;
- while (search_start < page_end) {
- struct extent_map *em;
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list);
+ if (ret < 0)
+ goto unlock_extent;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
- page_end - search_start);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock_range;
- }
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- /* Ok, 0 means we did not defrag anything */
- ret = 0;
- goto out_unlock_range;
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
}
- search_start = extent_map_end(em);
- free_extent_map(em);
}
+ kfree(pages);
+ return ret;
+}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
- if (i_done != page_cnt) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, (page_cnt - i_done) << PAGE_SHIFT, true);
- }
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list);
+ if (ret < 0)
+ goto out;
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
- set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state);
+ /* Reached the limit */
+ if (max_sectors && max_sectors == *sectors_defragged)
+ break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
- for (i = 0; i < i_done; i++) {
- clear_page_dirty_for_io(pages[i]);
- ClearPageChecked(pages[i]);
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len;
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
- return i_done;
-
-out_unlock_range:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
out:
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
return ret;
-
}
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NUL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file_ra_state *ra = NULL;
- unsigned long last_index;
+ unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
- u64 last_len = 0;
- u64 skip = 0;
- u64 defrag_end = 0;
- u64 newer_off = range->start;
- unsigned long i;
- unsigned long ra_index = 0;
- int ret;
- int defrag_count = 0;
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
- unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)SZ_128K - 1);
- struct page **pages = NULL;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
if (isize == 0)
return 0;
@@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (extent_thresh == 0)
extent_thresh = SZ_256K;
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len) - 1;
+ } else {
+ /* Defrag until file end */
+ last_byte = isize - 1;
+ }
+
/*
- * If we were not given a file, allocate a readahead context. As
+ * If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
- if (!file) {
+ if (!ra) {
+ ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
- } else {
- ra = &file->f_ra;
- }
-
- pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_ra;
- }
-
- /* find the last page to defrag */
- if (range->start + range->len > range->start) {
- last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_SHIFT;
- } else {
- last_index = (isize - 1) >> PAGE_SHIFT;
- }
-
- if (newer_than) {
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- /*
- * we always align our defrag to help keep
- * the extents in the file evenly spaced
- */
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else
- goto out_ra;
- } else {
- i = range->start >> PAGE_SHIFT;
}
- if (!max_to_defrag)
- max_to_defrag = last_index - i + 1;
-
- /*
- * make writeback starts from i, so the defrag range can be
- * written sequentially.
- */
- if (i < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = i;
-
- while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
- /*
- * make sure we stop running if someone unmounts
- * the FS
- */
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- break;
- if (btrfs_defrag_cancelled(fs_info)) {
- btrfs_debug(fs_info, "defrag_file cancelled");
- ret = -EAGAIN;
- goto error;
- }
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
- if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
- extent_thresh, &last_len, &skip,
- &defrag_end, do_compress)){
- unsigned long next;
- /*
- * the should_defrag function tells us how much to skip
- * bump our counter by the suggested amount
- */
- next = DIV_ROUND_UP(skip, PAGE_SIZE);
- i = max(i + 1, next);
- continue;
- }
+ while (cur < last_byte) {
+ u64 cluster_end;
- if (!newer_than) {
- cluster = (PAGE_ALIGN(defrag_end) >>
- PAGE_SHIFT) - i;
- cluster = min(cluster, max_cluster);
- } else {
- cluster = max_cluster;
- }
+ /* The cluster size 256K should always be page aligned */
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
- if (i + cluster > ra_index) {
- ra_index = max(i, ra_index);
- if (ra)
- page_cache_sync_readahead(inode->i_mapping, ra,
- file, ra_index, cluster);
- ra_index += cluster;
- }
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
- } else {
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ btrfs_inode_unlock(inode, 0);
+ break;
}
- if (ret < 0) {
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
btrfs_inode_unlock(inode, 0);
- goto out_ra;
+ break;
}
-
- defrag_count += ret;
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress,
+ &sectors_defragged, max_to_defrag);
btrfs_inode_unlock(inode, 0);
-
- if (newer_than) {
- if (newer_off == (u64)-1)
- break;
-
- if (ret > 0)
- i += ret;
-
- newer_off = max(newer_off + 1,
- (u64)i << PAGE_SHIFT);
-
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else {
- break;
- }
- } else {
- if (ret > 0) {
- i += ret;
- last_len += ret << PAGE_SHIFT;
- } else {
- i++;
- last_len = 0;
- }
- }
+ if (ret < 0)
+ break;
+ cur = cluster_end + 1;
}
- ret = defrag_count;
-error:
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
+ if (ra_allocated)
+ kfree(ra);
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
}
-
- if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- }
-
-out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
}
- if (!file)
- kfree(ra);
- kfree(pages);
return ret;
}
@@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
@@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -1730,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (!strcmp(sizestr, "max"))
- new_size = device->bdev->bd_inode->i_size;
+ new_size = bdev_nr_bytes(device->bdev);
else {
if (sizestr[0] == '-') {
mod = -1;
@@ -1771,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_finish;
}
- if (new_size > device->bdev->bd_inode->i_size) {
+ if (new_size > bdev_nr_bytes(device->bdev)) {
ret = -EFBIG;
goto out_finish;
}
@@ -2261,9 +2222,8 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = fault_in_pages_writeable(ubuf + sk_offset,
- *buf_size - sk_offset);
- if (ret)
+ ret = -EFAULT;
+ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
break;
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
@@ -3136,12 +3096,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- /* Subpage defrag will be supported in later commits */
- if (root->fs_info->sectorsize < PAGE_SIZE) {
- ret = -ENOTTY;
- goto out;
- }
-
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -3176,7 +3130,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), file,
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -3220,6 +3174,7 @@ out:
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
@@ -3231,35 +3186,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto err_drop;
+ goto out;
}
if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
+
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
- strcmp("cancel", vol_args->name) == 0)
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret)
- goto out;
- /* Exclusive operation is now claimed */
+ goto err_drop;
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
- else
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
btrfs_exclop_finish(fs_info);
@@ -3271,17 +3230,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
-out:
- kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
@@ -3293,32 +3254,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto out_drop_write;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- cancel = (strcmp("cancel", vol_args->name) == 0);
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
- kfree(vol_args);
-out_drop_write:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
@@ -3379,22 +3346,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
- char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
+ args.devid = di_args->devid;
if (!btrfs_is_empty_uuid(di_args->uuid))
- s_uuid = di_args->uuid;
+ args.uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL);
-
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
ret = -ENODEV;
goto out;
@@ -4430,7 +4396,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_quota_rescan_args qsa = {0};
- int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4441,9 +4406,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
}
if (copy_to_user(arg, &qsa, sizeof(qsa)))
- ret = -EFAULT;
+ return -EFAULT;
- return ret;
+ return 0;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index a2e1f1f5c6e3..bbc45534ae9a 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- lockdep_assert_held(&eb->lock);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_write(&eb->lock);
}
#else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index c25dfd1a8a54..65cb0766e62d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -32,19 +32,19 @@
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
- * One segment represents at most one page of uncompressed data.
+ * One segment represents at most one sector of uncompressed data.
*
* 2.1 Segment header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size of the segment (not including the header).
- * Segment header never crosses page boundary, thus it's possible to
- * have at most 3 padding zeros at the end of the page.
+ * Segment header never crosses sector boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the sector.
*
* 2.2 Data Payload
- * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- * which is 4419 for a 4KiB page.
+ * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+ * which is 4419 for a 4KiB sectorsize.
*
- * Example:
+ * Example with 4K sectorsize:
* Page 1:
* 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
* 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
@@ -112,163 +112,174 @@ static inline size_t read_compress_length(const char *buf)
return le32_to_cpu(dlen);
}
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header
+ * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+ size_t compressed_size,
+ struct page **out_pages,
+ u32 *cur_out,
+ const u32 sectorsize)
+{
+ u32 sector_bytes_left;
+ u32 orig_out;
+ struct page *cur_page;
+ char *kaddr;
+
+ /*
+ * We never allow a segment header crossing sector boundary, previous
+ * run should ensure we have enough space left inside the sector.
+ */
+ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+
+ kaddr = kmap(cur_page);
+ write_compress_length(kaddr + offset_in_page(*cur_out),
+ compressed_size);
+ *cur_out += LZO_LEN;
+
+ orig_out = *cur_out;
+
+ /* Copy compressed data */
+ while (*cur_out - orig_out < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+ orig_out + compressed_size - *cur_out);
+
+ kunmap(cur_page);
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+ kaddr = kmap(cur_page);
+
+ memcpy(kaddr + offset_in_page(*cur_out),
+ compressed_data + *cur_out - orig_out, copy_len);
+
+ *cur_out += copy_len;
+ }
+
+ /*
+ * Check if we can fit the next segment header into the remaining space
+ * of the sector.
+ */
+ sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+ goto out;
+
+ /* The remaining size is not enough, pad it with zeros */
+ memset(kaddr + offset_in_page(*cur_out), 0,
+ sector_bytes_left);
+ *cur_out += sector_bytes_left;
+
+out:
+ kunmap(cur_page);
+ return 0;
+}
+
int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+ struct page *page_in = NULL;
+ char *sizes_ptr;
int ret = 0;
- char *data_in;
- char *cpage_out, *sizes_ptr;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
- unsigned long bytes_left;
- unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- size_t in_len;
- size_t out_len;
- char *buf;
- unsigned long tot_in = 0;
- unsigned long tot_out = 0;
- unsigned long pg_bytes_left;
- unsigned long out_offset;
- unsigned long bytes;
+ /* Points to the file offset of input data */
+ u64 cur_in = start;
+ /* Points to the current output byte */
+ u32 cur_out = 0;
+ u32 len = *total_out;
*out_pages = 0;
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
-
/*
- * store the size of all chunks of compressed data in
- * the first 4 bytes
+ * Skip the header for now, we will later come back and write the total
+ * compressed size
*/
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = page_address(out_page);
- out_offset = LZO_LEN;
- tot_out = LZO_LEN;
- pages[0] = out_page;
- nr_pages = 1;
- pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
- /* compress at most one page of data each time */
- in_len = min(len, PAGE_SIZE);
- while (tot_in < len) {
- ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
- &out_len, workspace->mem);
- if (ret != LZO_E_OK) {
- pr_debug("BTRFS: lzo in loop returned %d\n",
- ret);
+ cur_out += LZO_LEN;
+ while (cur_in < start + len) {
+ char *data_in;
+ const u32 sectorsize_mask = sectorsize - 1;
+ u32 sector_off = (cur_in - start) & sectorsize_mask;
+ u32 in_len;
+ size_t out_len;
+
+ /* Get the input page first */
+ if (!page_in) {
+ page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+ ASSERT(page_in);
+ }
+
+ /* Compress at most one sector of data each time */
+ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+ ASSERT(in_len);
+ data_in = kmap(page_in);
+ ret = lzo1x_1_compress(data_in +
+ offset_in_page(cur_in), in_len,
+ workspace->cbuf, &out_len,
+ workspace->mem);
+ kunmap(page_in);
+ if (ret < 0) {
+ pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
goto out;
}
- /* store the size of this chunk of compressed data */
- write_compress_length(cpage_out + out_offset, out_len);
- tot_out += LZO_LEN;
- out_offset += LZO_LEN;
- pg_bytes_left -= LZO_LEN;
-
- tot_in += in_len;
- tot_out += out_len;
-
- /* copy bytes from the working buffer into the pages */
- buf = workspace->cbuf;
- while (out_len) {
- bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
- memcpy(cpage_out + out_offset, buf, bytes);
-
- out_len -= bytes;
- pg_bytes_left -= bytes;
- buf += bytes;
- out_offset += bytes;
-
- /*
- * we need another page for writing out.
- *
- * Note if there's less than 4 bytes left, we just
- * skip to a new page.
- */
- if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
- pg_bytes_left == 0) {
- if (pg_bytes_left) {
- memset(cpage_out + out_offset, 0,
- pg_bytes_left);
- tot_out += pg_bytes_left;
- }
-
- /* we're done, don't allocate new page */
- if (out_len == 0 && tot_in >= len)
- break;
-
- if (nr_pages == nr_dest_pages) {
- out_page = NULL;
- ret = -E2BIG;
- goto out;
- }
-
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = page_address(out_page);
- pages[nr_pages++] = out_page;
-
- pg_bytes_left = PAGE_SIZE;
- out_offset = 0;
- }
- }
+ ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ pages, &cur_out, sectorsize);
+ if (ret < 0)
+ goto out;
+
+ cur_in += in_len;
- /* we're making it bigger, give up */
- if (tot_in > 8192 && tot_in < tot_out) {
+ /*
+ * Check if we're making it bigger after two sectors. And if
+ * it is so, give up.
+ */
+ if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
ret = -E2BIG;
goto out;
}
- /* we're all done */
- if (tot_in >= len)
- break;
-
- if (tot_out > max_out)
- break;
-
- bytes_left = len - tot_in;
- put_page(in_page);
-
- start += PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
- in_len = min(bytes_left, PAGE_SIZE);
- }
-
- if (tot_out >= tot_in) {
- ret = -E2BIG;
- goto out;
+ /* Check if we have reached page boundary */
+ if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ put_page(page_in);
+ page_in = NULL;
+ }
}
- /* store the size of all chunks of compressed data */
- sizes_ptr = page_address(pages[0]);
- write_compress_length(sizes_ptr, tot_out);
+ /* Store the size of all chunks of compressed data */
+ sizes_ptr = kmap_local_page(pages[0]);
+ write_compress_length(sizes_ptr, cur_out);
+ kunmap_local(sizes_ptr);
ret = 0;
- *total_out = tot_out;
- *total_in = tot_in;
+ *total_out = cur_out;
+ *total_in = cur_in - start;
out:
- *out_pages = nr_pages;
-
- if (in_page)
- put_page(in_page);
-
+ *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
@@ -283,6 +294,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
+ char *kaddr;
struct page *cur_page;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
@@ -290,9 +302,11 @@ static void copy_compressed_segment(struct compressed_bio *cb,
ASSERT(copy_len);
cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+ kaddr = kmap(cur_page);
memcpy(dest + *cur_in - orig_in,
- page_address(cur_page) + offset_in_page(*cur_in),
+ kaddr + offset_in_page(*cur_in),
copy_len);
+ kunmap(cur_page);
*cur_in += copy_len;
}
@@ -303,6 +317,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
+ char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
u32 len_in;
@@ -311,7 +326,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+ kaddr = kmap(cb->compressed_pages[0]);
+ len_in = read_compress_length(kaddr);
+ kunmap(cb->compressed_pages[0]);
cur_in += LZO_LEN;
/*
@@ -345,8 +362,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
ASSERT(cur_page);
- seg_len = read_compress_length(page_address(cur_page) +
- offset_in_page(cur_in));
+ kaddr = kmap(cur_page);
+ seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ kunmap(cur_page);
cur_in += LZO_LEN;
/* Copy the compressed segment payload into workspace */
@@ -431,7 +449,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = page_address(dest_page);
+ kaddr = kmap_local_page(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -441,6 +459,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
+ kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d8d268ca8aa7..0e239a4c3b26 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -60,8 +60,7 @@ enum btrfs_rbio_ops {
};
struct btrfs_raid_bio {
- struct btrfs_fs_info *fs_info;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
@@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+ btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->bbio->raid_map[0];
+ u64 num = rbio->bioc->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
h = table->table + bucket;
/* hold the lock for the bucket because we may be
@@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
__remove_rbio_from_cache(rbio);
@@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
spin_lock(&rbio->bio_list_lock);
@@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->bbio->raid_map[0] !=
- cur->bbio->raid_map[0])
+ if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
- h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+ h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
continue;
spin_lock(&cur->bio_list_lock);
@@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
int keep_cache = 0;
bucket = rbio_bucket(rbio);
- h = rbio->fs_info->stripe_hash_table->table + bucket;
+ h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
@@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- btrfs_put_bbio(rbio->bbio);
+ btrfs_put_bioc(rbio->bioc);
kfree(rbio);
}
@@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *extra;
if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio)
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bbio->max_errors;
+ 0 : rbio->bioc->max_errors;
if (atomic_read(&rbio->error) > max_errors)
err = BLK_STS_IOERR;
@@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_bio *bbio,
+ struct btrfs_io_context *bioc,
u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
int num_pages = rbio_nr_pages(stripe_len, real_stripes);
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
@@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
- rbio->bbio = bbio;
- rbio->fs_info = fs_info;
+ rbio->bioc = bioc;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->real_stripes = real_stripes;
@@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
BUG();
@@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
u64 disk_start;
- stripe = &rbio->bbio->stripes[stripe_nr];
+ stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
@@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_io_bio(bio)->device = stripe->dev;
+ bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
int i = 0;
start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bbio->raid_map[0];
+ stripe_offset = start - rbio->bioc->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ bio->bi_iter = btrfs_bio(bio)->iter;
bio_for_each_segment(bvec, bio, iter) {
rbio->bio_pages[page_index + i] = bvec.bv_page;
@@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
@@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
}
- if (likely(!bbio->num_tgtdevs))
+ if (likely(!bioc->num_tgtdevs))
goto write_data;
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bbio->tgtdev_map[stripe])
+ if (!bioc->tgtdev_map[stripe])
continue;
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
@@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
ret = rbio_add_io_page(rbio, &bio_list, page,
- rbio->bbio->tgtdev_map[stripe],
+ rbio->bioc->tgtdev_map[stripe],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -1339,12 +1336,12 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
{
u64 physical = bio->bi_iter.bi_sector;
int i;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
physical <<= 9;
- for (i = 0; i < rbio->bbio->num_stripes; i++) {
- stripe = &rbio->bbio->stripes[i];
+ for (i = 0; i < rbio->bioc->num_stripes; i++) {
+ stripe = &rbio->bioc->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
@@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
int i;
for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bbio->raid_map[i];
+ u64 stripe_start = rbio->bioc->raid_map[i];
if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
@@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
/*
@@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
@@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
@@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bbio->raid_map[faila] ==
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
RAID5_P_STRIPE) {
err = BLK_STS_IOERR;
goto cleanup;
@@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
@@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
@@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
- if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+ if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
__raid_recover_end_io(rbio);
return 0;
} else {
@@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2116,22 +2114,22 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io)
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int ret;
if (generic_io) {
- ASSERT(bbio->mirror_num == mirror_num);
- btrfs_io_bio(bio)->mirror_num = mirror_num;
+ ASSERT(bioc->mirror_num == mirror_num);
+ btrfs_bio(bio)->mirror_num = mirror_num;
}
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
@@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
btrfs_warn(fs_info,
- "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bbio->map_type);
+ (u64)bio->bi_iter.bi_size, bioc->map_type);
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(rbio);
return -EIO;
}
@@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
} else {
- btrfs_get_bbio(bbio);
+ btrfs_get_bioc(bioc);
}
/*
@@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
- * Caller must have already increased bio_counter for getting @bbio.
+ * Caller must have already increased bio_counter for getting @bioc.
*
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
*/
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors)
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ u64 stripe_len, struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
/*
- * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+ * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
* to the end position, so this search can start from the first parity
* stripe.
*/
for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
- if (bbio->stripes[i].dev == scrub_dev) {
+ if (bioc->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
@@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
- * We have already increased bio_counter when getting bbio, record it
+ * We have already increased bio_counter when getting bioc, record it
* so we can free it at rbio_orig_end_io().
*/
rbio->generic_bio_cnt = 1;
@@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
int stripe_offset;
int index;
- ASSERT(logical >= rbio->bbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+ ASSERT(logical >= rbio->bioc->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
@@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
@@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
else
BUG();
- if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
}
@@ -2435,7 +2433,7 @@ writeback:
page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
ret = rbio_add_io_page(rbio, &bio_list, page,
- bbio->tgtdev_map[rbio->scrubp],
+ bioc->tgtdev_map[rbio->scrubp],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
*/
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
if (rbio->faila >= 0 || rbio->failb >= 0) {
@@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
* the data, so the capability of the repair is declined.
* (In the case of RAID5, we can not repair anything)
*/
- if (dfail > rbio->bbio->max_errors - 1)
+ if (dfail > rbio->bioc->max_errors - 1)
goto cleanup;
/*
@@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bbio, length);
+ rbio = alloc_rbio(fs_info, bioc, length);
if (IS_ERR(rbio))
return NULL;
@@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
/*
- * When we get bbio, we have already increased bio_counter, record it
+ * When we get bioc, we have already increased bio_counter, record it
* so we can free it at rbio_orig_end_io()
*/
rbio->generic_bio_cnt = 1;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2503485db859..72c00fc284b5 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map)
struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len);
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical);
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 06713a8fe26b..eb96fdc3be25 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -227,7 +227,7 @@ start_machine:
}
static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
- struct btrfs_bio *bbio)
+ struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
@@ -275,11 +275,11 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
kref_init(&zone->refcnt);
zone->elems = 0;
zone->device = dev; /* our device always sits at index 0 */
- for (i = 0; i < bbio->num_stripes; ++i) {
+ for (i = 0; i < bioc->num_stripes; ++i) {
/* bounds have already been checked */
- zone->devs[i] = bbio->stripes[i].dev;
+ zone->devs[i] = bioc->stripes[i].dev;
}
- zone->ndevs = bbio->num_stripes;
+ zone->ndevs = bioc->num_stripes;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
@@ -309,7 +309,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
int ret;
struct reada_extent *re = NULL;
struct reada_extent *re_exist = NULL;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
u64 length;
@@ -345,28 +345,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
*/
length = fs_info->nodesize;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0);
- if (ret || !bbio || length < fs_info->nodesize)
+ &length, &bioc, 0);
+ if (ret || !bioc || length < fs_info->nodesize)
goto error;
- if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+ if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
btrfs_err(fs_info,
"readahead: more than %d copies not supported",
BTRFS_MAX_MIRRORS);
goto error;
}
- real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
for (nzones = 0; nzones < real_stripes; ++nzones) {
struct reada_zone *zone;
- dev = bbio->stripes[nzones].dev;
+ dev = bioc->stripes[nzones].dev;
/* cannot read ahead on missing device. */
if (!dev->bdev)
continue;
- zone = reada_find_zone(dev, logical, bbio);
+ zone = reada_find_zone(dev, logical, bioc);
if (!zone)
continue;
@@ -464,7 +464,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!have_zone)
goto error;
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return re;
error:
@@ -488,7 +488,7 @@ error:
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(re);
return re_exist;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d2062d5f71dd..e2b9f8616501 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.root;
+ ref_root = generic_ref->tree_ref.owning_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.ref_root;
+ ref_root = generic_ref->data_ref.owning_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 9b0814318e72..e0f93b357548 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
}
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
if (page) {
@@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
- int ret;
+ int ret = 0;
u64 i, tail_len, chunk_count;
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 914d403b4415..33a0ee7ac590 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -25,6 +25,7 @@
#include "backref.h"
#include "misc.h"
#include "subpage.h"
+#include "zoned.h"
/*
* Relocation overview
@@ -1145,9 +1146,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1156,9 +1157,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1367,8 +1368,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
blocksize, path->nodes[level]->start);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1376,8 +1377,8 @@ again:
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+ true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1386,8 +1387,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1396,8 +1397,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2473,9 +2474,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
upper->eb->start);
- ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb));
+ btrfs_header_owner(upper->eb),
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
@@ -2691,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
@@ -2852,31 +2857,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
- /*
- * On a zoned filesystem, we cannot preallocate the file region.
- * Instead, we dirty and fiemap_write the region.
- */
- if (btrfs_is_zoned(inode->root->fs_info)) {
- struct btrfs_root *root = inode->root;
- struct btrfs_trans_handle *trans;
-
- end = cluster->end - offset + 1;
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- i_size_write(&inode->vfs_inode, end);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- return btrfs_end_transaction(trans);
- }
-
btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
@@ -2903,9 +2883,8 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return ret;
}
-static noinline_for_stack
-int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
- u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+ u64 start, u64 end, u64 block_start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
@@ -3084,7 +3063,6 @@ release_page:
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
@@ -3105,7 +3083,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_extent_mapping(inode, cluster->start - offset,
+ ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out;
@@ -3114,8 +3092,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
- if (btrfs_is_zoned(fs_info) && !ret)
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3770,12 +3746,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
- if (btrfs_is_zoned(trans->fs_info))
- flags &= ~BTRFS_INODE_PREALLOC;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3790,7 +3762,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
@@ -4063,6 +4036,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->block_group->start,
rc->block_group->length);
+ ret = btrfs_zone_finish(rc->block_group);
+ WARN_ON(ret && ret != -EAGAIN);
+
while (1) {
int finishes_stage;
@@ -4386,8 +4362,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
if (!rc)
return 0;
- BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
- root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 702dc5441f03..12ceb14a1141 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -39,10 +39,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
need_reset = 1;
}
if (need_reset) {
- memset(&item->generation_v2, 0,
- sizeof(*item) - offsetof(struct btrfs_root_item,
- generation_v2));
-
+ /* Clear all members from generation_v2 onwards. */
+ memset_startat(item, 0, generation_v2);
generate_random_guid(item->uuid);
}
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 088641ba7a8e..cf82ea6f54fb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -57,7 +57,7 @@ struct scrub_ctx;
struct scrub_recover {
refcount_t refs;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 map_length;
};
@@ -254,7 +254,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
return spage->recover &&
- (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -798,7 +798,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(recover->bbio);
+ btrfs_put_bioc(recover->bioc);
kfree(recover);
}
}
@@ -1027,8 +1027,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_other = sblocks_for_recheck + mirror_index;
} else {
struct scrub_recover *r = sblock_bad->pagev[0]->recover;
- int max_allowed = r->bbio->num_stripes -
- r->bbio->num_tgtdevs;
+ int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
@@ -1218,14 +1217,14 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
return 2;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
return 3;
else
- return (int)bbio->num_stripes;
+ return (int)bioc->num_stripes;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
@@ -1269,7 +1268,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
u64 flags = original_sblock->pagev[0]->flags;
u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
@@ -1288,7 +1287,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
- bbio = NULL;
+ bioc = NULL;
/*
* With a length of sectorsize, each returned stripe represents
@@ -1296,27 +1295,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < sublen) {
- btrfs_put_bbio(bbio);
+ logical, &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < sublen) {
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
- recover->bbio = bbio;
+ recover->bioc = bioc;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
- nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+ nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
@@ -1348,17 +1347,17 @@ leave_nomem:
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
- bbio->map_type,
- bbio->raid_map,
+ bioc->map_type,
+ bioc->raid_map,
mapped_length,
- bbio->num_stripes -
- bbio->num_tgtdevs,
+ bioc->num_stripes -
+ bioc->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bbio->stripes[stripe_index].physical +
+ spage->physical = bioc->stripes[stripe_index].physical +
stripe_offset;
- spage->dev = bbio->stripes[stripe_index].dev;
+ spage->dev = bioc->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
spage->physical_for_dev_replace =
@@ -1401,7 +1400,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_end_io = scrub_bio_wait_endio;
mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+ ret = raid56_parity_recover(bio, spage->recover->bioc,
spage->recover->map_length,
mirror_num, 0);
if (ret)
@@ -1423,7 +1422,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
if (!first_page->dev->bdev)
goto out;
- bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
@@ -1480,7 +1479,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!spage->page);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
@@ -1562,7 +1561,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage_bad->dev->bdev);
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
@@ -1676,7 +1675,7 @@ again:
sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_wr_bio);
sbio->bio = bio;
}
@@ -2102,7 +2101,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_rd_bio);
sbio->bio = bio;
}
@@ -2203,7 +2202,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
@@ -2211,27 +2210,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
if (WARN_ON(!sctx->is_dev_replace ||
- !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
* there's a bug in scrub_remap_extent()/btrfs_map_block().
*/
- goto bbio_out;
+ goto bioc_out;
}
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc, length);
if (!rbio)
goto rbio_out;
@@ -2249,9 +2248,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
@@ -2826,7 +2825,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 length;
int ret;
@@ -2838,17 +2837,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
- length, sparity->scrub_dev,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
@@ -2860,9 +2859,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -2901,7 +2900,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 flags;
int ret;
int slot;
@@ -3044,22 +3043,22 @@ again:
extent_len);
mapped_length = extent_len;
- bbio = NULL;
+ bioc = NULL;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bbio,
+ extent_logical, &mapped_length, &bioc,
0);
if (!ret) {
- if (!bbio || mapped_length < extent_len)
+ if (!bioc || mapped_length < extent_len)
ret = -EIO;
}
if (ret) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
- extent_physical = bbio->stripes[0].physical;
- extent_mirror_num = bbio->mirror_num;
- extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
@@ -3956,7 +3955,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
int ret;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
@@ -4068,6 +4067,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
@@ -4115,7 +4115,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4288,11 +4288,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
@@ -4309,20 +4310,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
int *extent_mirror_num)
{
u64 mapped_length;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
mapped_length = extent_len;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bbio, 0);
- if (ret || !bbio || mapped_length < extent_len ||
- !bbio->stripes[0].dev->bdev) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len ||
+ !bioc->stripes[0].dev->bdev) {
+ btrfs_put_bioc(bioc);
return;
}
- *extent_physical = bbio->stripes[0].physical;
- *extent_mirror_num = bbio->mirror_num;
- *extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ *extent_physical = bioc->stripes[0].physical;
+ *extent_mirror_num = bioc->mirror_num;
+ *extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 72f9b865e847..040324d71118 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -84,6 +84,8 @@ struct send_ctx {
u64 total_send_size;
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+ /* Protocol version compatibility requested */
+ u32 proto;
struct btrfs_root *send_root;
struct btrfs_root *parent_root;
@@ -312,6 +314,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
sctx->parent_root->root_key.objectid : 0));
}
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+ switch (sctx->proto) {
+ case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ default: return false;
+ }
+}
+
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
@@ -2720,19 +2732,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
if (S_ISDIR(sctx->cur_inode_mode)) {
ret = did_create_dir(sctx, sctx->cur_ino);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ else if (ret > 0)
+ return 0;
}
- ret = send_create_inode(sctx, sctx->cur_ino);
- if (ret < 0)
- goto out;
-
-out:
- return ret;
+ return send_create_inode(sctx, sctx->cur_ino);
}
struct recorded_ref {
@@ -7276,6 +7281,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->flags = arg->flags;
+ if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+ if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+ ret = -EPROTO;
+ goto out;
+ }
+ /* Zero means "use the highest version" */
+ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+ } else {
+ sctx->proto = 1;
+ }
+
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
ret = -EBADF;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index de91488b7cd0..23bcefc84e49 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -48,6 +48,7 @@ struct btrfs_tlv_header {
enum btrfs_send_cmd {
BTRFS_SEND_C_UNSPEC,
+ /* Version 1 */
BTRFS_SEND_C_SUBVOL,
BTRFS_SEND_C_SNAPSHOT,
@@ -76,6 +77,12 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_END,
BTRFS_SEND_C_UPDATE_EXTENT,
+ __BTRFS_SEND_C_MAX_V1,
+
+ /* Version 2 */
+ __BTRFS_SEND_C_MAX_V2,
+
+ /* End */
__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 5ada02e0e629..48d77f360a24 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -414,9 +414,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
{
lockdep_assert_held(&info->lock);
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
+ /* The free space could be negative in case of overcommit */
+ btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
info->flags,
- info->total_bytes - btrfs_space_info_used(info, true),
+ (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
@@ -884,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
+ const bool aborted = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info);
@@ -897,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (ticket->steal &&
+ if (!aborted && ticket->steal &&
steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
remove_ticket(space_info, ticket);
- ticket->error = -ENOSPC;
+ if (aborted)
+ ticket->error = -EIO;
+ else
+ ticket->error = -ENOSPC;
wake_up(&ticket->wait);
/*
@@ -915,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
* here to see if we can make progress with the next ticket in
* the list.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ if (!aborted)
+ btrfs_try_granting_tickets(fs_info, space_info);
}
return (tickets_id != space_info->tickets_id);
}
@@ -1171,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
@@ -1201,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
} else {
flush_state = 0;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+
}
spin_unlock(&space_info->lock);
}
+ return;
+
+aborted_fs:
+ maybe_fail_all_tickets(fs_info, space_info);
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
}
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index cb10e56ee31e..29bd8c7a7706 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -63,11 +63,41 @@
* This means a slightly higher tree locking latency.
*/
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+ unsigned int cur = 0;
+ unsigned int nr_bits;
+
+ ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+ nr_bits = PAGE_SIZE / sectorsize;
+ subpage_info->bitmap_nr_bits = nr_bits;
+
+ subpage_info->uptodate_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->error_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->dirty_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->writeback_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->ordered_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->checked_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->total_nr_bits = cur;
+}
+
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
- struct btrfs_subpage *subpage = NULL;
- int ret;
+ struct btrfs_subpage *subpage;
/*
* We have cases like a dummy extent buffer page, which is not mappped
@@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
*/
if (page->mapping)
ASSERT(PageLocked(page));
+
/* Either not subpage, or the page already has private attached */
if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
return 0;
- ret = btrfs_alloc_subpage(fs_info, &subpage, type);
- if (ret < 0)
- return ret;
+ subpage = btrfs_alloc_subpage(fs_info, type);
+ if (IS_ERR(subpage))
+ return PTR_ERR(subpage);
+
attach_page_private(page, subpage);
return 0;
}
@@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
btrfs_free_subpage(subpage);
}
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type)
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type)
{
- if (fs_info->sectorsize == PAGE_SIZE)
- return 0;
+ struct btrfs_subpage *ret;
+ unsigned int real_size;
+
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+ real_size = struct_size(ret, bitmaps,
+ BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ ret = kzalloc(real_size, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
- *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
- if (!*ret)
- return -ENOMEM;
- spin_lock_init(&(*ret)->lock);
+ spin_lock_init(&ret->lock);
if (type == BTRFS_SUBPAGE_METADATA) {
- atomic_set(&(*ret)->eb_refs, 0);
+ atomic_set(&ret->eb_refs, 0);
} else {
- atomic_set(&(*ret)->readers, 0);
- atomic_set(&(*ret)->writers, 0);
+ atomic_set(&ret->readers, 0);
+ atomic_set(&ret->writers, 0);
}
- return 0;
+ return ret;
}
void btrfs_free_subpage(struct btrfs_subpage *subpage)
@@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
u32 orig_len = *len;
*start = max_t(u64, page_offset(page), orig_start);
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
- orig_start + orig_len) - *start;
+ /*
+ * For certain call sites like btrfs_drop_pages(), we may have pages
+ * beyond the target range. In that case, just set @len to 0, subpage
+ * helpers can handle @len == 0 without any problem.
+ */
+ if (page_offset(page) >= orig_start + orig_len)
+ *len = 0;
+ else
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
@@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, page, start, len);
+ /*
+ * We have call sites passing @lock_page into
+ * extent_clear_unlock_delalloc() for compression path.
+ *
+ * This @locked_page is locked by plain lock_page(), thus its
+ * subpage::writers is 0. Handle them in a special way.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ return true;
+
ASSERT(atomic_read(&subpage->writers) >= nbits);
return atomic_sub_and_test(nbits, &subpage->writers);
}
@@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
unlock_page(page);
}
-/*
- * Convert the [start, start + len) range into a u16 bitmap
- *
- * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
- */
-static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
{
- const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
- const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned int found_zero;
- btrfs_subpage_assert(fs_info, page, start, len);
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ if (found_zero == start + nbits)
+ return true;
+ return false;
+}
- /*
- * Here nbits can be 16, thus can go beyond u16 range. We make the
- * first left shift to be calculate in unsigned long (at least u32),
- * then truncate the result to u16.
- */
- return (u16)(((1UL << nbits) - 1) << bit_start);
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ if (found_set == start + nbits)
+ return true;
+ return false;
}
+#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, page, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+ bitmap_test_range_all_set(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+ bitmap_test_range_all_zero(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap |= tmp;
- if (subpage->uptodate_bitmap == U16_MAX)
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
SetPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap &= ~tmp;
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
ClearPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap &= ~tmp;
- if (subpage->error_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
ClearPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
set_page_dirty(page);
}
@@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
bool last = false;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap &= ~tmp;
- if (subpage->dirty_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
@@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
set_page_writeback(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap &= ~tmp;
- if (subpage->writeback_bitmap == 0) {
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
ASSERT(PageWriteback(page));
end_page_writeback(page);
}
@@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap &= ~tmp;
- if (subpage->ordered_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
ClearPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ SetPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ name, start, len); \
unsigned long flags; \
bool ret; \
\
spin_lock_irqsave(&subpage->lock, flags); \
- ret = ((subpage->name##_bitmap & tmp) == tmp); \
+ ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ len >> fs_info->sectorsize_bits); \
spin_unlock_irqrestore(&subpage->lock, flags); \
return ret; \
}
@@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
@@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(PagePrivate(page) && page->private);
- ASSERT(subpage->dirty_bitmap == 0);
+ ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Handle different locked pages with different page sizes:
+ *
+ * - Page locked by plain lock_page()
+ * It should not have any subpage::writers count.
+ * Can be unlocked by unlock_page().
+ * This is the most common locked page for __extent_writepage() called
+ * inside extent_write_cache_pages() or extent_write_full_page().
+ * Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ * There is only one caller, all pages except @locked_page for
+ * extent_write_locked_range().
+ * In this case, we have to call subpage helper to handle the case.
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For regular page size case, we just unlock the page */
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * For subpage case, there are two types of locked page. With or
+ * without writers number.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers))
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 0120948f37a1..7accb5c40d33 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,10 +6,38 @@
#include <linux/spinlock.h>
/*
- * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
- * is sufficient. Regular bitmap_* is not used due to size reasons.
+ * Extra info for subpapge bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset /- error_offset /- dirty_offset
+ * | | |
+ * v v v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------- total_nr_bits ---------------->|
*/
-#define BTRFS_SUBPAGE_BITMAP_SIZE 16
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_start indicates where the bitmap starts, the length is always
+ * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int error_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
/*
* Structure to trace status of each sector inside a page, attached to
@@ -18,10 +46,6 @@
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- u16 uptodate_bitmap;
- u16 error_bitmap;
- u16 dirty_bitmap;
- u16 writeback_bitmap;
/*
* Both data and metadata needs to track how many readers are for the
* page.
@@ -38,14 +62,11 @@ struct btrfs_subpage {
* manages whether the subpage can be detached.
*/
atomic_t eb_refs;
- /* Structures only used by data */
- struct {
- atomic_t writers;
- /* Tracke pending ordered extent in this sector */
- u16 ordered_bitmap;
- };
+ /* Structures only used by data */
+ atomic_t writers;
};
+ unsigned long bitmaps[];
};
enum btrfs_subpage_type {
@@ -53,15 +74,15 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page);
/* Allocate additional data where page represents more than one sector */
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type);
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
@@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 537d90bf5d84..a1c54a2c787c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1705,7 +1705,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_close_devices;
}
- bdev = fs_devices->latest_bdev;
+ bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
@@ -2006,7 +2006,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
} else {
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
@@ -2463,30 +2463,16 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_device *dev, *first_dev = NULL;
/*
- * Lightweight locking of the devices. We should not need
- * device_list_mutex here as we only read the device data and the list
- * is protected by RCU. Even if a device is deleted during the list
- * traversals, we'll get valid data, the freeing callback will wait at
- * least until the rcu_read_unlock.
+ * There should be always a valid pointer in latest_dev, it may be stale
+ * for a short moment in case it's being deleted but still valid until
+ * the end of RCU grace period.
*/
rcu_read_lock();
- list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
-
- if (first_dev)
- seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
- else
- WARN_ON(1);
+ seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
rcu_read_unlock();
+
return 0;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 25a6f587852b..f9eff3b0f77c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
} else
val = can_modify_feature(fa);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -330,7 +330,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "0\n");
+ return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
@@ -345,12 +345,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+ btrfs_super_csum_name(i));
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
@@ -358,7 +358,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
static ssize_t send_stream_version_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+ return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
@@ -378,9 +378,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
int i;
for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i ? " " : ""), rescue_opts[i]);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_rescue_options,
@@ -394,10 +393,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
/* 4K sector size is also supported with 64K page size */
if (PAGE_SIZE == SZ_64K)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+ ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
/* Only sectorsize == PAGE_SIZE is now supported */
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
}
@@ -437,7 +436,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@@ -448,7 +447,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@@ -459,8 +458,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_bitmap_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -470,7 +469,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@@ -481,8 +480,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_extent_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -492,8 +491,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.iops_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
}
static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
@@ -523,8 +522,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.kbps_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
@@ -553,8 +552,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(fs_info->discard_ctl.max_discard_size));
+ return sysfs_emit(buf, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
@@ -627,7 +626,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -673,7 +672,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
/*
@@ -771,7 +770,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@@ -819,7 +818,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -829,8 +828,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -840,7 +838,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -852,7 +850,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return sysfs_emit(buf, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@@ -890,8 +888,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%pU\n",
- fs_info->fs_devices->metadata_uuid);
+ return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
}
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
@@ -902,9 +899,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
- return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(fs_info->csum_shash));
+ return sysfs_emit(buf, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
}
BTRFS_ATTR(, checksum, btrfs_checksum_show);
@@ -941,7 +938,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
str = "UNKNOWN\n";
break;
}
- return scnprintf(buf, PAGE_SIZE, "%s", str);
+ return sysfs_emit(buf, "%s", str);
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
@@ -950,7 +947,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", fs_info->generation);
}
BTRFS_ATTR(, generation, btrfs_generation_show);
@@ -1028,8 +1025,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
- ret = scnprintf(buf, PAGE_SIZE, "%d\n",
- READ_ONCE(fs_info->bg_reclaim_threshold));
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
return ret;
}
@@ -1471,7 +1467,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
@@ -1484,7 +1480,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
@@ -1498,7 +1494,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
@@ -1509,8 +1505,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
struct btrfs_device *device = container_of(kobj, struct btrfs_device,
devid_kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(device->scrub_speed_max));
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
}
static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
@@ -1538,7 +1533,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
@@ -1549,14 +1544,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
devid_kobj);
if (!device->dev_stats_valid)
- return scnprintf(buf, PAGE_SIZE, "invalid\n");
+ return sysfs_emit(buf, "invalid\n");
/*
* Print all at once so we get a snapshot of all values from the same
* time. Keep them in sync and in order of definition of
* btrfs_dev_stat_values.
*/
- return scnprintf(buf, PAGE_SIZE,
+ return sysfs_emit(buf,
"write_errs %d\n"
"read_errs %d\n"
"flush_errs %d\n"
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index df54cdfdc250..2a95f7224e18 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 73e96d505f4f..c2e72e7a8ff0 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -112,7 +112,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -143,7 +143,7 @@ static int test_find_delalloc(u32 sectorsize)
}
set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -177,14 +177,14 @@ static int test_find_delalloc(u32 sectorsize)
goto out_bits;
}
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
}
- if (end != (u64)-1) {
+ if (end != test_start + PAGE_SIZE - 1) {
test_err("did not return the proper end offset");
goto out_bits;
}
@@ -198,7 +198,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
/* We unlocked it in the previous test */
lock_page(locked_page);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
/*
* Currently if we fail to find dirty pages in the delalloc range we
* will adjust max_bytes down to PAGE_SIZE and then re-search. If
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index c9874b12d337..cac89c388131 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 14b9fdc8aaa9..1c3a1189c0bd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -283,7 +283,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
@@ -331,7 +331,7 @@ loop:
*/
kfree(cur_trans);
goto loop;
- } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ } else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
kfree(cur_trans);
return -EROFS;
@@ -579,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool do_chunk_alloc = false;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -991,8 +991,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
- if (TRANS_ABORTED(trans) ||
- test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
+ if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
err = trans->aborted;
@@ -2155,7 +2154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* abort to prevent writing a new superblock that reflects a
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
- if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
goto cleanup_transaction;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f7efc26aa82a..8ab33caf016f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,7 +94,7 @@ enum {
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
@@ -207,7 +207,7 @@ again:
}
atomic_inc(&root->log_writers);
- if (ctx && !ctx->logging_new_name) {
+ if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -368,25 +368,11 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
@@ -403,10 +389,22 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
item_size = btrfs_item_size_nr(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ /* Our caller must have done a search for the key for us. */
+ ASSERT(path->nodes[0] != NULL);
+
+ /*
+ * And the slot must point to the exact key or the slot where the key
+ * should be at (the first item with a key greater than 'key')
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ ret = btrfs_comp_cpu_keys(&found_key, key);
+ ASSERT(ret >= 0);
+ } else {
+ ret = 1;
+ }
if (ret == 0) {
char *src_copy;
@@ -585,6 +583,36 @@ no_copy:
}
/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
@@ -761,7 +789,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
@@ -893,11 +921,11 @@ out:
* item
*/
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
+ struct btrfs_root *root = dir->root;
struct inode *inode;
char *name;
int name_len;
@@ -926,7 +954,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+ ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
name_len);
if (ret)
goto out;
@@ -939,9 +967,11 @@ out:
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
@@ -950,29 +980,34 @@ static noinline int inode_in_dir(struct btrfs_root *root,
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -1084,7 +1119,7 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root, dir, inode,
+ ret = btrfs_unlink_inode(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
@@ -1155,7 +1190,7 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
@@ -1182,8 +1217,10 @@ next:
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1192,8 +1229,10 @@ next:
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1313,7 +1352,7 @@ again:
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
@@ -1374,10 +1413,11 @@ out:
return ret;
}
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static int add_link(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode, const char *name,
int namelen, u64 ref_index)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_dir_item *dir_item;
struct btrfs_key key;
struct btrfs_path *path;
@@ -1411,7 +1451,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = -ENOENT;
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
name, namelen);
if (ret)
goto out;
@@ -1517,10 +1557,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index,
+ name, namelen);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1555,7 +1597,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
@@ -1571,7 +1613,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = add_link(trans, root, dir, inode, name, namelen,
+ ret = add_link(trans, dir, inode, name, namelen,
ref_index);
if (ret)
goto out;
@@ -1580,6 +1622,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
@@ -1936,8 +1979,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_key log_key;
struct inode *dir;
u8 log_type;
- int exists;
- int ret = 0;
+ bool exists;
+ int ret;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bool name_added = false;
@@ -1957,12 +2000,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1977,7 +2020,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = -EINVAL;
goto out;
}
- if (IS_ERR_OR_NULL(dst_di)) {
+
+ if (IS_ERR(dst_di)) {
+ ret = PTR_ERR(dst_di);
+ goto out;
+ } else if (!dst_di) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -2003,7 +2050,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
if (!exists)
goto out;
- ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
+ ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
if (ret)
goto out;
@@ -2233,13 +2280,13 @@ out:
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
int ret;
struct extent_buffer *eb;
int slot;
@@ -2281,7 +2328,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
+ if (!log_di) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -2300,7 +2347,7 @@ again:
}
inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
ret = btrfs_run_delayed_items(trans);
@@ -2482,7 +2529,9 @@ again:
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -2511,7 +2560,7 @@ again:
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
+ ret = check_item_in_log(trans, log, path,
log_path, dir,
&found_key);
if (ret)
@@ -3019,9 +3068,6 @@ static void wait_for_writer(struct btrfs_root *root)
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- if (!ctx)
- return;
-
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
@@ -3310,7 +3356,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* writing the super here would result in transid mismatches. If there
* is an error here just bail.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EIO;
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
@@ -3434,6 +3480,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
if (inode->logged_trans == trans->transid)
return true;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return false;
+
/*
* The inode's logged_trans is always 0 when we load it (because it is
* not persisted in the inode item or elsewhere). So if it is 0, the
@@ -3472,10 +3521,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
* This optimizations allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index)
{
struct btrfs_root *log;
struct btrfs_dir_item *di;
@@ -3485,11 +3534,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
u64 dir_ino = btrfs_ino(dir);
if (!inode_logged(trans, dir))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
mutex_lock(&dir->log_mutex);
@@ -3537,49 +3586,36 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (err == -ENOSPC) {
+ if (err < 0)
btrfs_set_log_full_commit(trans);
- err = 0;
- } else if (err < 0 && err != -ENOENT) {
- /* ENOENT can be returned if the entry hasn't been fsynced yet */
- btrfs_abort_transaction(trans, err);
- }
-
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
u64 index;
int ret;
if (!inode_logged(trans, inode))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
-
- return ret;
}
/*
@@ -3615,31 +3651,231 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ char *ins_data = NULL;
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size_nr(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc(count * sizeof(u32) +
+ count * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size_nr(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second items starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ int key_type,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src = path->nodes[0];
+ const int nritems = btrfs_header_nritems(src);
+ const u64 ino = btrfs_ino(inode);
+ const bool inode_logged_before = inode_logged(trans, inode);
+ u64 last_logged_key_offset;
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ last_logged_key_offset = inode->last_dir_item_offset;
+ else
+ last_logged_key_offset = inode->last_dir_index_offset;
+
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != key_type) {
+ last_found = true;
+ break;
+ }
+
+ ctx->last_dir_item_offset = key.offset;
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * resulted always in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if ((btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (!inode_logged_before)
+ goto add_to_batch;
+
+ /*
+ * If we were logged before and have logged dir items, we can skip
+ * checking if any item with a key offset larger than the last one
+ * we logged is in the log tree, saving time and avoiding adding
+ * contention on the log tree.
+ */
+ if (key.offset > last_logged_key_offset)
+ goto add_to_batch;
+ /*
+ * Check if the key was already logged before. If not we can add
+ * it to a batch for bulk insertion.
+ */
+ ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ btrfs_release_path(dst_path);
+ goto add_to_batch;
+ }
+
+ /*
+ * Item exists in the log. Overwrite the item in the log if it
+ * has different content or do nothing if it has exactly the same
+ * content. And then flush the current batch if any - do it after
+ * overwriting the current item, or we would deadlock otherwise,
+ * since we are holding a path for the existing item.
+ */
+ ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
+ if (ret < 0)
+ return ret;
+
+ if (batch_size > 0) {
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ batch_size = 0;
+ }
+ continue;
+add_to_batch:
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ int ret;
+
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
int err = 0;
int ret;
- int i;
- int nritems;
u64 first_offset = min_offset;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
@@ -3713,62 +3949,14 @@ search:
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
-
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
-
- if (need_resched()) {
- btrfs_release_path(path);
- cond_resched();
- goto search;
- }
-
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
+ ret = process_dir_items_leaf(trans, inode, path, dst_path,
+ key_type, ctx);
+ if (ret != 0) {
+ if (ret < 0)
err = ret;
- goto done;
- }
-
- /*
- * We must make sure that when we log a directory entry,
- * the corresponding inode, after log replay, has a
- * matching link count. For example:
- *
- * touch foo
- * mkdir mydir
- * sync
- * ln foo mydir/bar
- * xfs_io -c "fsync" mydir
- * <crash>
- * <mount fs and log replay>
- *
- * Would result in a fsync log that when replayed, our
- * file inode would have a link count of 1, but we get
- * two directory entries pointing to the same inode.
- * After removing one of the names, it would not be
- * possible to remove the other name, which resulted
- * always in stale file handle errors, and would not
- * be possible to rmdir the parent directory, since
- * its i_size could never decrement to the value
- * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
- */
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(src, di, &tmp);
- if (ctx &&
- (btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- tmp.type != BTRFS_ROOT_ITEM_KEY)
- ctx->log_new_dentries = true;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
@@ -3782,21 +3970,26 @@ search:
err = ret;
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != key_type) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
- &tmp);
+ &min_key);
if (ret)
err = ret;
else
- last_offset = tmp.offset;
+ last_offset = min_key.offset;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
@@ -3829,7 +4022,7 @@ done:
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
@@ -3839,11 +4032,33 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
int ret;
int key_type = BTRFS_DIR_ITEM_KEY;
+ /*
+ * If this is the first time we are being logged in the current
+ * transaction, or we were logged before but the inode was evicted and
+ * reloaded later, in which case its logged_trans is 0, reset the values
+ * of the last logged key offsets. Note that we don't use the helper
+ * function inode_logged() here - that is because the function returns
+ * true after an inode eviction, assuming the worst case as it can not
+ * know for sure if the inode was logged before. So we can not skip key
+ * searches in the case the inode was evicted, because it may not have
+ * been logged in this transaction and may have been logged in a past
+ * transaction, so we need to reset the last dir item and index offsets
+ * to (u64)-1.
+ */
+ if (inode->logged_trans != trans->transid) {
+ inode->last_dir_item_offset = (u64)-1;
+ inode->last_dir_index_offset = (u64)-1;
+ }
again:
min_key = 0;
max_key = 0;
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ ctx->last_dir_item_offset = inode->last_dir_item_offset;
+ else
+ ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path, key_type,
ctx, min_key, &max_key);
if (ret)
return ret;
@@ -3853,8 +4068,11 @@ again:
}
if (key_type == BTRFS_DIR_ITEM_KEY) {
+ inode->last_dir_item_offset = ctx->last_dir_item_offset;
key_type = BTRFS_DIR_INDEX_KEY;
goto again;
+ } else {
+ inode->last_dir_index_offset = ctx->last_dir_item_offset;
}
return 0;
}
@@ -3865,17 +4083,21 @@ again:
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ if (!inode_logged(trans, inode))
+ return 0;
+
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
@@ -3892,7 +4114,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
@@ -3917,6 +4139,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ int ret;
+
+ do {
+ ret = btrfs_truncate_inode_items(trans, log_root, inode,
+ new_size, min_type, NULL);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
@@ -4089,6 +4326,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
+ struct btrfs_item_batch batch;
char *ins_data;
int i;
struct list_head ordered_sums;
@@ -4103,13 +4341,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = nr;
for (i = 0; i < nr; i++) {
ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ batch.total_data_size += ins_sizes[i];
btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
}
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret) {
kfree(ins_data);
return ret;
@@ -4321,13 +4563,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
static int log_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_root *root,
+ struct btrfs_inode *inode,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_drop_extents_args drop_args = { 0 };
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct btrfs_map_token token;
@@ -4340,14 +4582,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- drop_args.path = path;
- drop_args.start = em->start;
- drop_args.end = em->start + em->len;
- drop_args.replace_extent = true;
- drop_args.extent_item_size = sizeof(*fi);
- ret = btrfs_drop_extents(trans, log, inode, &drop_args);
- if (ret)
- return ret;
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (inode_logged(trans, inode)) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
@@ -4505,13 +4758,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
- do {
- ret = btrfs_truncate_inode_items(trans,
- root->log_root,
- inode, truncate_offset,
- BTRFS_EXTENT_DATA_KEY,
- NULL);
- } while (ret == -EAGAIN);
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ truncate_offset,
+ BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
@@ -4538,7 +4787,6 @@ out:
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
@@ -4603,7 +4851,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, ctx);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -4692,11 +4940,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path)
{
+ struct btrfs_root *root = inode->root;
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -4770,10 +5018,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -5050,7 +5298,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
- ret = btrfs_log_inode(trans, root,
+ ret = btrfs_log_inode(trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
ctx);
@@ -5110,8 +5358,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, ctx);
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5222,7 +5469,7 @@ again:
&other_ino, &other_parent);
if (ret < 0) {
return ret;
- } else if (ret > 0 && ctx &&
+ } else if (ret > 0 &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
if (ins_nr > 0) {
ins_nr++;
@@ -5322,7 +5569,7 @@ next_key:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -5330,7 +5577,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
int err = 0;
int ret = 0;
bool fast_search = false;
@@ -5372,22 +5619,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* Only run delayed items if we are a directory. We want to make sure
* all directory indexes hit the fs/subvolume tree so we can find them
* and figure out which index ranges have to be logged.
- *
- * Otherwise commit the delayed inode only if the full sync flag is set,
- * as we want to make sure an up to date version is in the subvolume
- * tree so copy_inode_items_to_log() / copy_items() can find it and copy
- * it to the log tree. For a non full sync, we always log the inode item
- * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode))
- ret = btrfs_commit_inode_delayed_items(trans, inode);
- else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- ret = btrfs_commit_inode_delayed_inode(inode);
-
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ err = btrfs_commit_inode_delayed_items(trans, inode);
+ if (err)
+ goto out;
}
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
@@ -5426,9 +5662,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ ret = drop_inode_items(trans, log, path, inode, max_key_type);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5450,19 +5686,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- while(1) {
- ret = btrfs_truncate_inode_items(trans,
- log, inode, 0, 0, NULL);
- if (ret != -EAGAIN)
- break;
- }
+ if (inode_logged(trans, inode))
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
@@ -5470,8 +5703,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
@@ -5494,14 +5727,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
+ err = btrfs_log_holes(trans, inode, path);
if (err)
goto out_unlock;
}
@@ -5521,16 +5754,14 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx);
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5545,59 +5776,52 @@ log_extents:
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path,
- ctx);
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
/*
- * If we are logging that an ancestor inode exists as part of logging a
- * new name from a link or rename operation, don't mark the inode as
- * logged - otherwise if an explicit fsync is made against an ancestor,
- * the fsync considers the inode in the log and doesn't sync the log,
- * resulting in the ancestor missing after a power failure unless the
- * log was synced as part of an fsync against any other unrelated inode.
- * So keep it simple for this case and just don't flag the ancestors as
- * logged.
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
*/
- if (!ctx ||
- !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
- &inode->vfs_inode != ctx->inode)) {
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- /*
- * Don't update last_log_commit if we logged that an inode exists.
- * We do this for two reasons:
- *
- * 1) We might have had buffered writes to this inode that were
- * flushed and had their ordered extents completed in this
- * transaction, but we did not previously log the inode with
- * LOG_INODE_ALL. Later the inode was evicted and after that
- * it was loaded again and this LOG_INODE_EXISTS log operation
- * happened. We must make sure that if an explicit fsync against
- * the inode is performed later, it logs the new extents, an
- * updated inode item, etc, and syncs the log. The same logic
- * applies to direct IO writes instead of buffered writes.
- *
- * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
- * is logged with an i_size of 0 or whatever value was logged
- * before. If later the i_size of the inode is increased by a
- * truncate operation, the log is synced through an fsync of
- * some other inode and then finally an explicit fsync against
- * this inode is made, we must make sure this fsync logs the
- * inode with the new i_size, the hole between old i_size and
- * the new i_size, and syncs the log.
- */
- if (inode_only != LOG_INODE_EXISTS)
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
- }
+ if (inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
-
+out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
return err;
@@ -5697,6 +5921,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_dir_list *dir_elem;
int ret = 0;
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5773,7 +6005,7 @@ process_leaf:
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
btrfs_add_delayed_iput(di_inode);
if (ret)
@@ -5917,11 +6149,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
continue;
}
- if (ctx)
- ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
- if (!ret && ctx && ctx->log_new_dentries)
+ if (!ret && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
@@ -5967,7 +6198,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >= trans->transid &&
need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
@@ -6022,7 +6253,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation >= trans->transid &&
need_log_inode(trans, inode)) {
- ret = btrfs_log_inode(trans, root, inode,
+ ret = btrfs_log_inode(trans, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
break;
@@ -6165,7 +6396,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6182,7 +6413,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+ if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
log_dentries = true;
/*
@@ -6308,8 +6539,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to pin buffers while recovering log root tree.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6322,8 +6552,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
@@ -6340,8 +6569,7 @@ again:
log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6369,8 +6597,7 @@ again:
if (!ret)
goto next;
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read target root for tree log recovery.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6378,14 +6605,15 @@ again:
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
if (ret)
/* The loop needs to continue due to the root refs */
- btrfs_handle_fs_error(fs_info, ret,
- "failed to record the log root in transaction");
+ btrfs_abort_transaction(trans, ret);
else
ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
@@ -6402,6 +6630,8 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 731bd9c029f5..f6811c3df38a 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -17,6 +17,8 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ /* Tracks the last logged dir item/index key offset. */
+ u64 last_dir_item_offset;
struct inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
@@ -68,14 +70,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx);
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index);
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 28d443d3ef93..4968535dfff0 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -451,7 +451,7 @@ static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inod
*/
static int rollback_verity(struct btrfs_inode *inode)
{
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = inode->root;
int ret;
@@ -473,6 +473,7 @@ static int rollback_verity(struct btrfs_inode *inode)
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ trans = NULL;
btrfs_handle_fs_error(root->fs_info, ret,
"failed to start transaction in verity rollback %llu",
(u64)inode->vfs_inode.i_ino);
@@ -490,8 +491,9 @@ static int rollback_verity(struct btrfs_inode *inode)
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_end_transaction(trans);
out:
+ if (trans)
+ btrfs_end_transaction(trans);
return ret;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 464485aa7318..61ac57bcbf1a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
@@ -250,7 +251,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map);
/*
@@ -508,7 +509,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
if (flush)
- filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ sync_blockdev(*bdev);
ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
@@ -812,9 +813,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
+ struct btrfs_dev_lookup_args args = {
+ .devid = devid,
+ .uuid = disk_super->dev_item.uuid,
+ };
+
mutex_lock(&fs_devices->device_list_mutex);
- device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL);
+ device = btrfs_find_device(fs_devices, &args);
/*
* If this disk has been pulled into an fs devices created by
@@ -1091,7 +1096,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
__btrfs_free_extra_devids(seed_dev, &latest_dev);
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
mutex_unlock(&uuid_mutex);
}
@@ -1122,8 +1127,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
if (device->bdev) {
@@ -1137,6 +1144,19 @@ static void btrfs_close_one_device(struct btrfs_device *device)
atomic_set(&device->dev_stats_ccnt, 0);
extent_io_tree_release(&device->alloc_state);
+ /*
+ * Reset the flush error record. We might have a transient flush error
+ * in this mount, and if so we aborted the current transaction and set
+ * the fs to an error state, guaranteeing no super blocks can be further
+ * committed. However that error might be transient and if we unmount the
+ * filesystem and mount it again, we should allow the mount to succeed
+ * (btrfs_check_rw_degradable() should not fail) - if after mounting the
+ * filesystem again we still get flush errors, then we will again abort
+ * any transaction and set the error state, guaranteeing no commits of
+ * unsafe super blocks.
+ */
+ device->last_flush_error = 0;
+
/* Verify the device is back in a pristine state */
ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
@@ -1209,7 +1229,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
@@ -1273,7 +1293,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
pgoff_t index;
/* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
@@ -1830,8 +1850,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
@@ -1869,18 +1891,22 @@ out:
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
*/
-static void update_dev_time(struct block_device *bdev)
+static void update_dev_time(const char *device_path)
{
- struct inode *inode = bdev->bd_inode;
+ struct path path;
struct timespec64 now;
+ int ret;
- /* Shouldn't happen but just in case. */
- if (!inode)
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
return;
- now = current_time(inode);
- generic_update_time(inode, &now, S_MTIME | S_CTIME);
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
@@ -1904,7 +1930,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -1973,7 +2001,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
}
/*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
* and replace it with the provided or the next active device, in the context
* where this function called, there should be always be another device (or
* this_dev) which is active.
@@ -1992,8 +2020,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
- if (fs_info->fs_devices->latest_bdev == device->bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
+ if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+ fs_info->fs_devices->latest_dev = next_device;
}
/*
@@ -2056,11 +2084,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(bdev);
+ update_dev_time(device_path);
}
-int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid, struct block_device **bdev, fmode_t *mode)
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2068,22 +2097,23 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
-
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
-
- if (IS_ERR(device)) {
- if (PTR_ERR(device) == -ENOENT &&
- device_path && strcmp(device_path, "missing") == 0)
+ device = btrfs_find_device(fs_info->fs_devices, args);
+ if (!device) {
+ if (args->missing)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
- ret = PTR_ERR(device);
+ ret = -ENOENT;
goto out;
}
@@ -2113,11 +2143,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2146,7 +2174,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
- * its own fs_devices listed under the fs_devices->seed.
+ * its own fs_devices listed under the fs_devices->seed_list.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
@@ -2197,14 +2225,21 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
synchronize_rcu();
btrfs_free_device(device);
- if (cur_devices->open_devices == 0) {
+ /*
+ * This can happen if cur_devices is the private seed devices list. We
+ * cannot call close_fs_devices() here because it expects the uuid_mutex
+ * to be held, but in fact we don't need that for the private
+ * seed_devices, we can simply decrement cur_devices->opened and then
+ * remove it from our list and free the fs_devices.
+ */
+ if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- close_fs_devices(cur_devices);
+ ASSERT(cur_devices->opened == 1);
+ cur_devices->opened--;
free_fs_devices(cur_devices);
}
out:
- mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -2292,13 +2327,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * The update_dev_time() with in btrfs_scratch_superblocks()
- * may lead to a call to btrfs_show_devname() which will try
- * to hold device_list_mutex. And here this device
- * is already out of device list, so we don't have to hold
- * the device_list_mutex lock.
- */
btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
tgtdev->name->str);
@@ -2307,69 +2335,98 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
btrfs_free_device(tgtdev);
}
-static struct btrfs_device *btrfs_find_device_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
+/**
+ * Populate args from device at path
+ *
+ * @fs_info: the filesystem
+ * @args: the args to populate
+ * @path: the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
+ * lookup a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in,
+ * and does some basic sanity checks. The caller must make sure that @path is
+ * properly NUL terminated before calling in, and must call
+ * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+ * uuid buffers.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path)
{
- int ret = 0;
struct btrfs_super_block *disk_super;
- u64 devid;
- u8 *dev_uuid;
struct block_device *bdev;
- struct btrfs_device *device;
+ int ret;
- ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &disk_super);
- if (ret)
- return ERR_PTR(ret);
+ if (!path || !path[0])
+ return -EINVAL;
+ if (!strcmp(path, "missing")) {
+ args->missing = true;
+ return 0;
+ }
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
+ args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+ args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+ if (!args->uuid || !args->fsid) {
+ btrfs_put_dev_args_from_path(args);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+ &bdev, &disk_super);
+ if (ret)
+ return ret;
+ args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+ memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid);
+ memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
else
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid);
-
+ memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- if (!device)
- device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return device;
+ return 0;
}
/*
- * Lookup a device given by device id, or the path if the id is 0.
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will
+ * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
+ * that don't need to be freed.
*/
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+ kfree(args->uuid);
+ kfree(args->fsid);
+ args->uuid = NULL;
+ args->fsid = NULL;
+}
+
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *device;
+ int ret;
if (devid) {
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
- if (!device_path || !device_path[0])
- return ERR_PTR(-EINVAL);
-
- if (strcmp(device_path, "missing") == 0) {
- /* Find first missing device */
- list_for_each_entry(device, &fs_info->fs_devices->devices,
- dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state) && !device->bdev)
- return device;
- }
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+ if (ret)
+ return ERR_PTR(ret);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ btrfs_put_dev_args_from_path(&args);
+ if (!device)
return ERR_PTR(-ENOENT);
- }
-
- return btrfs_find_device_by_path(fs_info, device_path);
+ return device;
}
/*
@@ -2446,6 +2503,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
@@ -2455,7 +2513,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- u64 devid;
int ret;
path = btrfs_alloc_path();
@@ -2467,7 +2524,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
@@ -2492,13 +2551,14 @@ next_slot:
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
- devid = btrfs_device_id(leaf, dev_item);
+ args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2597,8 +2657,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = round_down(i_size_read(bdev->bd_inode),
- fs_info->sectorsize);
+ device->total_bytes =
+ round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -2614,6 +2674,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
+ btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+ device);
}
device->fs_devices = fs_devices;
@@ -2720,7 +2782,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_forget_devices(device_path);
/* Update ctime/mtime for blkid or udev */
- update_dev_time(bdev);
+ update_dev_time(device_path);
return ret;
@@ -2813,6 +2875,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
@@ -2841,7 +2904,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -3083,7 +3150,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -4876,8 +4943,10 @@ again:
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4960,7 +5029,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
}
/*
- * Structure used internally for __btrfs_alloc_chunk() function.
+ * Structure used internally for btrfs_create_chunk() function.
* Wraps needed parameters.
*/
struct alloc_chunk_ctl {
@@ -5364,7 +5433,7 @@ error_del_extent:
return block_group;
}
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -5565,12 +5634,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ meta_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ sys_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
@@ -5584,17 +5653,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
return btrfs_raid_array[index].tolerated_failures;
}
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- int readonly = 0;
int miss_ndevs = 0;
int i;
+ bool ret = true;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
- return 1;
+ return false;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
@@ -5605,21 +5674,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
- readonly = 1;
+ ret = false;
goto end;
}
}
/*
- * If the number of missing devices is larger than max errors,
- * we can not write the data into that chunk successfully, so
- * set it readonly.
+ * If the number of missing devices is larger than max errors, we can
+ * not write the data into that chunk successfully.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
- readonly = 1;
+ ret = false;
end:
free_extent_map(em);
- return readonly;
+ return ret;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
@@ -5782,7 +5850,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
{
int i;
int again = 1;
@@ -5791,52 +5859,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
/* Swap if parity is on a smaller index */
- if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
- swap(bbio->stripes[i], bbio->stripes[i + 1]);
- swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
+ if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+ swap(bioc->stripes[i], bioc->stripes[i + 1]);
+ swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
again = 1;
}
}
}
}
-static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ int total_stripes,
+ int real_stripes)
{
- struct btrfs_bio *bbio = kzalloc(
- /* the size of the btrfs_bio */
- sizeof(struct btrfs_bio) +
- /* plus the variable array for the stripes */
- sizeof(struct btrfs_bio_stripe) * (total_stripes) +
- /* plus the variable array for the tgt dev */
+ struct btrfs_io_context *bioc = kzalloc(
+ /* The size of btrfs_io_context */
+ sizeof(struct btrfs_io_context) +
+ /* Plus the variable array for the stripes */
+ sizeof(struct btrfs_io_stripe) * (total_stripes) +
+ /* Plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
- * plus the raid_map, which includes both the tgt dev
- * and the stripes
+ * Plus the raid_map, which includes both the tgt dev
+ * and the stripes.
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bbio->error, 0);
- refcount_set(&bbio->refs, 1);
+ atomic_set(&bioc->error, 0);
+ refcount_set(&bioc->refs, 1);
- bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
- bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+ bioc->fs_info = fs_info;
+ bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+ bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
- return bbio;
+ return bioc;
}
-void btrfs_get_bbio(struct btrfs_bio *bbio)
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
- WARN_ON(!refcount_read(&bbio->refs));
- refcount_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bioc->refs));
+ refcount_inc(&bioc->refs);
}
-void btrfs_put_bbio(struct btrfs_bio *bbio)
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
- if (!bbio)
+ if (!bioc)
return;
- if (refcount_dec_and_test(&bbio->refs))
- kfree(bbio);
+ if (refcount_dec_and_test(&bioc->refs))
+ kfree(bioc);
}
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
@@ -5846,11 +5917,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5869,8 +5940,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
int ret = 0;
int i;
- /* discard always return a bbio */
- ASSERT(bbio_ret);
+ /* Discard always returns a bioc. */
+ ASSERT(bioc_ret);
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
@@ -5933,26 +6004,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
&stripe_index);
}
- bbio = alloc_btrfs_bio(num_stripes, 0);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
+ bioc->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
+ bioc->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
+ bioc->stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -5963,19 +6033,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
+ bioc->stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
+ bioc->stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bbio->stripes[i].length = length;
+ bioc->stripes[i].length = length;
}
stripe_index++;
@@ -5985,9 +6053,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
out:
free_extent_map(em);
return ret;
@@ -6011,7 +6079,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
@@ -6020,20 +6088,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bbio, 0, 0);
+ logical, &length, &bioc, 0, 0);
if (ret) {
- ASSERT(bbio == NULL);
+ ASSERT(bioc == NULL);
return ret;
}
- num_stripes = bbio->num_stripes;
+ num_stripes = bioc->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return -EIO;
}
@@ -6043,7 +6111,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid != srcdev_devid)
+ if (bioc->stripes[i].dev->devid != srcdev_devid)
continue;
/*
@@ -6051,15 +6119,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* mirror with the lowest physical address
*/
if (found &&
- physical_of_found <= bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
ASSERT(found);
if (!found)
@@ -6090,12 +6158,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
struct btrfs_dev_replace *dev_replace,
u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
- struct btrfs_bio *bbio = *bbio_ret;
+ struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
@@ -6125,17 +6193,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
+ struct btrfs_io_stripe *new =
+ bioc->stripes + index_where_to_add;
+ struct btrfs_io_stripe *old =
+ bioc->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
+ bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
@@ -6155,30 +6223,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
}
if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_io_stripe *tgtdev_stripe =
+ bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
+ bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ bioc->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
@@ -6187,8 +6254,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
- bbio->num_tgtdevs = tgtdev_indexes;
- *bbio_ret = bbio;
+ bioc->num_tgtdevs = tgtdev_indexes;
+ *bioc_ret = bioc;
}
static bool need_full_stripe(enum btrfs_map_op op)
@@ -6291,7 +6358,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
@@ -6306,7 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
@@ -6315,7 +6382,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
- ASSERT(bbio_ret);
+ ASSERT(bioc_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, *length);
@@ -6459,20 +6526,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
- bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ bioc->stripes[i].physical = map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
stripe_index++;
}
- /* build raid_map */
+ /* Build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
@@ -6484,15 +6551,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
- bbio->raid_map[(i+rot) % num_stripes] =
+ bioc->raid_map[(i + rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
- bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bbio->raid_map[(i+rot+1) % num_stripes] =
+ bioc->raid_map[(i + rot + 1) % num_stripes] =
RAID6_Q_STRIPE;
- sort_parity_stripes(bbio, num_stripes);
+ sort_parity_stripes(bioc, num_stripes);
}
if (need_full_stripe(op))
@@ -6500,15 +6567,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
&num_stripes, &max_errors);
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
+ bioc->max_errors = max_errors;
+ bioc->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -6517,9 +6584,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
- bbio->stripes[0].dev = dev_replace->tgtdev;
- bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
- bbio->mirror_num = map->num_stripes + 1;
+ bioc->stripes[0].dev = dev_replace->tgtdev;
+ bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bioc->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing) {
@@ -6533,43 +6600,43 @@ out:
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num)
+ struct btrfs_io_context **bioc_ret, int mirror_num)
{
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ length, bioc_ret);
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
{
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
+ bio->bi_private = bioc->private;
+ bio->bi_end_io = bioc->end_io;
bio_endio(bio);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
}
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_io_context *bioc = bio->bi_private;
int is_orig_bio = 0;
if (bio->bi_status) {
- atomic_inc(&bbio->error);
+ atomic_inc(&bioc->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+ struct btrfs_device *dev = btrfs_bio(bio)->device;
ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
@@ -6584,22 +6651,22 @@ static void btrfs_end_bio(struct bio *bio)
}
}
- if (bio == bbio->orig_bio)
+ if (bio == bioc->orig_bio)
is_orig_bio = 1;
- btrfs_bio_counter_dec(bbio->fs_info);
+ btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
- bio = bbio->orig_bio;
+ bio = bioc->orig_bio;
}
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
- if (atomic_read(&bbio->error) > bbio->max_errors) {
+ if (atomic_read(&bioc->error) > bioc->max_errors) {
bio->bi_status = BLK_STS_IOERR;
} else {
/*
@@ -6609,19 +6676,19 @@ static void btrfs_end_bio(struct bio *bio)
bio->bi_status = BLK_STS_OK;
}
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
u64 physical, struct btrfs_device *dev)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
- bio->bi_private = bbio;
- btrfs_io_bio(bio)->device = dev;
+ bio->bi_private = bioc;
+ btrfs_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
@@ -6650,20 +6717,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfsic_submit_bio(bio);
}
-static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
{
- atomic_inc(&bbio->error);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
/* Should be the original bio. */
- WARN_ON(bio != bbio->orig_bio);
+ WARN_ON(bio != bioc->orig_bio);
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bbio->error) > bbio->max_errors)
+ if (atomic_read(&bioc->error) > bioc->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
}
}
@@ -6678,36 +6745,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int ret;
int dev_nr;
int total_devs;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
length = bio->bi_iter.bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bbio, mirror_num, 1);
+ &map_length, &bioc, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
- total_devs = bbio->num_stripes;
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- bbio->fs_info = fs_info;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+ total_devs = bioc->num_stripes;
+ bioc->orig_bio = first_bio;
+ bioc->private = first_bio->bi_private;
+ bioc->end_io = first_bio->bi_end_io;
+ atomic_set(&bioc->stripes_pending, bioc->num_stripes);
- if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(fs_info, bio, bbio,
- map_length);
+ ret = raid56_parity_write(bio, bioc, map_length);
} else {
- ret = raid56_parity_recover(fs_info, bio, bbio,
- map_length, mirror_num, 1);
+ ret = raid56_parity_recover(bio, bioc, map_length,
+ mirror_num, 1);
}
btrfs_bio_counter_dec(fs_info);
@@ -6722,12 +6787,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bbio->stripes[dev_nr].dev;
+ dev = bioc->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
(btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bbio_error(bbio, first_bio, logical);
+ bioc_error(bioc, first_bio, logical);
continue;
}
@@ -6736,12 +6801,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
+ submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
}
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_fs_devices *fs_devices)
+{
+ if (args->fsid == NULL)
+ return true;
+ if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+ return true;
+ return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_device *device)
+{
+ ASSERT((args->devid != (u64)-1) || args->missing);
+
+ if ((args->devid != (u64)-1) && device->devid != args->devid)
+ return false;
+ if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+ return false;
+ if (!args->missing)
+ return true;
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+}
+
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
@@ -6749,31 +6841,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*/
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid)
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
- if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ if (dev_args_match_fs_devices(args, fs_devices)) {
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
+ if (dev_args_match_device(args, device))
return device;
}
}
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- if (!fsid ||
- !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &seed_devs->devices,
- dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
- return device;
- }
+ if (!dev_args_match_fs_devices(args, seed_devs))
+ continue;
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
}
}
@@ -6939,6 +7025,7 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
@@ -7016,11 +7103,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ args.devid = devid;
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL);
+ args.uuid = uuid;
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -7138,6 +7226,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
@@ -7146,11 +7235,13 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = btrfs_device_id(leaf, dev_item);
+ devid = args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
@@ -7158,8 +7249,7 @@ static int read_one_dev(struct extent_buffer *leaf,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -7223,7 +7313,7 @@ static int read_one_dev(struct extent_buffer *leaf,
fill_device_from_item(leaf, dev_item, device);
if (device->bdev) {
- u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+ u64 max_total_bytes = bdev_nr_bytes(device->bdev);
if (device->total_bytes > max_total_bytes) {
btrfs_err(fs_info,
@@ -7828,12 +7918,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
+ args.devid = stats->devid;
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7909,6 +8001,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -7964,7 +8057,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device boundary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2183361db614..3b8130680749 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -236,17 +236,40 @@ struct btrfs_fs_devices {
bool fsid_change;
struct list_head fs_list;
+ /*
+ * Number of devices under this fsid including missing and
+ * replace-target device and excludes seed devices.
+ */
u64 num_devices;
+
+ /*
+ * The number of devices that successfully opened, including
+ * replace-target, excludes seed devices.
+ */
u64 open_devices;
+
+ /* The number of devices that are under the chunk allocation list. */
u64 rw_devices;
+
+ /* Count of missing devices under this fsid excluding seed device. */
u64 missing_devices;
u64 total_rw_bytes;
+
+ /*
+ * Count of devices from btrfs_super_block::num_devices for this fsid,
+ * which includes the seed device, excludes the transient replace-target
+ * device.
+ */
u64 total_devices;
/* Highest generation number of seen devices */
u64 latest_generation;
- struct block_device *latest_bdev;
+ /*
+ * The mount device or a device with highest generation after removal
+ * or replace.
+ */
+ struct btrfs_device *latest_dev;
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
@@ -300,48 +323,62 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
- * we need the mirror number and stripe index to be passed around
- * the call chain while we are processing end_io (especially errors).
- * Really, what we need is a btrfs_bio structure that has this info
- * and is properly sized with its stripe array, but we're not there
- * quite yet. We have our own btrfs bioset, and all of the bios
- * we allocate are actually btrfs_io_bios. We'll cram as much of
- * struct btrfs_bio as we can into this over time.
+ * Additional info to pass along bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
*/
-struct btrfs_io_bio {
+struct btrfs_bio {
unsigned int mirror_num;
+
+ /* @device is for stripe IO submission. */
struct btrfs_device *device;
- u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_io_bio but relies on bio being last.
+ * bytes for entire btrfs_bio but relies on bio being last.
*/
struct bio bio;
};
-static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
{
- return container_of(bio, struct btrfs_io_bio, bio);
+ return container_of(bio, struct btrfs_bio, bio);
}
-static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
- if (io_bio->csum != io_bio->csum_inline) {
- kfree(io_bio->csum);
- io_bio->csum = NULL;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
}
}
-struct btrfs_bio_stripe {
+struct btrfs_io_stripe {
struct btrfs_device *dev;
u64 physical;
u64 length; /* only used for discard mappings */
};
-struct btrfs_bio {
+/*
+ * Context for IO subsmission for device stripe.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ * Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ * Used by submit_stripe_bio() for mapping logical bio
+ * into physical device address.
+ *
+ * - Contain device replace info
+ * Used by handle_ops_on_dev_replace() to copy logical bios
+ * into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
@@ -361,7 +398,7 @@ struct btrfs_bio {
* so raid_map[0] is the start of our full stripe
*/
u64 *raid_map;
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
struct btrfs_device_info {
@@ -396,11 +433,11 @@ struct map_lookup {
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+ (sizeof(struct btrfs_io_stripe) * (n)))
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -414,6 +451,22 @@ struct btrfs_balance_control {
struct btrfs_balance_progress stat;
};
+/*
+ * Search for a given device by the set parameters
+ */
+struct btrfs_dev_lookup_args {
+ u64 devid;
+ u8 *uuid;
+ u8 *fsid;
+ bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+ struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
enum btrfs_map_op {
BTRFS_MAP_READ,
BTRFS_MAP_WRITE,
@@ -437,20 +490,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
}
-void btrfs_get_bbio(struct btrfs_bio *bbio);
-void btrfs_put_bbio(struct btrfs_bio *bbio);
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num);
+ struct btrfs_io_context **bioc_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret);
+ struct btrfs_io_context **bioc_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
@@ -467,19 +520,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
u64 devid,
const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
- const char *device_path, u64 devid,
+ struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -493,7 +550,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 8a4514283a4b..2837b4c8424d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
* matches our target xattr, so lets check.
*/
ret = 0;
- btrfs_assert_tree_locked(path->nodes[0]);
+ btrfs_assert_tree_write_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8afa90074891..767a0c6c9694 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (in_page)
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
return ret;
}
@@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
+ kunmap(pages_in[page_in_index]);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
-
+ kunmap(pages_in[page_in_index]);
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
done:
zlib_inflateEnd(&workspace->strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
if (!ret)
zero_fill_bio(cb->orig_bio);
return ret;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 47af1ab3bf12..67d932d70798 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -4,6 +4,7 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
+#include <linux/atomic.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
@@ -39,12 +40,30 @@
#define BTRFS_NR_SB_LOG_ZONES 2
/*
+ * Minimum of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
+
+/*
* Maximum supported zone size. Currently, SMR disks have a zone size of
* 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
* expect the zone size to become larger than 8GiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+ return (zone->cond == BLK_ZONE_COND_FULL) ||
+ (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
struct blk_zone *zones = data;
@@ -60,14 +79,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
+ int i;
- ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
- zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
-
- empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
- empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
- full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
- full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+ full[i] = sb_zone_is_full(&zones[i]);
+ }
/*
* Possible states of log buffer zones
@@ -296,6 +314,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
+ struct request_queue *queue = bdev_get_queue(bdev);
+ unsigned int max_active_zones;
+ unsigned int nactive;
sector_t nr_sectors;
sector_t sector = 0;
struct blk_zone *zones = NULL;
@@ -351,6 +372,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
+ max_active_zones = queue_max_active_zones(queue);
+ if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+ btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+ rcu_str_deref(device->name), max_active_zones,
+ BTRFS_MIN_ACTIVE_ZONES);
+ ret = -EINVAL;
+ goto out;
+ }
+ zone_info->max_active_zones = max_active_zones;
+
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
@@ -363,6 +395,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
@@ -370,6 +408,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
}
/* Get zones type */
+ nactive = 0;
while (sector < nr_sectors) {
nr_zones = BTRFS_REPORT_NR_ZONES;
ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
@@ -380,8 +419,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
for (i = 0; i < nr_zones; i++) {
if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
__set_bit(nreported, zone_info->seq_zones);
- if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+ switch (zones[i].cond) {
+ case BLK_ZONE_COND_EMPTY:
__set_bit(nreported, zone_info->empty_zones);
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ __set_bit(nreported, zone_info->active_zones);
+ nactive++;
+ break;
+ }
nreported++;
}
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
@@ -396,6 +444,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ if (max_active_zones) {
+ if (nactive > max_active_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_str_deref(device->name),
+ max_active_zones);
+ ret = -EIO;
+ goto out;
+ }
+ atomic_set(&zone_info->active_zones_left,
+ max_active_zones - nactive);
+ }
+
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -478,6 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
out:
kfree(zones);
out_free_zone_info:
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->empty_zones);
bitmap_free(zone_info->seq_zones);
kfree(zone_info);
@@ -493,6 +555,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
if (!zone_info)
return;
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->seq_zones);
bitmap_free(zone_info->empty_zones);
kfree(zone_info);
@@ -585,7 +648,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
/*
* stripe_size is always aligned to BTRFS_STRIPE_LEN in
- * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+ * btrfs_create_chunk(). Since we want stripe_len == zone_size,
* check the alignment here.
*/
if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
@@ -664,7 +727,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
- ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+ ASSERT(sb_zone_is_full(reset));
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
reset->start, reset->len,
@@ -676,9 +739,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset->wp = reset->start;
}
} else if (ret != -ENOENT) {
- /* For READ, we want the precious one */
+ /*
+ * For READ, we want the previous one. Move write pointer to
+ * the end of a zone, if it is at the head of a zone.
+ */
+ u64 zone_end = 0;
+
if (wp == zones[0].start << SECTOR_SHIFT)
- wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+ zone_end = zones[1].start + zones[1].capacity;
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ zone_end = zones[0].start + zones[0].capacity;
+ if (zone_end)
+ wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+ BTRFS_SUPER_INFO_SIZE);
+
wp -= BTRFS_SUPER_INFO_SIZE;
}
@@ -771,36 +845,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
return true;
}
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
struct btrfs_zoned_device_info *zinfo = device->zone_info;
struct blk_zone *zone;
+ int i;
if (!is_sb_log_zone(zinfo, mirror))
- return;
+ return 0;
zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
- if (zone->cond != BLK_ZONE_COND_FULL) {
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ /* Advance the next zone */
+ if (zone->cond == BLK_ZONE_COND_FULL) {
+ zone++;
+ continue;
+ }
+
if (zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+ zone->wp += SUPER_INFO_SECTORS;
+
+ if (sb_zone_is_full(zone)) {
+ /*
+ * No room left to write new superblock. Since
+ * superblock is written with REQ_SYNC, it is safe to
+ * finish the zone now.
+ *
+ * If the write pointer is exactly at the capacity,
+ * explicit ZONE_FINISH is not necessary.
+ */
+ if (zone->wp != zone->start + zone->capacity) {
+ int ret;
+
+ ret = blkdev_zone_mgmt(device->bdev,
+ REQ_OP_ZONE_FINISH, zone->start,
+ zone->len, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
- if (zone->wp == zone->start + zone->len)
+ zone->wp = zone->start + zone->len;
zone->cond = BLK_ZONE_COND_FULL;
-
- return;
+ }
+ return 0;
}
- zone++;
- ASSERT(zone->cond != BLK_ZONE_COND_FULL);
- if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->cond = BLK_ZONE_COND_IMP_OPEN;
-
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
-
- if (zone->wp == zone->start + zone->len)
- zone->cond = BLK_ZONE_COND_FULL;
+ /* All the zones are FULL. Should not reach here. */
+ ASSERT(0);
+ return -EIO;
}
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
@@ -895,6 +989,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
return pos;
}
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return true;
+
+ if (!test_bit(zno, zone_info->active_zones)) {
+ /* Active zone left? */
+ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+ return false;
+ if (test_and_set_bit(zno, zone_info->active_zones)) {
+ /* Someone already set the bit */
+ atomic_inc(&zone_info->active_zones_left);
+ }
+ }
+
+ return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return;
+
+ if (test_and_clear_bit(zno, zone_info->active_zones))
+ atomic_inc(&zone_info->active_zones_left);
+}
+
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
@@ -910,6 +1039,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
*bytes = length;
while (length) {
btrfs_dev_set_zone_empty(device, physical);
+ btrfs_dev_clear_active_zone(device, physical);
physical += device->zone_info->zone_size;
length -= device->zone_info->zone_size;
}
@@ -1039,6 +1169,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
int i;
unsigned int nofs_flag;
u64 *alloc_offsets = NULL;
+ u64 *caps = NULL;
+ unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1063,10 +1195,28 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
map = em->map_lookup;
+ cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ if (!cache->physical_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
if (!alloc_offsets) {
- free_extent_map(em);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+ if (!caps) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+ if (!active) {
+ ret = -ENOMEM;
+ goto out;
}
for (i = 0; i < map->num_stripes; i++) {
@@ -1131,6 +1281,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ caps[i] = (zone.capacity << SECTOR_SHIFT);
+
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
@@ -1144,14 +1296,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
alloc_offsets[i] = 0;
break;
case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = fs_info->zone_size;
+ alloc_offsets[i] = caps[i];
break;
default:
/* Partially used zone */
alloc_offsets[i] =
((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(i, active);
break;
}
+
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
}
if (num_sequential > 0)
@@ -1169,6 +1329,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* calculate_alloc_pointer() which takes extent buffer
* locks to avoid deadlock.
*/
+
+ /* Zone capacity is always zone size in emulation */
+ cache->zone_capacity = cache->length;
if (new) {
cache->alloc_offset = 0;
goto out;
@@ -1195,6 +1358,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = caps[0];
+ cache->zone_is_active = test_bit(0, active);
break;
case BTRFS_BLOCK_GROUP_DUP:
case BTRFS_BLOCK_GROUP_RAID1:
@@ -1210,6 +1375,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (cache->zone_is_active) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
@@ -1218,6 +1390,14 @@ out:
ret = -EIO;
}
+ if (cache->alloc_offset > cache->zone_capacity) {
+ btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+ cache->alloc_offset, cache->zone_capacity,
+ cache->start);
+ ret = -EIO;
+ }
+
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
btrfs_err(fs_info,
@@ -1229,6 +1409,12 @@ out:
if (!ret)
cache->meta_write_pointer = cache->alloc_offset + cache->start;
+ if (ret) {
+ kfree(cache->physical_map);
+ cache->physical_map = NULL;
+ }
+ bitmap_free(active);
+ kfree(caps);
kfree(alloc_offsets);
free_extent_map(em);
@@ -1243,17 +1429,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
return;
WARN_ON(cache->bytes_super != 0);
- unusable = cache->alloc_offset - cache->used;
- free = cache->length - cache->alloc_offset;
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
-
- /* Should not have any excluded extents. Just in case, though */
- btrfs_free_excluded_extents(cache);
}
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
@@ -1304,6 +1488,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!is_data_inode(&inode->vfs_inode))
return false;
+ /*
+ * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * extent layout the relocation code has.
+ * Furthermore we have set aside own block-group from which only the
+ * relocation "process" can allocate and make sure only one process at a
+ * time can add pages to an extent that gets relocated, so it's safe to
+ * use regular REQ_OP_WRITE for this special case.
+ */
+ if (btrfs_is_data_reloc_root(inode->root))
+ return false;
+
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
@@ -1440,27 +1635,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
struct blk_zone *zone)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 mapped_length = PAGE_SIZE;
unsigned int nofs_flag;
int nmirrors;
int i, ret;
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < PAGE_SIZE) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ btrfs_put_bioc(bioc);
return -EIO;
}
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
return -EINVAL;
nofs_flag = memalloc_nofs_save();
- nmirrors = (int)bbio->num_stripes;
+ nmirrors = (int)bioc->num_stripes;
for (i = 0; i < nmirrors; i++) {
- u64 physical = bbio->stripes[i].physical;
- struct btrfs_device *dev = bbio->stripes[i].dev;
+ u64 physical = bioc->stripes[i].physical;
+ struct btrfs_device *dev = bioc->stripes[i].dev;
/* Missing device */
if (!dev->bdev)
@@ -1530,3 +1725,251 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
return device;
}
+
+/**
+ * Activate block group and underlying device zones
+ *
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ bool ret;
+
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return true;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return true;
+
+ spin_lock(&block_group->lock);
+
+ if (block_group->zone_is_active) {
+ ret = true;
+ goto out_unlock;
+ }
+
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
+
+ spin_unlock(&block_group->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(list_empty(&block_group->active_bg_list));
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ return true;
+
+out_unlock:
+ spin_unlock(&block_group->lock);
+ return ret;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return 0;
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ /* Check if we have unwritten allocated space */
+ if ((block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+ block_group->alloc_offset > block_group->meta_write_pointer) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
+
+ block_group->zone_is_active = 0;
+ block_group->alloc_offset = block_group->zone_capacity;
+ block_group->free_space_ctl->free_space = 0;
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
+ btrfs_dec_block_group_ro(block_group);
+
+ if (!ret) {
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+ }
+
+ return ret;
+}
+
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
+{
+ struct btrfs_device *device;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_devices->fs_info))
+ return true;
+
+ /* Non-single profiles are not supported yet */
+ if (raid_index != BTRFS_RAID_SINGLE)
+ return false;
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zinfo->max_active_zones ||
+ atomic_read(&zinfo->active_zones_left)) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+ struct btrfs_block_group *block_group;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ ASSERT(block_group);
+
+ if (logical + length < block_group->start + block_group->zone_capacity)
+ goto out;
+
+ spin_lock(&block_group->lock);
+
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ goto out;
+ }
+
+ block_group->zone_is_active = 0;
+ /* We should have consumed all the free space */
+ ASSERT(block_group->alloc_offset == block_group->zone_capacity);
+ ASSERT(block_group->free_space_ctl->free_space == 0);
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ map = block_group->physical_map;
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (!device->zone_info->max_active_zones)
+ goto out;
+
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ btrfs_put_block_group(block_group);
+
+out:
+ btrfs_put_block_group(block_group);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg == bg->start)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 4b299705bb12..e53ab7b96437 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -23,8 +23,11 @@ struct btrfs_zoned_device_info {
u64 zone_size;
u8 zone_size_shift;
u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
unsigned long *seq_zones;
unsigned long *empty_zones;
+ unsigned long *active_zones;
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};
@@ -40,7 +43,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
u64 *bytenr_ret);
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
u64 hole_end, u64 num_bytes);
@@ -66,6 +69,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -113,8 +123,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
return 0;
}
-static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
-{ }
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ return 0;
+}
static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
@@ -199,6 +211,27 @@ static inline struct btrfs_device *btrfs_zoned_get_device(
return ERR_PTR(-EOPNOTSUPP);
}
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index)
+{
+ return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 56dce9f00988..f06b68040352 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
@@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
+ kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
out:
*out_pages = nr_pages;
/* Cleanup */
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
+ if (out_page)
+ kunmap(out_page);
return ret;
}
@@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- page_in_index++;
+ kunmap(pages_in[page_in_index++]);
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
zero_fill_bio(cb->orig_bio);
done:
+ if (workspace->in_buf.src)
+ kunmap(pages_in[page_in_index]);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index ab7573d72dd7..46bc589b7a03 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
+ loff_t sz = bdev_nr_bytes(bdev);
if (sz) {
unsigned int sizebits = blksize_bits(size);
@@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
- sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
+ sector_t end_block = blkdev_max_block(bdev, size);
do {
if (!buffer_mapped(bh)) {
@@ -1425,12 +1425,16 @@ void invalidate_bh_lrus(void)
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
-void invalidate_bh_lrus_cpu(int cpu)
+/*
+ * It's called from workqueue context so we need a bh_lru_lock to close
+ * the race with preemption/irq.
+ */
+void invalidate_bh_lrus_cpu(void)
{
struct bh_lru *b;
bh_lru_lock();
- b = per_cpu_ptr(&bh_lrus, cpu);
+ b = this_cpu_ptr(&bh_lrus);
__invalidate_bh_lrus(b);
bh_lru_unlock();
}
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index fac2e8e7b533..effe37ef8629 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
/*
* Handle completion of a read from the cache.
*/
-static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_read_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
- _enter("%ld,%ld", ret, ret2);
+ _enter("%ld", ret);
if (ki->term_func) {
if (ret >= 0)
@@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_read_complete(&ki->iocb, ret, 0);
+ cachefiles_read_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
@@ -159,12 +159,12 @@ presubmission_error:
/*
* Handle completion of a write to the cache.
*/
-static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_write_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
struct inode *inode = file_inode(ki->iocb.ki_filp);
- _enter("%ld,%ld", ret, ret2);
+ _enter("%ld", ret);
/* Tell lockdep we inherited freeze protection from submission thread */
__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
@@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_write_complete(&ki->iocb, ret, 0);
+ cachefiles_write_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8ffc40e84a59..fcf4f3b72923 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
struct cachefiles_object *object;
struct fscache_retrieval *op = monitor->op;
struct wait_page_key *key = _key;
- struct page *page = wait->private;
+ struct folio *folio = wait->private;
ASSERT(key);
_enter("{%lu},%u,%d,{%p,%u}",
monitor->netfs_page->index, mode, sync,
- key->page, key->bit_nr);
+ key->folio, key->bit_nr);
- if (key->page != page || key->bit_nr != PG_locked)
+ if (key->folio != folio || key->bit_nr != PG_locked)
return 0;
- _debug("--- monitor %p %lx ---", page, page->flags);
+ _debug("--- monitor %p %lx ---", folio, folio->flags);
- if (!PageUptodate(page) && !PageError(page)) {
+ if (!folio_test_uptodate(folio) && !folio_test_error(folio)) {
/* unlocked, not uptodate and not erronous? */
_debug("page probably truncated");
}
@@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
put_page(backpage2);
INIT_LIST_HEAD(&monitor->op_link);
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
if (trylock_page(backpage)) {
ret = -EIO;
@@ -294,7 +294,7 @@ monitor_backing_page:
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was installed, so
@@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6c0e52fd0743..8f537f1d9d1d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2263,7 +2263,7 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_dirops,
r_unsafe_dir_item) {
s = req->r_session;
- if (unlikely(s->s_mds > max)) {
+ if (unlikely(s->s_mds >= max)) {
spin_unlock(&ci->i_unsafe_lock);
goto retry;
}
@@ -2277,7 +2277,7 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_iops,
r_unsafe_target_item) {
s = req->r_session;
- if (unlikely(s->s_mds > max)) {
+ if (unlikely(s->s_mds >= max)) {
spin_unlock(&ci->i_unsafe_lock);
goto retry;
}
@@ -2330,7 +2330,6 @@ retry:
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 flush_tid;
@@ -2365,14 +2364,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
- if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
- spin_lock(&file->f_lock);
- err = errseq_check_and_advance(&ci->i_meta_err,
- &fi->meta_err);
- spin_unlock(&file->f_lock);
- if (err < 0)
- ret = err;
- }
+ err = file_check_and_advance_wb_err(file);
+ if (err < 0)
+ ret = err;
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d16fd2d5fd42..b129ea551378 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -233,7 +233,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->meta_err = errseq_sample(&ci->i_meta_err);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
return 0;
@@ -1023,7 +1022,7 @@ static void ceph_aio_complete(struct inode *inode,
ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD));
- aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+ aio_req->iocb->ki_complete(aio_req->iocb, ret);
ceph_free_cap_flush(aio_req->prealloc_cf);
kfree(aio_req);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2df1e1284451..1c7574105478 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -541,8 +541,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ceph_fscache_inode_init(ci);
- ci->i_meta_err = 0;
-
return &ci->vfs_inode;
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index bdeb271f47d9..d8c31069fbf2 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /* No mandatory locks */
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
dout("ceph_flock, fl_file: %p\n", fl->fl_file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7cad180d6deb..d64413adc0fd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1493,7 +1493,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
{
struct ceph_mds_request *req;
struct rb_node *p;
- struct ceph_inode_info *ci;
dout("cleanup_session_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
@@ -1502,16 +1501,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request, r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid);
- if (req->r_target_inode) {
- /* dropping unsafe change of inode's attributes */
- ci = ceph_inode(req->r_target_inode);
- errseq_set(&ci->i_meta_err, -EIO);
- }
- if (req->r_unsafe_dir) {
- /* dropping unsafe directory operation */
- ci = ceph_inode(req->r_unsafe_dir);
- errseq_set(&ci->i_meta_err, -EIO);
- }
+ if (req->r_target_inode)
+ mapping_set_error(req->r_target_inode->i_mapping, -EIO);
+ if (req->r_unsafe_dir)
+ mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
__unregister_request(mdsc, req);
}
/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1678,7 +1671,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_unlock(&mdsc->cap_dirty_lock);
if (dirty_dropped) {
- errseq_set(&ci->i_meta_err, -EIO);
+ mapping_set_error(inode->i_mapping, -EIO);
if (ci->i_wrbuffer_ref_head == 0 &&
ci->i_wr_ref == 0 &&
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9b1b7f4cfdd4..fd8742bae847 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1002,16 +1002,16 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *other = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
dout("ceph_compare_super %p\n", sb);
- if (compare_mount_options(fsopt, opt, other)) {
+ if (compare_mount_options(fsopt, opt, fsc)) {
dout("monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
- ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+ ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
dout("fsid doesn't match\n");
return 0;
}
@@ -1019,6 +1019,17 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
dout("flags differ\n");
return 0;
}
+
+ if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
+ dout("client is blocklisted (and CLEANRECOVER is not set)\n");
+ return 0;
+ }
+
+ if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ dout("client has been forcibly unmounted\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a40eb14c282a..14f951cd5b61 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -429,8 +429,6 @@ struct ceph_inode_info {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
#endif
- errseq_t i_meta_err;
-
struct inode vfs_inode; /* at end */
};
@@ -774,7 +772,6 @@ struct ceph_file_info {
spinlock_t rw_contexts_lock;
struct list_head rw_contexts;
- errseq_t meta_err;
u32 filp_gen;
atomic_t num_locks;
};
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 159a1ffa4f4b..fcf7dfdecf96 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1311,7 +1311,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
int err;
err = security_dentry_init_security(dentry, mode, &dentry->d_name,
- &as_ctx->sec_ctx,
+ &name, &as_ctx->sec_ctx,
&as_ctx->sec_ctxlen);
if (err < 0) {
WARN_ON_ONCE(err != -EOPNOTSUPP);
@@ -1335,7 +1335,6 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
* It only supports single security module and only selinux has
* dentry_init_security hook.
*/
- name = XATTR_NAME_SELINUX;
name_len = strlen(name);
err = ceph_pagelist_reserve(pagelist,
4 * 2 + name_len + as_ctx->sec_ctxlen);
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 8a3b30ec860c..8be57aaedab6 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/cache.c - CIFS filesystem cache index structure definitions
+ * CIFS filesystem cache index structure definitions
*
* Copyright (c) 2010 Novell, Inc.
* Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 51a824fc926a..de2c12bcfa4b 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * fs/cifs_debug.c
*
* Copyright (C) International Business Machines Corp., 2000,2005
*
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4fd788586399..f97407520ea1 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifs_fs_sb.h
*
* Copyright (c) International Business Machines Corp., 2002,2004
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index ef723be358af..b87cbbe6d2d4 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifs_ioctl.h
*
* Structure definitions for io control for cifs/smb3
*
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8fa26a8530f8..353bd0dd7026 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS
+ * SPNEGO upcall management for CIFS
*
* Copyright (c) 2007 Red Hat, Inc.
* Author(s): Jeff Layton (jlayton@redhat.com)
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 31387d0ea32e..e6a0451877d4 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS
+ * SPNEGO upcall management for CIFS
*
* Copyright (c) 2007 Red Hat, Inc.
* Author(s): Jeff Layton (jlayton@redhat.com)
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 171ad8b42107..e7582dd79179 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * fs/cifs/cifs_unicode.c
*
* Copyright (c) International Business Machines Corp., 2000,2009
* Modified by Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 388eb536cff1..ee3aab3dd4ac 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/cifsacl.c
*
* Copyright (C) International Business Machines Corp., 2007,2008
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index f8292bcf8594..ccbfc754bd3c 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifsacl.h
*
* Copyright (c) International Business Machines Corp., 2007
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 2e6f40344037..d118282071b3 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/cifsencrypt.c
*
* Encryption and hashing operations relating to NTLM, NTLMv2. See MS-NLMP
* for more detailed information
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c20bfa187ac..dca42aa87d30 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/cifsfs.c
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
@@ -39,7 +38,6 @@
#include <linux/key-type.h>
#include "cifs_spnego.h"
#include "fscache.h"
-#include "smb2pdu.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d25a4099b32e..b50da1901ebd 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifsfs.h
*
* Copyright (c) International Business Machines Corp., 2002, 2007
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c068f7d8d879..abff31dcd005 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifsglob.h
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
@@ -21,6 +20,7 @@
#include <crypto/internal/hash.h>
#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
+#include "../smbfs_common/smb2pdu.h"
#include "smb2pdu.h"
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
@@ -777,7 +777,7 @@ revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
static inline void
revert_current_mid_from_hdr(struct TCP_Server_Info *server,
- const struct smb2_sync_hdr *shdr)
+ const struct smb2_hdr *shdr)
{
unsigned int num = le16_to_cpu(shdr->CreditCharge);
@@ -1400,6 +1400,7 @@ struct cifsInodeInfo {
#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */
+#define CIFS_INO_CLOSE_ON_LOCK (7) /* Not to defer the close when lock is set */
unsigned long flags;
spinlock_t writers_lock;
unsigned int writers; /* Number of writers on this inode */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 98e8e5aa0613..d2ff438fd31f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifspdu.h
*
* Copyright (c) International Business Machines Corp., 2002,2009
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f9740c21ca3d..d0f85b666662 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/cifsproto.h
*
* Copyright (c) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
@@ -268,6 +267,9 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode);
extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
+extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
+ const char *path);
+
extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx);
extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
int from_reconnect);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a8e41c1e80ca..243d17696f06 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/cifssmb.c
*
* Copyright (C) International Business Machines Corp., 2002,2010
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0db344807ef1..0abbff4e4135 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/connect.c
*
* Copyright (C) International Business Machines Corp., 2002,2011
* Author(s): Steve French (sfrench@us.ibm.com)
@@ -678,7 +677,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
static unsigned int
smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
/*
* SMB1 does not use credits.
@@ -795,7 +794,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
*/
}
- kfree(server->hostname);
kfree(server);
length = atomic_dec_return(&tcpSesAllocCount);
@@ -879,7 +877,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
static void
smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
int scredits, in_flight;
/*
@@ -1090,7 +1088,7 @@ next_pdu:
module_put_and_exit(0);
}
-/**
+/*
* Returns true if srcaddr isn't specified and rhs isn't specified, or
* if srcaddr is specified and matches the IP address of the rhs argument
*/
@@ -1236,6 +1234,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
return 0;
+ if (strcasecmp(server->hostname, ctx->server_hostname))
+ return 0;
+
if (!match_address(server, addr,
(struct sockaddr *)&ctx->srcaddr))
return 0;
@@ -1337,6 +1338,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
kfree(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
+ kfree(server->hostname);
task = xchg(&server->tsk, NULL);
if (task)
@@ -1362,14 +1364,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
goto out_err;
}
+ tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL);
+ if (!tcp_ses->hostname) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+
tcp_ses->ops = ctx->ops;
tcp_ses->vals = ctx->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
- tcp_ses->hostname = extract_hostname(ctx->UNC);
- if (IS_ERR(tcp_ses->hostname)) {
- rc = PTR_ERR(tcp_ses->hostname);
- goto out_err_crypto_release;
- }
tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
tcp_ses->noblockcnt = ctx->rootfs;
@@ -1498,8 +1501,7 @@ out_err_crypto_release:
out_err:
if (tcp_ses) {
- if (!IS_ERR(tcp_ses->hostname))
- kfree(tcp_ses->hostname);
+ kfree(tcp_ses->hostname);
if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
kfree(tcp_ses);
@@ -1550,6 +1552,9 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
/**
* cifs_setup_ipc - helper to setup the IPC tcon for the session
+ * @ses: smb session to issue the request on
+ * @ctx: the superblock configuration context to use for building the
+ * new tree connection for the IPC (interprocess communication RPC)
*
* A new IPC connection is made and stored in the session
* tcon_ipc. The IPC tcon has the same lifetime as the session.
@@ -1605,6 +1610,7 @@ out:
/**
* cifs_free_ipc - helper to release the session IPC tcon
+ * @ses: smb session to unmount the IPC from
*
* Needs to be called everytime a session is destroyed.
*
@@ -1855,6 +1861,8 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx __attribute__((unused)),
/**
* cifs_get_smb_ses - get a session matching @ctx data from @server
+ * @server: server to setup the session to
+ * @ctx: superblock configuration context to use to setup the session
*
* This function assumes it is being called from cifs_mount() where we
* already got a server reference (server refcount +1). See
@@ -2065,6 +2073,8 @@ cifs_put_tcon(struct cifs_tcon *tcon)
/**
* cifs_get_tcon - get a tcon matching @ctx data from @ses
+ * @ses: smb session to issue the request on
+ * @ctx: the superblock configuration context to use for building the
*
* - tcon refcount is the number of mount points using the tcon.
* - ses refcount is the number of tcon using the session.
@@ -2382,9 +2392,10 @@ cifs_match_super(struct super_block *sb, void *data)
spin_lock(&cifs_tcp_ses_lock);
cifs_sb = CIFS_SB(sb);
tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
- if (IS_ERR(tlink)) {
+ if (tlink == NULL) {
+ /* can not match superblock if tlink were ever null */
spin_unlock(&cifs_tcp_ses_lock);
- return rc;
+ return 0;
}
tcon = tlink_tcon(tlink);
ses = tcon->ses;
@@ -2638,11 +2649,12 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = 0;
if (rc < 0) {
cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+ trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
sock_release(socket);
server->ssocket = NULL;
return rc;
}
-
+ trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
if (sport == htons(RFC1001_PORT))
rc = ip_rfc1001_connect(server);
@@ -3030,7 +3042,7 @@ build_unc_path_to_root(const struct smb3_fs_context *ctx,
return full_path;
}
-/**
+/*
* expand_dfs_referral - Perform a dfs referral query and update the cifs_sb
*
* If a referral is found, cifs_sb->ctx->mount_options will be (re-)allocated
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5f8a302ffcb2..6e8e7cc26ae2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/dir.c
*
* vfs operations that deal with dentries
*
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 8c616aaeb7c4..0458d28d71aa 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/dns_resolve.c
*
* Copyright (c) 2007 Igor Mammedov
* Author(s): Igor Mammedov (niallain@gmail.com)
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 9fa2807ef79e..afc0df381246 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS
- * Handles host name to IP address resolution
+ * DNS Resolver upcall management for CIFS DFS
+ * Handles host name to IP address resolution
*
* Copyright (c) International Business Machines Corp., 2008
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 747a540db954..37c28415df1e 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/export.c
*
* Copyright (C) International Business Machines Corp., 2007
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d0216472f1c6..1b855fcb179e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/file.c
*
* vfs operations that deal with files
*
@@ -883,8 +882,9 @@ int cifs_close(struct inode *inode, struct file *file)
dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
cinode->lease_granted &&
+ !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
- if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
+ if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
inode->i_ctime = inode->i_mtime = current_time(inode);
cifs_fscache_update_inode_cookie(inode);
}
@@ -1865,6 +1865,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
tcon->ses->server);
cifs_sb = CIFS_FILE_SB(file);
+ set_bit(CIFS_INO_CLOSE_ON_LOCK, &CIFS_I(d_inode(cfile->dentry))->flags);
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
@@ -3112,7 +3113,7 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
struct dentry *dentry = ctx->cfile->dentry;
- int rc;
+ ssize_t rc;
tcon = tlink_tcon(ctx->cfile->tlink);
cifs_sb = CIFS_SB(dentry->d_sb);
@@ -3183,7 +3184,7 @@ restart_loop:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
@@ -3916,7 +3917,7 @@ again:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 3109def8e199..38d96a480745 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -116,6 +116,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("nosharesock", Opt_nosharesock),
fsparam_flag_no("persistenthandles", Opt_persistent),
fsparam_flag_no("resilienthandles", Opt_resilient),
+ fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay),
fsparam_flag("domainauto", Opt_domainauto),
fsparam_flag("rdma", Opt_rdma),
fsparam_flag("modesid", Opt_modesid),
@@ -318,6 +319,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(mount_options);
DUP_CTX_STR(username);
DUP_CTX_STR(password);
+ DUP_CTX_STR(server_hostname);
DUP_CTX_STR(UNC);
DUP_CTX_STR(source);
DUP_CTX_STR(domainname);
@@ -456,6 +458,11 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
if (!pos)
return -EINVAL;
+ /* record the server hostname */
+ ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL);
+ if (!ctx->server_hostname)
+ return -ENOMEM;
+
/* skip past delimiter */
++pos;
@@ -1383,6 +1390,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
}
break;
+ case Opt_tcp_nodelay:
+ /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */
+ if (result.negated)
+ ctx->sockopt_tcp_nodelay = false;
+ else
+ ctx->sockopt_tcp_nodelay = true;
+ break;
case Opt_domainauto:
ctx->domainauto = true;
break;
@@ -1496,6 +1510,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->username = NULL;
kfree_sensitive(ctx->password);
ctx->password = NULL;
+ kfree(ctx->server_hostname);
+ ctx->server_hostname = NULL;
kfree(ctx->UNC);
ctx->UNC = NULL;
kfree(ctx->source);
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index a42ba71d7a81..b2d22cf9cb18 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -98,6 +98,7 @@ enum cifs_param {
Opt_nosharesock,
Opt_persistent,
Opt_resilient,
+ Opt_tcp_nodelay,
Opt_domainauto,
Opt_rdma,
Opt_modesid,
@@ -166,6 +167,7 @@ struct smb3_fs_context {
char *password;
char *domainname;
char *source;
+ char *server_hostname;
char *UNC;
char *nodename;
char *iocharset; /* local code page for mapping to and from Unicode */
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index fab47fa7df74..8eedd20c44ab 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/fscache.c - CIFS filesystem cache interface
+ * CIFS filesystem cache interface
*
* Copyright (c) 2010 Novell, Inc.
* Author(s): Suresh Jayaraman <sjayaraman@suse.de>
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 82e856b9cf89..9baa1d0f22bd 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/fscache.h - CIFS filesystem cache interface definitions
+ * CIFS filesystem cache interface definitions
*
* Copyright (c) 2010 Novell, Inc.
* Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 50c01cff4c84..82848412ad85 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/inode.c
*
* Copyright (C) International Business Machines Corp., 2002,2010
* Author(s): Steve French (sfrench@us.ibm.com)
@@ -1625,7 +1624,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
goto unlink_out;
}
- cifs_close_deferred_file(CIFS_I(inode));
+ cifs_close_deferred_file_under_dentry(tcon, full_path);
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
@@ -2114,9 +2113,9 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
goto cifs_rename_exit;
}
- cifs_close_deferred_file(CIFS_I(d_inode(source_dentry)));
+ cifs_close_deferred_file_under_dentry(tcon, from_name);
if (d_inode(target_dentry) != NULL)
- cifs_close_deferred_file(CIFS_I(d_inode(target_dentry)));
+ cifs_close_deferred_file_under_dentry(tcon, to_name);
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
to_name);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 42c6a0bac6c8..0359b604bdbc 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/ioctl.c
*
* vfs operations that deal with io control
*
@@ -359,7 +358,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
if (pSMBFile == NULL)
break;
tcon = tlink_tcon(pSMBFile->tlink);
- caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+ /* caps = le64_to_cpu(tcon->fsUnixInfo.Capability); */
if (get_user(ExtAttrBits, (int __user *)arg)) {
rc = -EFAULT;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index f0a6d63bc08c..852e54ee82c2 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/link.c
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 9469f1cf0b46..ba2c3e897b29 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/misc.c
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
@@ -153,7 +152,7 @@ cifs_buf_get(void)
* SMB2 header is bigger than CIFS one - no problems to clean some
* more bytes for CIFS.
*/
- size_t buf_size = sizeof(struct smb2_sync_hdr);
+ size_t buf_size = sizeof(struct smb2_hdr);
/*
* We could use negotiated size instead of max_msgsize -
@@ -265,7 +264,8 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
/* Uid is not converted */
buffer->Uid = treeCon->ses->Suid;
- buffer->Mid = get_next_mid(treeCon->ses->server);
+ if (treeCon->ses->server)
+ buffer->Mid = get_next_mid(treeCon->ses->server);
}
if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
buffer->Flags2 |= SMBFLG2_DFS;
@@ -591,6 +591,7 @@ void cifs_put_writer(struct cifsInodeInfo *cinode)
/**
* cifs_queue_oplock_break - queue the oplock break handler for cfile
+ * @cfile: The file to break the oplock on
*
* This function is called from the demultiplex thread when it
* receives an oplock break for @cfile.
@@ -736,7 +737,7 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
if (cancel_delayed_work(&cfile->deferred)) {
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
- continue;
+ break;
tmp_list->cfile = cfile;
list_add_tail(&tmp_list->list, &file_head);
}
@@ -767,7 +768,7 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
if (cancel_delayed_work(&cfile->deferred)) {
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
- continue;
+ break;
tmp_list->cfile = cfile;
list_add_tail(&tmp_list->list, &file_head);
}
@@ -781,6 +782,43 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
kfree(tmp_list);
}
}
+void
+cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
+{
+ struct cifsFileInfo *cfile;
+ struct list_head *tmp;
+ struct file_list *tmp_list, *tmp_next_list;
+ struct list_head file_head;
+ void *page;
+ const char *full_path;
+
+ INIT_LIST_HEAD(&file_head);
+ page = alloc_dentry_path();
+ spin_lock(&tcon->open_file_lock);
+ list_for_each(tmp, &tcon->openFileList) {
+ cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+ full_path = build_path_from_dentry(cfile->dentry, page);
+ if (strstr(full_path, path)) {
+ if (delayed_work_pending(&cfile->deferred)) {
+ if (cancel_delayed_work(&cfile->deferred)) {
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ break;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
+ }
+ }
+ }
+ }
+ spin_unlock(&tcon->open_file_lock);
+
+ list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
+ _cifsFileInfo_put(tmp_list->cfile, true, false);
+ list_del(&tmp_list->list);
+ kfree(tmp_list);
+ }
+ free_dentry_path(page);
+}
/* parses DFS refferal V3 structure
* caller is responsible for freeing target_nodes
@@ -1029,6 +1067,9 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
/**
* cifs_alloc_hash - allocate hash and hash context together
+ * @name: The name of the crypto hash algo
+ * @shash: Where to put the pointer to the hash algo
+ * @sdesc: Where to put the pointer to the hash descriptor
*
* The caller has to make sure @sdesc is initialized to either NULL or
* a valid context. Both can be freed via cifs_free_hash().
@@ -1067,6 +1108,8 @@ cifs_alloc_hash(const char *name,
/**
* cifs_free_hash - free hash and hash context together
+ * @shash: Where to find the pointer to the hash algo
+ * @sdesc: Where to find the pointer to the hash descriptor
*
* Freeing a NULL hash or context is safe.
*/
@@ -1082,8 +1125,10 @@ cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc)
/**
* rqst_page_get_length - obtain the length and offset for a page in smb_rqst
- * Input: rqst - a smb_rqst, page - a page index for rqst
- * Output: *len - the length for this page, *offset - the offset for this page
+ * @rqst: The request descriptor
+ * @page: The index of the page to query
+ * @len: Where to store the length for this page:
+ * @offset: Where to store the offset for this page
*/
void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
unsigned int *len, unsigned int *offset)
@@ -1116,6 +1161,8 @@ void extract_unc_hostname(const char *unc, const char **h, size_t *len)
/**
* copy_path_name - copy src path to dst, possibly truncating
+ * @dst: The destination buffer
+ * @src: The source name
*
* returns number of bytes written (including trailing nul)
*/
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 0e728aac67e9..fa9fbd6a819c 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * fs/cifs/netmisc.c
*
* Copyright (c) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 378133ce8869..25a2b8ef88b9 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/ntlmssp.h
*
* Copyright (c) International Business Machines Corp., 2002,2007
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 54d77c99e21c..1929e80c09ee 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/readdir.c
*
* Directory search handling
*
diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h
index 137f7c95afd6..ae1d025da294 100644
--- a/fs/cifs/rfc1002pdu.h
+++ b/fs/cifs/rfc1002pdu.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/rfc1002pdu.h
*
* Protocol Data Unit definitions for RFC 1001/1002 support
*
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 118403fbeda2..23e02db7923f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/sess.c
*
* SMB/CIFS session setup handling routines
*
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index c9d8a50062b8..f5dcc4940b6d 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/smb2file.c
*
* Copyright (C) International Business Machines Corp., 2002, 2011
* Author(s): Steve French (sfrench@us.ibm.com),
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index d0e9f3782bd9..ca692b2283cd 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/smb2glob.h
*
* Definitions for various global variables and structures
*
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 957b2594f02e..8297703492ee 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/smb2inode.c
*
* Copyright (C) International Business Machines Corp., 2002, 2011
* Etersoft, 2012
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 181514b8770d..194799ddd382 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -2439,14 +2439,16 @@ smb2_print_status(__le32 status)
int
map_smb2_to_linux_error(char *buf, bool log_err)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
unsigned int i;
int rc = -EIO;
__le32 smb2err = shdr->Status;
if (smb2err == 0) {
- trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId));
+ trace_smb3_cmd_done(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId));
return 0;
}
@@ -2470,8 +2472,10 @@ map_smb2_to_linux_error(char *buf, bool log_err)
cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
__le32_to_cpu(smb2err), rc);
- trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command),
- le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc);
+ trace_smb3_cmd_err(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId),
+ le32_to_cpu(smb2err), rc);
return rc;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 668f77108831..cdcdef32759e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/smb2misc.c
*
* Copyright (C) International Business Machines Corp., 2002,2011
* Etersoft, 2012
@@ -9,7 +8,6 @@
*
*/
#include <linux/ctype.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
@@ -20,7 +18,7 @@
#include "nterr.h"
static int
-check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
+check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid)
{
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
@@ -82,9 +80,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
};
-#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_sync_hdr) + sizeof(struct smb2_negotiate_rsp))
+#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_hdr) + sizeof(struct smb2_negotiate_rsp))
-static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
+static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
__u32 non_ctxlen)
{
__u16 neg_count;
@@ -136,13 +134,13 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
int
smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
- struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
__u64 mid;
__u32 clc_len; /* calculated length */
int command;
- int pdu_size = sizeof(struct smb2_sync_pdu);
- int hdr_size = sizeof(struct smb2_sync_hdr);
+ int pdu_size = sizeof(struct smb2_pdu);
+ int hdr_size = sizeof(struct smb2_hdr);
/*
* Add function to do table lookup of StructureSize by command
@@ -156,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) {
- if (ses->Suid == thdr->SessionId)
+ if (ses->Suid == le64_to_cpu(thdr->SessionId))
break;
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -297,7 +295,7 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
* area and the offset to it (from the beginning of the smb are also returned.
*/
char *
-smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
+smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
{
*off = 0;
*len = 0;
@@ -402,8 +400,8 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
unsigned int
smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
{
- struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
- struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+ struct smb2_hdr *shdr = &pdu->hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
/* Structure Size has already been checked to make sure it is 64 */
@@ -670,7 +668,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "Checking for oplock break\n");
- if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
+ if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
return false;
if (rsp->StructureSize !=
@@ -817,25 +815,25 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
int
smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *sync_hdr = mid->resp_buf;
+ struct smb2_hdr *hdr = mid->resp_buf;
struct smb2_create_rsp *rsp = mid->resp_buf;
struct cifs_tcon *tcon;
int rc;
- if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE ||
- sync_hdr->Status != STATUS_SUCCESS)
+ if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || hdr->Command != SMB2_CREATE ||
+ hdr->Status != STATUS_SUCCESS)
return 0;
- tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
- sync_hdr->TreeId);
+ tcon = smb2_find_smb_tcon(server, le64_to_cpu(hdr->SessionId),
+ le32_to_cpu(hdr->Id.SyncId.TreeId));
if (!tcon)
return -ENOENT;
rc = __smb2_handle_cancelled_cmd(tcon,
- le16_to_cpu(sync_hdr->Command),
- le64_to_cpu(sync_hdr->MessageId),
- rsp->PersistentFileId,
- rsp->VolatileFileId);
+ le16_to_cpu(hdr->Command),
+ le64_to_cpu(hdr->MessageId),
+ le64_to_cpu(rsp->PersistentFileId),
+ le64_to_cpu(rsp->VolatileFileId));
if (rc)
cifs_put_tcon(tcon);
@@ -857,10 +855,10 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
{
int i, rc;
struct sdesc *d;
- struct smb2_sync_hdr *hdr;
+ struct smb2_hdr *hdr;
struct TCP_Server_Info *server = cifs_ses_server(ses);
- hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ hdr = (struct smb2_hdr *)iov[0].iov_base;
/* neg prot are always taken */
if (hdr->Command == SMB2_NEGOTIATE)
goto ok;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index bda606dc72b1..7acf71defea7 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -325,7 +325,7 @@ static struct mid_q_entry *
__smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
{
struct mid_q_entry *mid;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
@@ -367,11 +367,11 @@ static void
smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
- shdr->ProcessId);
+ shdr->Id.SyncId.ProcessId);
cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
server->ops->calc_smb_size(buf, server));
#endif
@@ -885,10 +885,10 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
atomic_inc(&tcon->num_remote_opens);
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- oparms.fid->persistent_fid = o_rsp->PersistentFileId;
- oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+ oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId);
+ oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId);
#ifdef CONFIG_CIFS_DEBUG2
- oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+ oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
tcon->crfid.tcon = tcon;
@@ -2391,12 +2391,12 @@ again:
/* If the open failed there is nothing to do */
op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) {
+ if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc);
goto qdf_free;
}
- fid->persistent_fid = op_rsp->PersistentFileId;
- fid->volatile_fid = op_rsp->VolatileFileId;
+ fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId);
+ fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId);
/* Anything else than ENODATA means a genuine error */
if (rc && rc != -ENODATA) {
@@ -2410,7 +2410,7 @@ again:
atomic_inc(&tcon->num_remote_opens);
qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base;
- if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ if (qd_rsp->hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, fid->persistent_fid,
tcon->tid, tcon->ses->Suid, 0, 0);
srch_inf->endOfSearch = true;
@@ -2462,7 +2462,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
static bool
smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
int scredits, in_flight;
if (shdr->Status != STATUS_PENDING)
@@ -2489,13 +2489,14 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
static bool
smb2_is_session_expired(char *buf)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED &&
shdr->Status != STATUS_USER_SESSION_DELETED)
return false;
- trace_smb3_ses_expired(shdr->TreeId, shdr->SessionId,
+ trace_smb3_ses_expired(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
le16_to_cpu(shdr->Command),
le64_to_cpu(shdr->MessageId));
cifs_dbg(FYI, "Session expired or deleted\n");
@@ -2506,7 +2507,7 @@ smb2_is_session_expired(char *buf)
static bool
smb2_is_status_io_timeout(char *buf)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
if (shdr->Status == STATUS_IO_TIMEOUT)
return true;
@@ -2517,7 +2518,7 @@ smb2_is_status_io_timeout(char *buf)
static void
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct list_head *tmp, *tmp1;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -2530,7 +2531,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
- if (tcon->tid == shdr->TreeId) {
+ if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
tcon->need_reconnect = true;
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
@@ -2558,9 +2559,9 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
void
smb2_set_related(struct smb_rqst *rqst)
{
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
- shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
if (shdr == NULL) {
cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
return;
@@ -2573,13 +2574,13 @@ char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0};
void
smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
{
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = ses->server;
unsigned long len = smb_rqst_len(server, rqst);
int i, num_padding;
- shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
if (shdr == NULL) {
cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n");
return;
@@ -3124,7 +3125,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype, rsp_iov);
create_rsp = rsp_iov[0].iov_base;
- if (create_rsp && create_rsp->sync_hdr.Status)
+ if (create_rsp && create_rsp->hdr.Status)
err_iov = rsp_iov[0];
ioctl_rsp = rsp_iov[1].iov_base;
@@ -4369,8 +4370,8 @@ static void
fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
struct smb_rqst *old_rq, __le16 cipher_type)
{
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)old_rq->rq_iov[0].iov_base;
memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
@@ -4496,7 +4497,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
- rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key);
+ rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
enc ? "en" : "de");
@@ -4788,7 +4789,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int cur_page_idx;
unsigned int pad_len;
struct cifs_readdata *rdata = mid->callback_data;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct bio_vec *bvec = NULL;
struct iov_iter iter;
struct kvec iov;
@@ -5117,7 +5118,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
{
int ret, length;
char *buf = server->smallbuf;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
unsigned int pdu_length = server->pdu_size;
unsigned int buf_size;
struct mid_q_entry *mid_entry;
@@ -5147,7 +5148,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
next_is_large = server->large_buf;
one_more:
- shdr = (struct smb2_sync_hdr *)buf;
+ shdr = (struct smb2_hdr *)buf;
if (shdr->NextCommand) {
if (next_is_large)
next_buffer = (char *)cifs_buf_get();
@@ -5213,7 +5214,7 @@ smb3_receive_transform(struct TCP_Server_Info *server,
unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
if (pdu_length < sizeof(struct smb2_transform_hdr) +
- sizeof(struct smb2_sync_hdr)) {
+ sizeof(struct smb2_hdr)) {
cifs_server_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
cifs_reconnect(server);
@@ -5246,7 +5247,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
static int
smb2_next_header(char *buf)
{
- struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
@@ -5788,7 +5789,7 @@ struct smb_version_values smb20_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5809,7 +5810,7 @@ struct smb_version_values smb21_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5830,7 +5831,7 @@ struct smb_version_values smb3any_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5851,7 +5852,7 @@ struct smb_version_values smbdefault_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5872,7 +5873,7 @@ struct smb_version_values smb30_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5893,7 +5894,7 @@ struct smb_version_values smb302_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5914,7 +5915,7 @@ struct smb_version_values smb311_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b6d2e3591927..d2ecb2ea37c0 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/smb2pdu.c
*
* Copyright (C) International Business Machines Corp., 2009, 2013
* Etersoft, 2012
@@ -24,7 +23,6 @@
#include <linux/uuid.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
@@ -85,7 +83,7 @@ int smb3_encryption_required(const struct cifs_tcon *tcon)
}
static void
-smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
+smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
const struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
{
@@ -105,7 +103,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
} else {
shdr->CreditRequest = cpu_to_le16(2);
}
- shdr->ProcessId = cpu_to_le32((__u16)current->tgid);
+ shdr->Id.SyncId.ProcessId = cpu_to_le32((__u16)current->tgid);
if (!tcon)
goto out;
@@ -116,10 +114,10 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
shdr->CreditCharge = cpu_to_le16(1);
/* else CreditCharge MBZ */
- shdr->TreeId = tcon->tid;
+ shdr->Id.SyncId.TreeId = cpu_to_le32(tcon->tid);
/* Uid is not converted */
if (tcon->ses)
- shdr->SessionId = tcon->ses->Suid;
+ shdr->SessionId = cpu_to_le64(tcon->ses->Suid);
/*
* If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
@@ -332,7 +330,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
void *buf,
unsigned int *total_len)
{
- struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf;
+ struct smb2_pdu *spdu = (struct smb2_pdu *)buf;
/* lookup word count ie StructureSize from table */
__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)];
@@ -342,10 +340,10 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
*/
memset(buf, 0, 256);
- smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server);
+ smb2_hdr_assemble(&spdu->hdr, smb2_command, tcon, server);
spdu->StructureSize2 = cpu_to_le16(parmsize);
- *total_len = parmsize + sizeof(struct smb2_sync_hdr);
+ *total_len = parmsize + sizeof(struct smb2_hdr);
}
/*
@@ -368,7 +366,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
}
fill_small_buf(smb2_command, tcon, server,
- (struct smb2_sync_hdr *)(*request_buf),
+ (struct smb2_hdr *)(*request_buf),
total_len);
if (tcon != NULL) {
@@ -415,8 +413,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
pneg_ctxt->DataLength = cpu_to_le16(38);
pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
- pneg_ctxt->SaltLength = cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE);
- get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE);
+ pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
+ get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512;
}
@@ -858,7 +856,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
if (rc)
return rc;
- req->sync_hdr.SessionId = 0;
+ req->hdr.SessionId = 0;
memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
@@ -1019,7 +1017,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
server->cipher_type = SMB2_ENCRYPTION_AES128_CCM;
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
- (struct smb2_sync_hdr *)rsp);
+ (struct smb2_hdr *)rsp);
/*
* See MS-SMB2 section 2.2.4: if no blob, client picks default which
* for us will be
@@ -1251,23 +1249,23 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
return rc;
if (sess_data->ses->binding) {
- req->sync_hdr.SessionId = sess_data->ses->Suid;
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid);
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
req->PreviousSessionId = 0;
req->Flags = SMB2_SESSION_REQ_FLAG_BINDING;
} else {
/* First session, not a reauthenticate */
- req->sync_hdr.SessionId = 0;
+ req->hdr.SessionId = 0;
/*
* if reconnect, we need to send previous sess id
* otherwise it is 0
*/
- req->PreviousSessionId = sess_data->previous_session;
+ req->PreviousSessionId = cpu_to_le64(sess_data->previous_session);
req->Flags = 0; /* MBZ */
}
/* enough to enable echos and oplocks and one max size write */
- req->sync_hdr.CreditRequest = cpu_to_le16(130);
+ req->hdr.CreditRequest = cpu_to_le16(130);
/* only one of SMB2 signing flags may be set in SMB2 request */
if (server->sign)
@@ -1426,7 +1424,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
/* keep session id and flags if binding */
if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1502,7 +1500,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
/* If true, rc here is expected and not an error */
if (sess_data->buf0_type != CIFS_NO_BUFFER &&
- rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+ rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
rc = 0;
if (rc)
@@ -1524,7 +1522,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
/* keep existing ses id and flags if binding */
if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1559,7 +1557,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
goto out;
req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base;
- req->sync_hdr.SessionId = ses->Suid;
+ req->hdr.SessionId = cpu_to_le64(ses->Suid);
rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
sess_data->nls_cp);
@@ -1585,7 +1583,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
/* keep existing ses id and flags if binding */
if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1716,12 +1714,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
return rc;
/* since no tcon, smb2_init can not do this, so do here */
- req->sync_hdr.SessionId = ses->Suid;
+ req->hdr.SessionId = cpu_to_le64(ses->Suid);
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
flags |= CIFS_TRANSFORM_REQ;
else if (server->sign)
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
flags |= CIFS_NO_RSP_BUF;
@@ -1829,14 +1827,14 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
!(ses->session_flags &
(SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
((ses->user_name != NULL) || (ses->sectype == Kerberos)))
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
memset(&rqst, 0, sizeof(struct smb_rqst));
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
/* Need 64 for max size write so ask for more in case not there yet */
- req->sync_hdr.CreditRequest = cpu_to_le16(64);
+ req->hdr.CreditRequest = cpu_to_le16(64);
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -1872,7 +1870,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tidStatus = CifsGood;
tcon->need_reconnect = false;
- tcon->tid = rsp->sync_hdr.TreeId;
+ tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
@@ -1893,9 +1891,8 @@ tcon_exit:
return rc;
tcon_error_exit:
- if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+ if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME)
cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
- }
goto tcon_exit;
}
@@ -2398,7 +2395,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
buf->sd.OffsetDacl = cpu_to_le32(ptr - (__u8 *)&buf->sd);
/* Ship the ACL for now. we will copy it into buf later. */
aclptr = ptr;
- ptr += sizeof(struct cifs_acl);
+ ptr += sizeof(struct smb3_acl);
/* create one ACE to hold the mode embedded in reserved special SID */
acelen = setup_special_mode_ACE((struct cifs_ace *)ptr, (__u64)mode);
@@ -2423,7 +2420,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */
acl.AclSize = cpu_to_le16(acl_size);
acl.AceCount = cpu_to_le16(ace_count);
- memcpy(aclptr, &acl, sizeof(struct cifs_acl));
+ memcpy(aclptr, &acl, sizeof(struct smb3_acl));
buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
*len = roundup(ptr - (__u8 *)buf, 8);
@@ -2609,7 +2606,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
if (tcon->share_flags & SHI1005_FLAGS_DFS) {
int name_len;
- req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+ req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, utf16_path);
@@ -2673,11 +2670,13 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
}
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
- trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid,
+ trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId),
+ tcon->tid,
ses->Suid, CREATE_NOT_FILE,
FILE_WRITE_ATTRIBUTES);
- SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId);
+ SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId),
+ le64_to_cpu(rsp->VolatileFileId));
/* Eventually save off posix specific response info and timestaps */
@@ -2741,7 +2740,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (tcon->share_flags & SHI1005_FLAGS_DFS) {
int name_len;
- req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+ req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, path);
@@ -2944,16 +2943,17 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
goto creat_exit;
} else
- trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
+ trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId),
+ tcon->tid,
ses->Suid, oparms->create_options,
oparms->desired_access);
atomic_inc(&tcon->num_remote_opens);
- oparms->fid->persistent_fid = rsp->PersistentFileId;
- oparms->fid->volatile_fid = rsp->VolatileFileId;
+ oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId);
+ oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId);
oparms->fid->access = oparms->desired_access;
#ifdef CONFIG_CIFS_DEBUG2
- oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
+ oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
if (buf) {
@@ -3053,7 +3053,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
* response size smaller.
*/
req->MaxOutputResponse = cpu_to_le32(max_response_size);
- req->sync_hdr.CreditCharge =
+ req->hdr.CreditCharge =
cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
SMB2_MAX_BUFFER_SIZE));
if (is_fsctl)
@@ -3063,7 +3063,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
return 0;
}
@@ -3237,8 +3237,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (rc)
return rc;
- req->PersistentFileId = persistent_fid;
- req->VolatileFileId = volatile_fid;
+ req->PersistentFileId = cpu_to_le64(persistent_fid);
+ req->VolatileFileId = cpu_to_le64(volatile_fid);
if (query_attrs)
req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
else
@@ -3601,8 +3601,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
if (rc)
return rc;
- req->PersistentFileId = persistent_fid;
- req->VolatileFileId = volatile_fid;
+ req->PersistentFileId = cpu_to_le64(persistent_fid);
+ req->VolatileFileId = cpu_to_le64(volatile_fid);
/* See note 354 of MS-SMB2, 64K max */
req->OutputBufferLength =
cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
@@ -3688,7 +3688,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
if (mid->mid_state == MID_RESPONSE_RECEIVED
|| mid->mid_state == MID_RESPONSE_MALFORMED) {
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
}
@@ -3788,7 +3788,7 @@ SMB2_echo(struct TCP_Server_Info *server)
if (rc)
return rc;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
iov[0].iov_len = total_len;
iov[0].iov_base = (char *)req;
@@ -3824,8 +3824,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst,
if (rc)
return rc;
- req->PersistentFileId = persistent_fid;
- req->VolatileFileId = volatile_fid;
+ req->PersistentFileId = cpu_to_le64(persistent_fid);
+ req->VolatileFileId = cpu_to_le64(volatile_fid);
iov[0].iov_base = (char *)req;
iov[0].iov_len = total_len;
@@ -3891,8 +3891,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
unsigned int remaining_bytes, int request_type)
{
int rc = -EACCES;
- struct smb2_read_plain_req *req = NULL;
- struct smb2_sync_hdr *shdr;
+ struct smb2_read_req *req = NULL;
+ struct smb2_hdr *shdr;
struct TCP_Server_Info *server = io_parms->server;
rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server,
@@ -3903,11 +3903,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (server == NULL)
return -ECONNABORTED;
- shdr = &req->sync_hdr;
- shdr->ProcessId = cpu_to_le32(io_parms->pid);
+ shdr = &req->hdr;
+ shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
- req->PersistentFileId = io_parms->persistent_fid;
- req->VolatileFileId = io_parms->volatile_fid;
+ req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
+ req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
req->ReadChannelInfoOffset = 0; /* reserved */
req->ReadChannelInfoLength = 0; /* reserved */
req->Channel = 0; /* reserved */
@@ -3941,7 +3941,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (need_invalidate)
req->Channel = SMB2_CHANNEL_RDMA_V1;
req->ReadChannelInfoOffset =
- cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer));
+ cpu_to_le16(offsetof(struct smb2_read_req, Buffer));
req->ReadChannelInfoLength =
cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
@@ -3965,10 +3965,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* Related requests use info from previous read request
* in chain.
*/
- shdr->SessionId = 0xFFFFFFFFFFFFFFFF;
- shdr->TreeId = 0xFFFFFFFF;
- req->PersistentFileId = 0xFFFFFFFFFFFFFFFF;
- req->VolatileFileId = 0xFFFFFFFFFFFFFFFF;
+ shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+ shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF);
+ req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+ req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
}
}
if (remaining_bytes > io_parms->length)
@@ -3986,8 +3986,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct cifs_readdata *rdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
struct TCP_Server_Info *server = rdata->server;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rdata->iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rdata->iov[0].iov_base;
struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
.rq_nvec = 1,
@@ -4073,7 +4073,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
{
int rc, flags = 0;
char *buf;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_io_parms io_parms;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 1 };
@@ -4106,7 +4106,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
rdata->iov[0].iov_base = buf;
rdata->iov[0].iov_len = total_len;
- shdr = (struct smb2_sync_hdr *)buf;
+ shdr = (struct smb2_hdr *)buf;
if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
@@ -4145,7 +4145,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
{
struct smb_rqst rqst;
int resp_buftype, rc;
- struct smb2_read_plain_req *req = NULL;
+ struct smb2_read_req *req = NULL;
struct smb2_read_rsp *rsp = NULL;
struct kvec iov[1];
struct kvec rsp_iov;
@@ -4179,19 +4179,22 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc != -ENODATA) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
- trace_smb3_read_err(xid, req->PersistentFileId,
+ trace_smb3_read_err(xid,
+ le64_to_cpu(req->PersistentFileId),
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
} else
- trace_smb3_read_done(xid, req->PersistentFileId,
- io_parms->tcon->tid, ses->Suid,
- io_parms->offset, 0);
+ trace_smb3_read_done(xid,
+ le64_to_cpu(req->PersistentFileId),
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
- trace_smb3_read_done(xid, req->PersistentFileId,
+ trace_smb3_read_done(xid,
+ le64_to_cpu(req->PersistentFileId),
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length);
@@ -4239,7 +4242,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
wdata->result = smb2_check_receive(mid, server, 0);
if (wdata->result != 0)
@@ -4265,7 +4268,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
fallthrough;
default:
@@ -4312,7 +4315,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
{
int rc = -EACCES, flags = 0;
struct smb2_write_req *req = NULL;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
struct kvec iov[1];
@@ -4330,11 +4333,11 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- shdr = (struct smb2_sync_hdr *)req;
- shdr->ProcessId = cpu_to_le32(wdata->cfile->pid);
+ shdr = (struct smb2_hdr *)req;
+ shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid);
- req->PersistentFileId = wdata->cfile->fid.persistent_fid;
- req->VolatileFileId = wdata->cfile->fid.volatile_fid;
+ req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid);
+ req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid);
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
req->Channel = 0;
@@ -4431,7 +4434,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata, flags, &wdata->credits);
if (rc) {
- trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
+ trace_smb3_write_err(0 /* no xid */,
+ le64_to_cpu(req->PersistentFileId),
tcon->tid, tcon->ses->Suid, wdata->offset,
wdata->bytes, rc);
kref_put(&wdata->refcount, release);
@@ -4482,10 +4486,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (smb3_encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
- req->PersistentFileId = io_parms->persistent_fid;
- req->VolatileFileId = io_parms->volatile_fid;
+ req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
+ req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
req->Channel = 0;
@@ -4513,7 +4517,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
- trace_smb3_write_err(xid, req->PersistentFileId,
+ trace_smb3_write_err(xid,
+ le64_to_cpu(req->PersistentFileId),
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
io_parms->offset, io_parms->length, rc);
@@ -4521,10 +4526,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_dbg(VFS, "Send error in write = %d\n", rc);
} else {
*nbytes = le32_to_cpu(rsp->DataLength);
- trace_smb3_write_done(xid, req->PersistentFileId,
- io_parms->tcon->tid,
- io_parms->tcon->ses->Suid,
- io_parms->offset, *nbytes);
+ trace_smb3_write_done(xid,
+ le64_to_cpu(req->PersistentFileId),
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset, *nbytes);
}
cifs_small_buf_release(req);
@@ -4867,7 +4873,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
- rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ rsp->hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, persistent_fid,
tcon->tid, tcon->ses->Suid, index, 0);
srch_inf->endOfSearch = true;
@@ -4915,7 +4921,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (rc)
return rc;
- req->sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
@@ -5075,7 +5081,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
req->VolatileFid = volatile_fid;
req->PersistentFid = persistent_fid;
req->OplockLevel = oplock_level;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
flags |= CIFS_NO_RSP_BUF;
@@ -5377,7 +5383,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
req->LockCount = cpu_to_le16(num_lock);
req->PersistentFileId = persist_fid;
@@ -5453,7 +5459,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
req->StructureSize = cpu_to_le16(36);
total_len += 12;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index e9cac7970b66..33cfd0a1adf1 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/smb2pdu.h
*
* Copyright (c) International Business Machines Corp., 2009, 2013
* Etersoft, 2012
@@ -15,156 +14,12 @@
#include <net/sock.h>
#include "cifsacl.h"
-/*
- * Note that, due to trying to use names similar to the protocol specifications,
- * there are many mixed case field names in the structures below. Although
- * this does not match typical Linux kernel style, it is necessary to be
- * able to match against the protocol specfication.
- *
- * SMB2 commands
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
- * (ie no useful data other than the SMB error code itself) and are marked such.
- * Knowing this helps avoid response buffer allocations and copy in some cases.
- */
-
-/* List of commands in host endian */
-#define SMB2_NEGOTIATE_HE 0x0000
-#define SMB2_SESSION_SETUP_HE 0x0001
-#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
-#define SMB2_TREE_CONNECT_HE 0x0003
-#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
-#define SMB2_CREATE_HE 0x0005
-#define SMB2_CLOSE_HE 0x0006
-#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
-#define SMB2_READ_HE 0x0008
-#define SMB2_WRITE_HE 0x0009
-#define SMB2_LOCK_HE 0x000A
-#define SMB2_IOCTL_HE 0x000B
-#define SMB2_CANCEL_HE 0x000C
-#define SMB2_ECHO_HE 0x000D
-#define SMB2_QUERY_DIRECTORY_HE 0x000E
-#define SMB2_CHANGE_NOTIFY_HE 0x000F
-#define SMB2_QUERY_INFO_HE 0x0010
-#define SMB2_SET_INFO_HE 0x0011
-#define SMB2_OPLOCK_BREAK_HE 0x0012
-
-/* The same list in little endian */
-#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
-#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
-#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
-#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
-#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
-#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
-#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
-#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
-#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
-#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
-#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
-#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
-#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
-#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
-#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
-#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
-#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
-#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
-#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
-
-#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF)
-
-#define NUMBER_OF_SMB2_COMMANDS 0x0013
-
/* 52 transform hdr + 64 hdr + 88 create rsp */
#define SMB2_TRANSFORM_HEADER_SIZE 52
#define MAX_SMB2_HDR_SIZE 204
-#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
-#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
-#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
-
-/*
- * SMB2 Header Definition
- *
- * "MBZ" : Must be Zero
- * "BB" : BugBug, Something to check/review/analyze later
- * "PDU" : "Protocol Data Unit" (ie a network "frame")
- *
- */
-
-#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
-
-struct smb2_sync_hdr {
- __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
- __le16 StructureSize; /* 64 */
- __le16 CreditCharge; /* MBZ */
- __le32 Status; /* Error from server */
- __le16 Command;
- __le16 CreditRequest; /* CreditResponse */
- __le32 Flags;
- __le32 NextCommand;
- __le64 MessageId;
- __le32 ProcessId;
- __u32 TreeId; /* opaque - so do not make little endian */
- __u64 SessionId; /* opaque - so do not make little endian */
- __u8 Signature[16];
-} __packed;
-
/* The total header size for SMB2 read and write */
-#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr))
-
-struct smb2_sync_pdu {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
-#define SMB3_AES_CCM_NONCE 11
-#define SMB3_AES_GCM_NONCE 12
-
-/* Transform flags (for 3.0 dialect this flag indicates CCM */
-#define TRANSFORM_FLAG_ENCRYPTED 0x0001
-struct smb2_transform_hdr {
- __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
- __u8 Signature[16];
- __u8 Nonce[16];
- __le32 OriginalMessageSize;
- __u16 Reserved1;
- __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
- __u64 SessionId;
-} __packed;
-
-/* See MS-SMB2 2.2.42 */
-struct smb2_compression_transform_hdr_unchained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
- __le32 OriginalCompressedSegmentSize;
- __le16 CompressionAlgorithm;
- __le16 Flags;
- __le16 Length; /* if chained it is length, else offset */
-} __packed;
-
-/* See MS-SMB2 2.2.42.1 */
-#define SMB2_COMPRESSION_FLAG_NONE 0x0000
-#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
-
-struct compression_payload_header {
- __le16 CompressionAlgorithm;
- __le16 Flags;
- __le32 Length; /* length of compressed playload including field below if present */
- /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2 */
-struct smb2_compression_transform_hdr_chained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
- __le32 OriginalCompressedSegmentSize;
- /* struct compression_payload_header[] */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2.2 */
-struct compression_pattern_payload_v1 {
- __le16 Pattern;
- __le16 Reserved1;
- __le16 Reserved2;
- __le32 Repetitions;
-} __packed;
+#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_hdr))
/* See MS-SMB2 2.2.43 */
struct smb2_rdma_transform {
@@ -191,17 +46,6 @@ struct smb2_rdma_crypto_transform {
} __packed;
/*
- * SMB2 flag definitions
- */
-#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
-#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
-#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
-#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
-#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */
-#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
-#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */
-
-/*
* Definitions for SMB2 Protocol Data Units (network frames)
*
* See MS-SMB2.PDF specification for protocol details.
@@ -215,7 +59,7 @@ struct smb2_rdma_crypto_transform {
#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
struct smb2_err_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize;
__le16 Reserved; /* MBZ */
__le32 ByteCount; /* even if zero, at least one byte follows */
@@ -271,530 +115,6 @@ struct share_redirect_error_context_rsp {
/* __u8 ResourceName[] */ /* Name of share as counted Unicode string */
} __packed;
-#define SMB2_CLIENT_GUID_SIZE 16
-
-struct smb2_negotiate_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 DialectCount;
- __le16 SecurityMode;
- __le16 Reserved; /* MBZ */
- __le32 Capabilities;
- __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
- /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
- __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
- __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
- __le16 Reserved2;
- __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
-} __packed;
-
-/* Dialects */
-#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
-#define SMB20_PROT_ID 0x0202
-#define SMB21_PROT_ID 0x0210
-#define SMB30_PROT_ID 0x0300
-#define SMB302_PROT_ID 0x0302
-#define SMB311_PROT_ID 0x0311
-#define BAD_PROT_ID 0xFFFF
-
-/* SecurityMode flags */
-#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
-#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
-#define SMB2_SEC_MODE_FLAGS_ALL 0x0003
-
-/* Capabilities flags */
-#define SMB2_GLOBAL_CAP_DFS 0x00000001
-#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
-/* Internal types */
-#define SMB2_NT_FIND 0x00100000
-#define SMB2_LARGE_FILES 0x00200000
-
-
-/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
-#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
-#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
-#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
-#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
-#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6)
-#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7)
-#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
-#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
-
-struct smb2_neg_context {
- __le16 ContextType;
- __le16 DataLength;
- __le32 Reserved;
- /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
-} __packed;
-
-#define SMB311_LINUX_CLIENT_SALT_SIZE 32
-/* Hash Algorithm Types */
-#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
-#define SMB2_PREAUTH_HASH_SIZE 64
-
-/*
- * SaltLength that the server send can be zero, so the only three required
- * fields (all __le16) end up six bytes total, so the minimum context data len
- * in the response is six bytes which accounts for
- *
- * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
- */
-#define MIN_PREAUTH_CTXT_DATA_LEN 6
-
-struct smb2_preauth_neg_context {
- __le16 ContextType; /* 1 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 HashAlgorithmCount; /* 1 */
- __le16 SaltLength;
- __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
- __u8 Salt[SMB311_LINUX_CLIENT_SALT_SIZE];
-} __packed;
-
-/* Encryption Algorithms Ciphers */
-#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
-#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
-/* we currently do not request AES256_CCM since presumably GCM faster */
-#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
-#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
-
-/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
-#define MIN_ENCRYPT_CTXT_DATA_LEN 4
-struct smb2_encryption_neg_context {
- __le16 ContextType; /* 2 */
- __le16 DataLength;
- __le32 Reserved;
- /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
- __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
- __le16 Ciphers[3];
-} __packed;
-
-/* See MS-SMB2 2.2.3.1.3 */
-#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
-#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
-#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
-#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
-/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
-#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */
-
-/* Compression Flags */
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
-
-struct smb2_compression_capabilities_context {
- __le16 ContextType; /* 3 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 CompressionAlgorithmCount;
- __u16 Padding;
- __u32 Flags;
- __le16 CompressionAlgorithms[3];
- __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */
- /* Check if pad needed */
-} __packed;
-
-/*
- * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
- * Its struct simply contains NetName, an array of Unicode characters
- */
-struct smb2_netname_neg_context {
- __le16 ContextType; /* 5 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 NetName[]; /* hostname of target converted to UCS-2 */
-} __packed;
-
-/*
- * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
- * and 2.2.4.1.5
- */
-
-/* Flags */
-#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001
-
-struct smb2_transport_capabilities_context {
- __le16 ContextType; /* 6 */
- __le16 DataLength;
- __u32 Reserved;
- __le32 Flags;
- __u32 Pad;
-} __packed;
-
-/*
- * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
- * and 2.2.4.1.6
- */
-
-/* RDMA Transform IDs */
-#define SMB2_RDMA_TRANSFORM_NONE 0x0000
-#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
-#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002
-
-struct smb2_rdma_transform_capabilities_context {
- __le16 ContextType; /* 7 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 TransformCount;
- __u16 Reserved1;
- __u32 Reserved2;
- __le16 RDMATransformIds[];
-} __packed;
-
-/*
- * For signing capabilities context see MS-SMB2 2.2.3.1.7
- * and 2.2.4.1.7
- */
-
-/* Signing algorithms */
-#define SIGNING_ALG_HMAC_SHA256 0
-#define SIGNING_ALG_AES_CMAC 1
-#define SIGNING_ALG_AES_GMAC 2
-
-struct smb2_signing_capabilities {
- __le16 ContextType; /* 8 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 SigningAlgorithmCount;
- __le16 SigningAlgorithms[];
- /* Followed by padding to 8 byte boundary (required by some servers) */
-} __packed;
-
-#define POSIX_CTXT_DATA_LEN 16
-struct smb2_posix_neg_context {
- __le16 ContextType; /* 0x100 */
- __le16 DataLength;
- __le32 Reserved;
- __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
-} __packed;
-
-struct smb2_negotiate_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 65 */
- __le16 SecurityMode;
- __le16 DialectRevision;
- __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
- __u8 ServerGUID[16];
- __le32 Capabilities;
- __le32 MaxTransactSize;
- __le32 MaxReadSize;
- __le32 MaxWriteSize;
- __le64 SystemTime; /* MBZ */
- __le64 ServerStartTime;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Flags */
-#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
-#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
-
-struct smb2_sess_setup_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 25 */
- __u8 Flags;
- __u8 SecurityMode;
- __le32 Capabilities;
- __le32 Channel;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __u64 PreviousSessionId;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Currently defined SessionFlags */
-#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
-#define SMB2_SESSION_FLAG_IS_NULL 0x0002
-#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
-struct smb2_sess_setup_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 SessionFlags;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-struct smb2_logoff_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_logoff_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-/* Flags/Reserved for SMB3.1.1 */
-#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
-#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
-#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
-
-struct smb2_tree_connect_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */
- __le16 PathOffset;
- __le16 PathLength;
- __u8 Buffer[1]; /* variable length */
-} __packed;
-
-/* See MS-SMB2 section 2.2.9.2 */
-/* Context Types */
-#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
-#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
-
-struct tree_connect_contexts {
- __le16 ContextType;
- __le16 DataLength;
- __le32 Reserved;
- __u8 Data[];
-} __packed;
-
-/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
-struct smb3_blob_data {
- __le16 BlobSize;
- __u8 BlobData[];
-} __packed;
-
-/* Valid values for Attr */
-#define SE_GROUP_MANDATORY 0x00000001
-#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002
-#define SE_GROUP_ENABLED 0x00000004
-#define SE_GROUP_OWNER 0x00000008
-#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010
-#define SE_GROUP_INTEGRITY 0x00000020
-#define SE_GROUP_INTEGRITY_ENABLED 0x00000040
-#define SE_GROUP_RESOURCE 0x20000000
-#define SE_GROUP_LOGON_ID 0xC0000000
-
-/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
-
-struct sid_array_data {
- __le16 SidAttrCount;
- /* SidAttrList - array of sid_attr_data structs */
-} __packed;
-
-struct luid_attr_data {
-
-} __packed;
-
-/*
- * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
- * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA
- */
-
-struct privilege_array_data {
- __le16 PrivilegeCount;
- /* array of privilege_data structs */
-} __packed;
-
-struct remoted_identity_tcon_context {
- __le16 TicketType; /* must be 0x0001 */
- __le16 TicketSize; /* total size of this struct */
- __le16 User; /* offset to SID_ATTR_DATA struct with user info */
- __le16 UserName; /* offset to null terminated Unicode username string */
- __le16 Domain; /* offset to null terminated Unicode domain name */
- __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
- __le16 RestrictedGroups; /* similar to above */
- __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
- __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
- __le16 Owner; /* offset to BLOB_DATA struct */
- __le16 DefaultDacl; /* offset to BLOB_DATA struct */
- __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
- __le16 UserClaims; /* offset to BLOB_DATA struct */
- __le16 DeviceClaims; /* offset to BLOB_DATA struct */
- __u8 TicketInfo[]; /* variable length buf - remoted identity data */
-} __packed;
-
-struct smb2_tree_connect_req_extension {
- __le32 TreeConnectContextOffset;
- __le16 TreeConnectContextCount;
- __u8 Reserved[10];
- __u8 PathName[]; /* variable sized array */
- /* followed by array of TreeConnectContexts */
-} __packed;
-
-struct smb2_tree_connect_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 16 */
- __u8 ShareType; /* see below */
- __u8 Reserved;
- __le32 ShareFlags; /* see below */
- __le32 Capabilities; /* see below */
- __le32 MaximalAccess;
-} __packed;
-
-/* Possible ShareType values */
-#define SMB2_SHARE_TYPE_DISK 0x01
-#define SMB2_SHARE_TYPE_PIPE 0x02
-#define SMB2_SHARE_TYPE_PRINT 0x03
-
-/*
- * Possible ShareFlags - exactly one and only one of the first 4 caching flags
- * must be set (any of the remaining, SHI1005, flags may be set individually
- * or in combination.
- */
-#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
-#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
-#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
-#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
-#define SHI1005_FLAGS_DFS 0x00000001
-#define SHI1005_FLAGS_DFS_ROOT 0x00000002
-#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
-#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
-#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
-#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
-#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
-#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
-#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
-#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
-#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
-#define SHI1005_FLAGS_ALL 0x0014FF33
-
-/* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
-#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
-#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
-#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
-#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
-#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
-
-struct smb2_tree_disconnect_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_tree_disconnect_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-/* File Attrubutes */
-#define FILE_ATTRIBUTE_READONLY 0x00000001
-#define FILE_ATTRIBUTE_HIDDEN 0x00000002
-#define FILE_ATTRIBUTE_SYSTEM 0x00000004
-#define FILE_ATTRIBUTE_DIRECTORY 0x00000010
-#define FILE_ATTRIBUTE_ARCHIVE 0x00000020
-#define FILE_ATTRIBUTE_NORMAL 0x00000080
-#define FILE_ATTRIBUTE_TEMPORARY 0x00000100
-#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200
-#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400
-#define FILE_ATTRIBUTE_COMPRESSED 0x00000800
-#define FILE_ATTRIBUTE_OFFLINE 0x00001000
-#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
-#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
-#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
-#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
-
-/* Oplock levels */
-#define SMB2_OPLOCK_LEVEL_NONE 0x00
-#define SMB2_OPLOCK_LEVEL_II 0x01
-#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
-#define SMB2_OPLOCK_LEVEL_BATCH 0x09
-#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
-/* Non-spec internal type */
-#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
-
-/* Desired Access Flags */
-#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
-#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
-#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
-#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
-#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
-#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
-#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
-#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
-#define FILE_DELETE_LE cpu_to_le32(0x00010000)
-#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
-#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
-#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
-#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
-#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
-#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
-#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
-#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
-#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
-#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
-
-/* ShareAccess Flags */
-#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
-#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
-#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
-#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
-
-/* CreateDisposition Flags */
-#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
-#define FILE_OPEN_LE cpu_to_le32(0x00000001)
-#define FILE_CREATE_LE cpu_to_le32(0x00000002)
-#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
-#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
-#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
-
-/* CreateOptions Flags */
-#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
-/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */
-#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
-#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
-#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008)
-#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010)
-#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020)
-#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
-#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
-#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
-#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
-#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
-#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
-#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
-#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000)
-#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
-#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
-#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
-
-#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
- | FILE_READ_ATTRIBUTES_LE)
-#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
- | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
-#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
-
-/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
-#define IL_ANONYMOUS cpu_to_le32(0x00000000)
-#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
-#define IL_IMPERSONATION cpu_to_le32(0x00000002)
-#define IL_DELEGATE cpu_to_le32(0x00000003)
-
-/* Create Context Values */
-#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
-#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
-#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
-#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
-#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
-#define SMB2_CREATE_REQUEST_LEASE "RqLs"
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
-#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
-#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010
-#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
-#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
-
-/* Flag (SMB3 open response) values */
-#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
-
/*
* Maximum number of iovs we need for an open/create request.
* [0] : struct smb2_create_req
@@ -808,26 +128,6 @@ struct smb2_tree_disconnect_rsp {
*/
#define SMB2_CREATE_IOV_SIZE 8
-struct smb2_create_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 57 */
- __u8 SecurityFlags;
- __u8 RequestedOplockLevel;
- __le32 ImpersonationLevel;
- __le64 SmbCreateFlags;
- __le64 Reserved;
- __le32 DesiredAccess;
- __le32 FileAttributes;
- __le32 ShareAccess;
- __le32 CreateDisposition;
- __le32 CreateOptions;
- __le16 NameOffset;
- __le16 NameLength;
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[];
-} __packed;
-
/*
* Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
* 88 (fixed part of create response) + 520 (path) + 208 (contexts) +
@@ -835,37 +135,6 @@ struct smb2_create_req {
*/
#define MAX_SMB2_CREATE_RESPONSE_SIZE 880
-struct smb2_create_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 89 */
- __u8 OplockLevel;
- __u8 Flag; /* 0x01 if reparse point */
- __le32 CreateAction;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize;
- __le64 EndofFile;
- __le32 FileAttributes;
- __le32 Reserved2;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[1];
-} __packed;
-
-struct create_context {
- __le32 Next;
- __le16 NameOffset;
- __le16 NameLength;
- __le16 Reserved;
- __le16 DataOffset;
- __le32 DataLength;
- __u8 Buffer[];
-} __packed;
-
#define SMB2_LEASE_READ_CACHING_HE 0x01
#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
#define SMB2_LEASE_WRITE_CACHING_HE 0x04
@@ -1211,7 +480,7 @@ struct duplicate_extents_to_file {
#define SMB2_IOCTL_IOV_SIZE 2
struct smb2_ioctl_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 57 */
__u16 Reserved;
__le32 CtlCode;
@@ -1229,7 +498,7 @@ struct smb2_ioctl_req {
} __packed;
struct smb2_ioctl_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 57 */
__u16 Reserved;
__le32 CtlCode;
@@ -1244,161 +513,6 @@ struct smb2_ioctl_rsp {
/* char * buffer[] */
} __packed;
-/* Currently defined values for close flags */
-#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
-struct smb2_close_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Flags;
- __le32 Reserved;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
-} __packed;
-
-/*
- * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
- */
-#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
-
-struct smb2_close_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* 60 */
- __le16 Flags;
- __le32 Reserved;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile;
- __le32 Attributes;
-} __packed;
-
-struct smb2_flush_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Reserved1;
- __le32 Reserved2;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
-} __packed;
-
-struct smb2_flush_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize;
- __le16 Reserved;
-} __packed;
-
-/* For read request Flags field below, following flag is defined for SMB3.02 */
-#define SMB2_READFLAG_READ_UNBUFFERED 0x01
-#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
-
-/* Channel field for read and write: exactly one of following flags can be set*/
-#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
-#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */
-#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */
-#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */
-
-/* SMB2 read request without RFC1001 length at the beginning */
-struct smb2_read_plain_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 49 */
- __u8 Padding; /* offset from start of SMB2 header to place read */
- __u8 Flags; /* MBZ unless SMB3.02 or later */
- __le32 Length;
- __le64 Offset;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 MinimumCount;
- __le32 Channel; /* MBZ except for SMB3 or later */
- __le32 RemainingBytes;
- __le16 ReadChannelInfoOffset;
- __le16 ReadChannelInfoLength;
- __u8 Buffer[1];
-} __packed;
-
-/* Read flags */
-#define SMB2_READFLAG_RESPONSE_NONE 0x00000000
-#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001
-
-struct smb2_read_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Flags;
- __u8 Buffer[1];
-} __packed;
-
-/* For write request Flags field below the following flags are defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
-#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
-
-struct smb2_write_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 49 */
- __le16 DataOffset; /* offset from start of SMB2 header to write data */
- __le32 Length;
- __le64 Offset;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 Channel; /* MBZ unless SMB3.02 or later */
- __le32 RemainingBytes;
- __le16 WriteChannelInfoOffset;
- __le16 WriteChannelInfoLength;
- __le32 Flags;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_write_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-/* notify flags */
-#define SMB2_WATCH_TREE 0x0001
-
-/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
-#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
-#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
-#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
-#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
-#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
-#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
-#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
-#define FILE_NOTIFY_CHANGE_EA 0x00000080
-#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
-#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
-#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
-#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
-
-struct smb2_change_notify_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize;
- __le16 Flags;
- __le32 OutputBufferLength;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 CompletionFilter;
- __u32 Reserved;
-} __packed;
-
-struct smb2_change_notify_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1]; /* array of file notify structs */
-} __packed;
-
#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001
#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002
#define SMB2_LOCKFLAG_UNLOCK 0x0004
@@ -1412,7 +526,7 @@ struct smb2_lock_element {
} __packed;
struct smb2_lock_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 48 */
__le16 LockCount;
/*
@@ -1427,19 +541,19 @@ struct smb2_lock_req {
} __packed;
struct smb2_lock_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
struct smb2_echo_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 4 */
__u16 Reserved;
} __packed;
struct smb2_echo_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 4 */
__u16 Reserved;
} __packed;
@@ -1469,7 +583,7 @@ struct smb2_echo_rsp {
*/
struct smb2_query_directory_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 33 */
__u8 FileInformationClass;
__u8 Flags;
@@ -1483,7 +597,7 @@ struct smb2_query_directory_req {
} __packed;
struct smb2_query_directory_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1516,7 +630,7 @@ struct smb2_query_directory_rsp {
#define SL_INDEX_SPECIFIED 0x00000004
struct smb2_query_info_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 41 */
__u8 InfoType;
__u8 FileInfoClass;
@@ -1532,7 +646,7 @@ struct smb2_query_info_req {
} __packed;
struct smb2_query_info_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1549,7 +663,7 @@ struct smb2_query_info_rsp {
#define SMB2_SET_INFO_IOV_SIZE 3
struct smb2_set_info_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 33 */
__u8 InfoType;
__u8 FileInfoClass;
@@ -1563,12 +677,12 @@ struct smb2_set_info_req {
} __packed;
struct smb2_set_info_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 2 */
} __packed;
struct smb2_oplock_break {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 24 */
__u8 OplockLevel;
__u8 Reserved;
@@ -1580,7 +694,7 @@ struct smb2_oplock_break {
#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
struct smb2_lease_break {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 44 */
__le16 Epoch;
__le32 Flags;
@@ -1593,7 +707,7 @@ struct smb2_lease_break {
} __packed;
struct smb2_lease_ack {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 36 */
__le16 Reserved;
__le32 Flags;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 263767f644f8..096fada16ebd 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/smb2proto.h
*
* Copyright (c) International Business Machines Corp., 2002, 2011
* Etersoft, 2012
@@ -26,7 +25,7 @@ extern int smb2_check_message(char *buf, unsigned int length,
struct TCP_Server_Info *server);
extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
extern char *smb2_get_data_area_len(int *off, int *len,
- struct smb2_sync_hdr *shdr);
+ struct smb2_hdr *shdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
index 0215ef36e240..a9e958166fc5 100644
--- a/fs/cifs/smb2status.h
+++ b/fs/cifs/smb2status.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/smb2status.h
*
* SMB2 Status code (network error) definitions
* Definitions are from MS-ERREF
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 6f7952ea4941..2bf047b390a9 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/smb2transport.c
*
* Copyright (C) International Business Machines Corp., 2002, 2011
* Etersoft, 2012
@@ -20,7 +19,6 @@
#include <linux/mempool.h>
#include <linux/highmem.h>
#include <crypto/aead.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
@@ -214,14 +212,14 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
unsigned char *sigptr = smb2_signature;
struct kvec *iov = rqst->rq_iov;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
- ses = smb2_find_smb_ses(server, shdr->SessionId);
+ ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
if (!ses) {
cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
return 0;
@@ -535,14 +533,14 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
- rc = smb2_get_sign_key(shdr->SessionId, server, key);
+ rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
if (rc)
return 0;
@@ -612,12 +610,12 @@ static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
int rc = 0;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct smb2_sess_setup_req *ssr;
bool is_binding;
bool is_signed;
- shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ shdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
ssr = (struct smb2_sess_setup_req *)shdr;
is_binding = shdr->Command == SMB2_SESSION_SETUP &&
@@ -643,8 +641,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
unsigned int rc;
char server_response_sig[SMB2_SIGNATURE_SIZE];
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
if ((shdr->Command == SMB2_NEGOTIATE) ||
(shdr->Command == SMB2_SESSION_SETUP) ||
@@ -690,7 +688,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
*/
static inline void
smb2_seq_num_into_buf(struct TCP_Server_Info *server,
- struct smb2_sync_hdr *shdr)
+ struct smb2_hdr *shdr)
{
unsigned int i, num = le16_to_cpu(shdr->CreditCharge);
@@ -701,7 +699,7 @@ smb2_seq_num_into_buf(struct TCP_Server_Info *server,
}
static struct mid_q_entry *
-smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
+smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
@@ -733,14 +731,15 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
- trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command), temp->mid);
+ trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command), temp->mid);
return temp;
}
static int
smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
- struct smb2_sync_hdr *shdr, struct mid_q_entry **mid)
+ struct smb2_hdr *shdr, struct mid_q_entry **mid)
{
if (server->tcpStatus == CifsExiting)
return -ENOENT;
@@ -808,8 +807,8 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct smb_rqst *rqst)
{
int rc;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
smb2_seq_num_into_buf(server, shdr);
@@ -834,8 +833,8 @@ struct mid_q_entry *
smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
{
int rc;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
if (server->tcpStatus == CifsNeedNegotiate &&
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index 60189efb3236..aeffdad829e2 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1 */
/*
- * fs/cifs/smberr.h
*
* Copyright (c) International Business Machines Corp., 2002,2004
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index dafcb6ab050d..6cecf302dcfd 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -11,6 +11,8 @@
#define _CIFS_TRACE_H
#include <linux/tracepoint.h>
+#include <linux/net.h>
+#include <linux/inet.h>
/*
* Please use this 3-part article as a reference for writing new tracepoints:
@@ -854,6 +856,75 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DECLARE_EVENT_CLASS(smb3_connect_class,
+ TP_PROTO(char *hostname,
+ __u64 conn_id,
+ const struct __kernel_sockaddr_storage *dst_addr),
+ TP_ARGS(hostname, conn_id, dst_addr),
+ TP_STRUCT__entry(
+ __string(hostname, hostname)
+ __field(__u64, conn_id)
+ __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+ ),
+ TP_fast_assign(
+ struct sockaddr_storage *pss = NULL;
+
+ __entry->conn_id = conn_id;
+ pss = (struct sockaddr_storage *)__entry->dst_addr;
+ *pss = *dst_addr;
+ __assign_str(hostname, hostname);
+ ),
+ TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc",
+ __entry->conn_id,
+ __get_str(hostname),
+ __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_EVENT(name) \
+DEFINE_EVENT(smb3_connect_class, smb3_##name, \
+ TP_PROTO(char *hostname, \
+ __u64 conn_id, \
+ const struct __kernel_sockaddr_storage *addr), \
+ TP_ARGS(hostname, conn_id, addr))
+
+DEFINE_SMB3_CONNECT_EVENT(connect_done);
+
+DECLARE_EVENT_CLASS(smb3_connect_err_class,
+ TP_PROTO(char *hostname, __u64 conn_id,
+ const struct __kernel_sockaddr_storage *dst_addr, int rc),
+ TP_ARGS(hostname, conn_id, dst_addr, rc),
+ TP_STRUCT__entry(
+ __string(hostname, hostname)
+ __field(__u64, conn_id)
+ __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ struct sockaddr_storage *pss = NULL;
+
+ __entry->conn_id = conn_id;
+ __entry->rc = rc;
+ pss = (struct sockaddr_storage *)__entry->dst_addr;
+ *pss = *dst_addr;
+ __assign_str(hostname, hostname);
+ ),
+ TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc",
+ __entry->rc,
+ __entry->conn_id,
+ __get_str(hostname),
+ __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \
+ TP_PROTO(char *hostname, \
+ __u64 conn_id, \
+ const struct __kernel_sockaddr_storage *addr, \
+ int rc), \
+ TP_ARGS(hostname, conn_id, addr, rc))
+
+DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err);
+
DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_PROTO(__u64 currmid,
__u64 conn_id,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 75a95de320cf..b7379329b741 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/transport.c
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
index 59b6c577aa0a..2f075b5b50df 100644
--- a/fs/cifs/winucase.c
+++ b/fs/cifs/winucase.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * fs/cifs/winucase.c
*
* Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013
*
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 9ed481e79ce0..7d8b72d67c80 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/cifs/xattr.c
*
* Copyright (c) International Business Machines Corp., 2003, 2007
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 06855f6c7902..62a3d2565c26 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -63,9 +63,10 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
struct inode *inode;
struct coda_inode_info *cii;
unsigned long hash = coda_f2i(fid);
+ umode_t inode_type = coda_inode_type(attr);
+retry:
inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid);
-
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -75,11 +76,15 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
inode->i_ino = hash;
/* inode is locked and unique, no need to grab cii->c_lock */
cii->c_mapcount = 0;
+ coda_fill_inode(inode, attr);
unlock_new_inode(inode);
+ } else if ((inode->i_mode & S_IFMT) != inode_type) {
+ /* Inode has changed type, mark bad and grab a new one */
+ remove_inode_hash(inode);
+ coda_flag_inode(inode, C_PURGE);
+ iput(inode);
+ goto retry;
}
-
- /* always replace the attributes, type might have changed */
- coda_fill_inode(inode, attr);
return inode;
}
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2e1a5a192074..903ca8fa4b9b 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -87,28 +87,27 @@ static struct coda_timespec timespec64_to_coda(struct timespec64 ts64)
}
/* utility functions below */
+umode_t coda_inode_type(struct coda_vattr *attr)
+{
+ switch (attr->va_type) {
+ case C_VREG:
+ return S_IFREG;
+ case C_VDIR:
+ return S_IFDIR;
+ case C_VLNK:
+ return S_IFLNK;
+ case C_VNON:
+ default:
+ return 0;
+ }
+}
+
void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
{
- int inode_type;
- /* inode's i_flags, i_ino are set by iget
- XXX: is this all we need ??
- */
- switch (attr->va_type) {
- case C_VNON:
- inode_type = 0;
- break;
- case C_VREG:
- inode_type = S_IFREG;
- break;
- case C_VDIR:
- inode_type = S_IFDIR;
- break;
- case C_VLNK:
- inode_type = S_IFLNK;
- break;
- default:
- inode_type = 0;
- }
+ /* inode's i_flags, i_ino are set by iget
+ * XXX: is this all we need ??
+ */
+ umode_t inode_type = coda_inode_type(attr);
inode->i_mode |= inode_type;
if (attr->va_mode != (u_short) -1)
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index e7b27754ce78..9be281bbcc06 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -53,10 +53,11 @@ int coda_getattr(struct user_namespace *, const struct path *, struct kstat *,
u32, unsigned int);
int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *);
-/* this file: heloers */
+/* this file: helpers */
char *coda_f2s(struct CodaFid *f);
int coda_iscontrol(const char *name, size_t length);
+umode_t coda_inode_type(struct coda_vattr *attr);
void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
unsigned short coda_flags_to_cflags(unsigned short);
@@ -83,6 +84,9 @@ static __inline__ void coda_flag_inode(struct inode *inode, int flag)
{
struct coda_inode_info *cii = ITOC(inode);
+ if (!inode)
+ return;
+
spin_lock(&cii->c_lock);
cii->c_flags |= flag;
spin_unlock(&cii->c_lock);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index d69989c1bac3..328d7a684b63 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -317,13 +317,10 @@ static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
coda_dir_drop_nlink(old_dir);
coda_dir_inc_nlink(new_dir);
}
- coda_dir_update_mtime(old_dir);
- coda_dir_update_mtime(new_dir);
coda_flag_inode(d_inode(new_dentry), C_VATTR);
- } else {
- coda_flag_inode(old_dir, C_VATTR);
- coda_flag_inode(new_dir, C_VATTR);
}
+ coda_dir_update_mtime(old_dir);
+ coda_dir_update_mtime(new_dir);
}
return error;
}
@@ -499,15 +496,20 @@ out:
*/
static int coda_dentry_delete(const struct dentry * dentry)
{
- int flags;
+ struct inode *inode;
+ struct coda_inode_info *cii;
if (d_really_is_negative(dentry))
return 0;
- flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE;
- if (is_bad_inode(d_inode(dentry)) || flags) {
+ inode = d_inode(dentry);
+ if (!inode || is_bad_inode(inode))
return 1;
- }
+
+ cii = ITOC(inode);
+ if (cii->c_flags & C_PURGE)
+ return 1;
+
return 0;
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ef5ca22bfb3e..29dd87be2fb8 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -8,6 +8,7 @@
* to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
*/
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/time.h>
@@ -28,7 +29,7 @@
#include "coda_int.h"
struct coda_vm_ops {
- atomic_t refcnt;
+ refcount_t refcnt;
struct file *coda_file;
const struct vm_operations_struct *host_vm_ops;
struct vm_operations_struct vm_ops;
@@ -98,7 +99,7 @@ coda_vm_open(struct vm_area_struct *vma)
struct coda_vm_ops *cvm_ops =
container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
- atomic_inc(&cvm_ops->refcnt);
+ refcount_inc(&cvm_ops->refcnt);
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open)
cvm_ops->host_vm_ops->open(vma);
@@ -113,7 +114,7 @@ coda_vm_close(struct vm_area_struct *vma)
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close)
cvm_ops->host_vm_ops->close(vma);
- if (atomic_dec_and_test(&cvm_ops->refcnt)) {
+ if (refcount_dec_and_test(&cvm_ops->refcnt)) {
vma->vm_ops = cvm_ops->host_vm_ops;
fput(cvm_ops->coda_file);
kfree(cvm_ops);
@@ -189,7 +190,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
cvm_ops->vm_ops.open = coda_vm_open;
cvm_ops->vm_ops.close = coda_vm_close;
cvm_ops->coda_file = coda_file;
- atomic_set(&cvm_ops->refcnt, 1);
+ refcount_set(&cvm_ops->refcnt, 1);
vma->vm_ops = &cvm_ops->vm_ops;
}
@@ -238,11 +239,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct inode *host_inode;
- int err;
cfi = coda_ftoc(coda_file);
- err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
+ venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
coda_flags, coda_file->f_cred->fsuid);
host_inode = file_inode(cfi->cfi_container);
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 240669f51eac..b39580ad4ce5 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -122,14 +122,10 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
hdr.opcode, hdr.unique);
nbytes = size;
}
- dcbuf = kvmalloc(nbytes, GFP_KERNEL);
- if (!dcbuf) {
- retval = -ENOMEM;
- goto out;
- }
- if (copy_from_user(dcbuf, buf, nbytes)) {
- kvfree(dcbuf);
- retval = -EFAULT;
+
+ dcbuf = vmemdup_user(buf, nbytes);
+ if (IS_ERR(dcbuf)) {
+ retval = PTR_ERR(dcbuf);
goto out;
}
@@ -388,7 +384,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
MODULE_LICENSE("GPL");
-MODULE_VERSION("7.0");
+MODULE_VERSION("7.2");
static int __init init_coda(void)
{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index eb3b1898da46..59f6cfd06f96 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -744,7 +744,8 @@ static int coda_upcall(struct venus_comm *vcp,
list_add_tail(&req->uc_chain, &vcp->vc_pending);
wake_up_interruptible(&vcp->vc_waitq);
- if (req->uc_flags & CODA_REQ_ASYNC) {
+ /* We can return early on asynchronous requests */
+ if (outSize == NULL) {
mutex_unlock(&vcp->vc_mutex);
return 0;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 3224dee44d30..a6b3c196cdef 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -359,7 +359,7 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- if (t != current && t->mm) {
+ if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
@@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
return nr;
}
-static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+static int zap_threads(struct task_struct *tsk,
struct core_state *core_state, int exit_code)
{
- struct task_struct *g, *p;
- unsigned long flags;
int nr = -EAGAIN;
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
- mm->core_state = core_state;
+ tsk->signal->core_state = core_state;
tsk->signal->group_exit_task = tsk;
nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ tsk->flags |= PF_DUMPCORE;
+ atomic_set(&core_state->nr_threads, nr);
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (unlikely(nr < 0))
- return nr;
-
- tsk->flags |= PF_DUMPCORE;
- if (atomic_read(&mm->mm_users) == nr + 1)
- goto done;
- /*
- * We should find and kill all tasks which use this mm, and we should
- * count them correctly into ->nr_threads. We don't take tasklist
- * lock, but this is safe wrt:
- *
- * fork:
- * None of sub-threads can fork after zap_process(leader). All
- * processes which were created before this point should be
- * visible to zap_threads() because copy_process() adds the new
- * process to the tail of init_task.tasks list, and lock/unlock
- * of ->siglock provides a memory barrier.
- *
- * do_exit:
- * The caller holds mm->mmap_lock. This means that the task which
- * uses this mm can't pass exit_mm(), so it can't exit or clear
- * its ->mm.
- *
- * de_thread:
- * It does list_replace_rcu(&leader->tasks, &current->tasks),
- * we must see either old or new leader, this does not matter.
- * However, it can change p->sighand, so lock_task_sighand(p)
- * must be used. Since p->mm != NULL and we hold ->mmap_lock
- * it can't fail.
- *
- * Note also that "g" can be the old leader with ->mm == NULL
- * and already unhashed and thus removed from ->thread_group.
- * This is OK, __unhash_process()->list_del_rcu() does not
- * clear the ->next pointer, we will find the new leader via
- * next_thread().
- */
- rcu_read_lock();
- for_each_process(g) {
- if (g == tsk->group_leader)
- continue;
- if (g->flags & PF_KTHREAD)
- continue;
-
- for_each_thread(g, p) {
- if (unlikely(!p->mm))
- continue;
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code,
- SIGNAL_GROUP_EXIT);
- unlock_task_sighand(p, &flags);
- }
- break;
- }
- }
- rcu_read_unlock();
-done:
- atomic_set(&core_state->nr_threads, nr);
return nr;
}
static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
int core_waiters = -EBUSY;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
- if (!mm->core_state)
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
- mmap_write_unlock(mm);
-
+ core_waiters = zap_threads(tsk, core_state, exit_code);
if (core_waiters > 0) {
struct core_thread *ptr;
@@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
return core_waiters;
}
-static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+static void coredump_finish(bool core_dumped)
{
struct core_thread *curr, *next;
struct task_struct *task;
@@ -493,22 +428,21 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
current->signal->group_exit_code |= 0x80;
current->signal->group_exit_task = NULL;
current->signal->flags = SIGNAL_GROUP_EXIT;
+ next = current->signal->core_state->dumper.next;
+ current->signal->core_state = NULL;
spin_unlock_irq(&current->sighand->siglock);
- next = mm->core_state->dumper.next;
while ((curr = next) != NULL) {
next = curr->next;
task = curr->task;
/*
- * see exit_mm(), curr->task must not see
+ * see coredump_task_exit(), curr->task must not see
* ->task == NULL before we read ->next.
*/
smp_mb();
curr->task = NULL;
wake_up_process(task);
}
-
- mm->core_state = NULL;
}
static bool dump_interrupted(void)
@@ -839,7 +773,7 @@ fail_dropcount:
fail_unlock:
kfree(argv);
kfree(cn.corename);
- coredump_finish(mm, core_dumped);
+ coredump_finish(core_dumped);
revert_creds(old_cred);
fail_creds:
put_cred(cred);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 2be65269a987..666aa380011e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
return read_buffers[i] + blk_offset;
}
- devsize = mapping->host->i_size >> PAGE_SHIFT;
+ devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
for (i = 0; i < BLKS_PER_BUF; i++) {
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 68a2de6b5a9b..bfc2a5b74ed3 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -1,23 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This contains encryption functions for per-file encryption.
+ * Utility functions for file contents encryption/decryption on
+ * block device-based filesystems.
*
* Copyright (C) 2015, Google, Inc.
* Copyright (C) 2015, Motorola Mobility
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- * Uday Savagaonkar, 2014
- * Encryption policy handling additions
- * Ildar Muslukhov, 2014
- * Add fscrypt_pullback_bio_page()
- * Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
*/
#include <linux/pagemap.h>
@@ -26,6 +13,21 @@
#include <linux/namei.h>
#include "fscrypt_private.h"
+/**
+ * fscrypt_decrypt_bio() - decrypt the contents of a bio
+ * @bio: the bio to decrypt
+ *
+ * Decrypt the contents of a "read" bio following successful completion of the
+ * underlying disk read. The bio must be reading a whole number of blocks of an
+ * encrypted file directly into the page cache. If the bio is reading the
+ * ciphertext into bounce pages instead of the page cache (for example, because
+ * the file is also compressed, so decompression is required after decryption),
+ * then this function isn't applicable. This function may sleep, so it must be
+ * called from a workqueue rather than from the bio's bi_end_io callback.
+ *
+ * This function sets PG_error on any pages that contain any blocks that failed
+ * to be decrypted. The filesystem must not mark such pages uptodate.
+ */
void fscrypt_decrypt_bio(struct bio *bio)
{
struct bio_vec *bv;
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index eb538c28df94..a9be4bc74a94 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (fscrypt_has_encryption_key(dir)) {
if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
- iname->len,
- dir->i_sb->s_cop->max_namelen,
+ iname->len, NAME_MAX,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 3fa965eb3336..5b0a9e6478b5 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -20,6 +20,11 @@
#define FSCRYPT_FILE_NONCE_SIZE 16
+/*
+ * Minimum size of an fscrypt master key. Note: a longer key will be required
+ * if ciphers with a 256-bit security strength are used. This is just the
+ * absolute minimum, which applies when only 128-bit encryption is used.
+ */
#define FSCRYPT_MIN_KEY_SIZE 16
#define FSCRYPT_CONTEXT_V1 1
@@ -413,7 +418,11 @@ struct fscrypt_master_key_secret {
*/
struct fscrypt_hkdf hkdf;
- /* Size of the raw key in bytes. Set even if ->raw isn't set. */
+ /*
+ * Size of the raw key in bytes. This remains set even if ->raw was
+ * zeroized due to no longer being needed. I.e. we still remember the
+ * size of the key even if we don't need to remember the key itself.
+ */
u32 size;
/* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
@@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void);
struct fscrypt_mode {
const char *friendly_name;
const char *cipher_str;
- int keysize;
- int ivsize;
+ int keysize; /* key size in bytes */
+ int security_strength; /* security strength in bytes */
+ int ivsize; /* IV size in bytes */
int logged_impl_name;
enum blk_crypto_mode_num blk_crypto_mode;
};
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index e0ec21055505..7607d18b35fc 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -16,9 +16,14 @@
/*
* HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
- * SHA-512 because it is reasonably secure and efficient; and since it produces
- * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
- * entropy from the master key and requires only one iteration of HKDF-Expand.
+ * SHA-512 because it is well-established, secure, and reasonably efficient.
+ *
+ * HKDF-SHA256 was also considered, as its 256-bit security strength would be
+ * sufficient here. A 512-bit security strength is "nice to have", though.
+ * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the
+ * common case of deriving an AES-256-XTS key (512 bits), that can result in
+ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
+ * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
*/
#define HKDF_HMAC_ALG "hmac(sha512)"
#define HKDF_HASHLEN SHA512_DIGEST_SIZE
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index bca9c6658a7c..eede186b04ce 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .security_strength = 32,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
},
@@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 16,
},
[FSCRYPT_MODE_AES_128_CBC] = {
.friendly_name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
},
@@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-128-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
},
[FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 32,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
@@ -117,8 +122,9 @@ err_free_tfm:
/*
* Prepare the crypto transform object or blk-crypto key in @prep_key, given the
- * raw key, encryption mode, and flag indicating which encryption implementation
- * (fs-layer or blk-crypto) will be used.
+ * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption
+ * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt),
+ * and IV generation method (@ci->ci_policy.flags).
*/
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_info *ci)
@@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
}
/*
+ * Check whether the size of the given master key (@mk) is appropriate for the
+ * encryption settings which a particular file will use (@ci).
+ *
+ * If the file uses a v1 encryption policy, then the master key must be at least
+ * as long as the derived key, as this is a requirement of the v1 KDF.
+ *
+ * Otherwise, the KDF can accept any size key, so we enforce a slightly looser
+ * requirement: we require that the size of the master key be at least the
+ * maximum security strength of any algorithm whose key will be derived from it
+ * (but in practice we only need to consider @ci->ci_mode, since any other
+ * possible subkeys such as DIRHASH and INODE_HASH will never increase the
+ * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be
+ * derived from a 256-bit master key, which is cryptographically sufficient,
+ * rather than requiring a 512-bit master key which is unnecessarily long. (We
+ * still allow 512-bit master keys if the user chooses to use them, though.)
+ */
+static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
+ const struct fscrypt_info *ci)
+{
+ unsigned int min_keysize;
+
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V1)
+ min_keysize = ci->ci_mode->keysize;
+ else
+ min_keysize = ci->ci_mode->security_strength;
+
+ if (mk->mk_secret.size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+ master_key_spec_type(&mk->mk_spec),
+ master_key_spec_len(&mk->mk_spec),
+ (u8 *)&mk->mk_spec.u,
+ mk->mk_secret.size, min_keysize);
+ return false;
+ }
+ return true;
+}
+
+/*
* Find the master key, then set up the inode's actual encryption key.
*
* If the master key is found in the filesystem-level keyring, then the
@@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
goto out_release_key;
}
- /*
- * Require that the master key be at least as long as the derived key.
- * Otherwise, the derived key cannot possibly contain as much entropy as
- * that required by the encryption mode it will be used for. For v1
- * policies it's also required for the KDF to work at all.
- */
- if (mk->mk_secret.size < ci->ci_mode->keysize) {
- fscrypt_warn(NULL,
- "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
- master_key_spec_type(&mk_spec),
- master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
- mk->mk_secret.size, ci->ci_mode->keysize);
+ if (!fscrypt_valid_master_key_size(mk, ci)) {
err = -ENOKEY;
goto out_release_key;
}
diff --git a/fs/d_path.c b/fs/d_path.c
index cd60c7535181..e4e0ebad1f15 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
/**
* prepend_name - prepend a pathname in front of current buffer pointer
- * @buffer: buffer pointer
- * @buflen: allocated length of the buffer
- * @name: name string and length qstr structure
+ * @p: prepend buffer which contains buffer pointer and allocated length
+ * @name: name string and length qstr structure
*
* With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
* make sure that either the old or the new name pointer and length are
@@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
* prepend_path - Prepend path string to a buffer
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry
- * @buffer: pointer to the end of the buffer
- * @buflen: pointer to buffer length
+ * @p: prepend buffer which contains buffer pointer and allocated length
*
* The function will first try to write out the pathname without taking any
* lock other than the RCU read lock to make sure that dentries won't go away.
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 8129a430d789..2f117c57160d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -528,7 +528,7 @@ void debugfs_create_file_size(const char *name, umode_t mode,
{
struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
- if (de)
+ if (!IS_ERR(de))
d_inode(de)->i_size = file_size;
}
EXPORT_SYMBOL_GPL(debugfs_create_file_size);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b2e86e739d7a..654443558047 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -119,7 +119,6 @@ struct dio {
int flags; /* doesn't change */
int op;
int op_flags;
- blk_qc_t bio_cookie;
struct gendisk *bio_disk;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
@@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
if (ret > 0 && dio->op == REQ_OP_WRITE)
ret = generic_write_sync(dio->iocb, ret);
- dio->iocb->ki_complete(dio->iocb, ret, 0);
+ dio->iocb->ki_complete(dio->iocb, ret);
}
kmem_cache_free(dio_cache, dio);
@@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->bio_disk = bio->bi_bdev->bd_disk;
- if (sdio->submit_io) {
+ if (sdio->submit_io)
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
- dio->bio_cookie = BLK_QC_T_NONE;
- } else
- dio->bio_cookie = submit_bio(bio);
+ else
+ submit_bio(bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
- blk_io_schedule();
+ blk_io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else {
dio->op = REQ_OP_READ;
}
- if (iocb->ki_flags & IOCB_HIPRI)
- dio->op_flags |= REQ_HIPRI;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 14b747026742..f57255ab88ed 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -6,16 +6,22 @@ config EROFS_FS
select FS_IOMAP
select LIBCRC32C
help
- EROFS (Enhanced Read-Only File System) is a lightweight
- read-only file system with modern designs (eg. page-sized
- blocks, inline xattrs/data, etc.) for scenarios which need
- high-performance read-only requirements, e.g. Android OS
- for mobile phones and LIVECDs.
+ EROFS (Enhanced Read-Only File System) is a lightweight read-only
+ file system with modern designs (e.g. no buffer heads, inline
+ xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+ scenarios which need high-performance read-only solutions, e.g.
+ smartphones with Android OS, LiveCDs and high-density hosts with
+ numerous containers;
- It also provides fixed-sized output compression support,
- which improves storage density, keeps relatively higher
- compression ratios, which is more useful to achieve high
- performance for embedded devices with limited memory.
+ It also provides fixed-sized output compression support in order to
+ improve storage density as well as keep relatively higher compression
+ ratios and implements in-place decompression to reuse the file page
+ for compressed data temporarily with proper strategies, which is
+ quite useful to ensure guaranteed end-to-end runtime decompression
+ performance under extremely memory pressure without extra cost.
+
+ See the documentation at <file:Documentation/filesystems/erofs.rst>
+ for more details.
If unsure, say N.
@@ -76,3 +82,19 @@ config EROFS_FS_ZIP
Enable fixed-sized output compression for EROFS.
If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+ bool "EROFS LZMA compressed data support"
+ depends on EROFS_FS_ZIP
+ select XZ_DEC
+ select XZ_DEC_MICROLZMA
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing LZMA compressed data, specifically called microLZMA. it
+ gives better compression ratios than the LZ4 algorithm, at the
+ expense of more CPU overhead.
+
+ LZMA support is an experimental feature for now and so most file
+ systems will be readable without selecting this option.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 1f9aced49070..756fe2d65272 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 3701c72bacb2..579406504919 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -8,11 +8,6 @@
#include "internal.h"
-enum {
- Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
- Z_EROFS_COMPRESSION_RUNTIME_MAX
-};
-
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
@@ -25,6 +20,12 @@ struct z_erofs_decompress_req {
bool inplace_io, partial_decoding;
};
+struct z_erofs_decompressor {
+ int (*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+ char *name;
+};
+
/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
@@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page)
return true;
}
-static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
struct page *page)
{
if (!z_erofs_is_shortlived_page(page))
@@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
put_page(page);
} else {
/* follow the pcluster rule above. */
- set_page_private(page, 0);
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
}
return true;
}
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+ struct page *page)
+{
+ return page->mapping == MNGD_MAPPING(sbi);
+}
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
+ struct page **pagepool);
+/* prototypes for specific algorithms */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 9db829715652..0e35ef3f9f3d 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode,
erofs_off_t pos;
int err = 0;
+ map->m_deviceid = 0;
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
@@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode,
map->m_flags = 0;
break;
default:
- /* only one device is supported for now */
- if (idx->device_id) {
- erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
- le16_to_cpu(idx->device_id),
- chunknr, vi->nid);
- err = -EFSCORRUPTED;
- goto out_unlock;
- }
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
break;
@@ -155,11 +150,55 @@ out:
return err;
}
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+ struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+ struct erofs_device_info *dif;
+ int id;
+
+ /* primary device by default */
+ map->m_bdev = sb->s_bdev;
+ map->m_daxdev = EROFS_SB(sb)->dax_dev;
+
+ if (map->m_deviceid) {
+ down_read(&devs->rwsem);
+ dif = idr_find(&devs->tree, map->m_deviceid - 1);
+ if (!dif) {
+ up_read(&devs->rwsem);
+ return -ENODEV;
+ }
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ up_read(&devs->rwsem);
+ } else if (devs->extra_devices) {
+ down_read(&devs->rwsem);
+ idr_for_each_entry(&devs->tree, dif, id) {
+ erofs_off_t startoff, length;
+
+ if (!dif->mapped_blkaddr)
+ continue;
+ startoff = blknr_to_addr(dif->mapped_blkaddr);
+ length = blknr_to_addr(dif->blocks);
+
+ if (map->m_pa >= startoff &&
+ map->m_pa < startoff + length) {
+ map->m_pa -= startoff;
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ break;
+ }
+ }
+ up_read(&devs->rwsem);
+ }
+ return 0;
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
int ret;
struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
map.m_la = offset;
map.m_llen = length;
@@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(inode->i_sb, &mdev);
+ if (ret)
+ return ret;
+
+ iomap->bdev = mdev.m_bdev;
+ iomap->dax_dev = mdev.m_daxdev;
iomap->offset = map.m_la;
iomap->length = map.m_llen;
iomap->flags = 0;
@@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->type = IOMAP_INLINE;
ipage = erofs_get_meta_page(inode->i_sb,
- erofs_blknr(map.m_pa));
+ erofs_blknr(mdev.m_pa));
if (IS_ERR(ipage))
return PTR_ERR(ipage);
iomap->inline_data = page_address(ipage) +
- erofs_blkoff(map.m_pa);
+ erofs_blkoff(mdev.m_pa);
iomap->private = ipage;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_pa;
+ iomap->addr = mdev.m_pa;
}
return 0;
}
@@ -287,7 +334,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!err)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
- NULL, 0);
+ NULL, 0, 0);
if (err < 0)
return err;
}
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index a5bc4b1b7813..bf37fc76b182 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -16,17 +16,6 @@
#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
#endif
-struct z_erofs_decompressor {
- /*
- * if destpages have sparsed pages, fill them with bounce pages.
- * it also check whether destpages indicate continuous physical memory.
- */
- int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
- int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
- char *name;
-};
-
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int size)
@@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb,
return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
}
-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check if
+ * all physical pages are consecutive, which can be seen for moderate CR.
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nr =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
}
-static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
void *inpage, unsigned int *inputmargin, int *maptype,
bool support_0padding)
{
@@ -189,7 +182,8 @@ docopy:
return src;
}
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
+ u8 *out)
{
unsigned int inputmargin;
u8 *headpage, *src;
@@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
}
rq->inputsize -= inputmargin;
- src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
- support_0padding);
+ src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin,
+ &maptype, support_0padding);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -233,7 +227,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
ret, rq->inputsize, inputmargin, rq->outputsize);
- WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
16, 1, src + inputmargin, rq->inputsize, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
@@ -242,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
if (ret >= 0)
memset(out + ret, 0, rq->outputsize - ret);
ret = -EIO;
+ } else {
+ ret = 0;
}
if (maptype == 0) {
@@ -257,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
return ret;
}
-static struct z_erofs_decompressor decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
- .name = "shifted"
- },
- [Z_EROFS_COMPRESSION_LZ4] = {
- .prepare_destpages = z_erofs_lz4_prepare_destpages,
- .decompress = z_erofs_lz4_decompress,
- .name = "lz4"
- },
-};
-
-static void copy_from_pcpubuf(struct page **out, const char *dst,
- unsigned short pageofs_out,
- unsigned int outputsize)
-{
- const char *end = dst + outputsize;
- const unsigned int righthalf = PAGE_SIZE - pageofs_out;
- const char *cur = dst - pageofs_out;
-
- while (cur < end) {
- struct page *const page = *out++;
-
- if (page) {
- char *buf = kmap_atomic(page);
-
- if (cur >= dst) {
- memcpy(buf, cur, min_t(uint, PAGE_SIZE,
- end - cur));
- } else {
- memcpy(buf + pageofs_out, cur + pageofs_out,
- min_t(uint, righthalf, end - cur));
- }
- kunmap_atomic(buf);
- }
- cur += PAGE_SIZE;
- }
-}
-
-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const struct z_erofs_decompressor *alg = decompressors + rq->alg;
unsigned int dst_maptype;
void *dst;
int ret;
- /* two optimized fast paths only for non bigpcluster cases yet */
- if (rq->inputsize <= PAGE_SIZE) {
- if (nrpages_out == 1 && !rq->inplace_io) {
- DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
- dst_maptype = 0;
- goto dstmap_out;
- }
-
- /*
- * For the case of small output size (especially much less
- * than PAGE_SIZE), memcpy the decompressed data rather than
- * compressed data is preferred.
- */
- if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
- dst = erofs_get_pcpubuf(1);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- rq->inplace_io = false;
- ret = alg->decompress(rq, dst);
- if (!ret)
- copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
- rq->outputsize);
-
- erofs_put_pcpubuf(dst);
- return ret;
- }
+ /* one optimized fast path only for non bigpcluster cases yet */
+ if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_atomic(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
}
/* general decoding path which can be used for all cases */
- ret = alg->prepare_destpages(rq, pagepool);
+ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
if (ret < 0)
return ret;
if (ret) {
@@ -351,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
dst_maptype = 2;
dstmap_out:
- ret = alg->decompress(rq, dst + rq->pageofs_out);
+ ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out);
if (!dst_maptype)
kunmap_atomic(dst);
@@ -360,8 +294,8 @@ dstmap_out:
return ret;
}
-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -399,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
return 0;
}
+static struct z_erofs_decompressor decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .decompress = z_erofs_shifted_transform,
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+ [Z_EROFS_COMPRESSION_LZMA] = {
+ .decompress = z_erofs_lzma_decompress,
+ .name = "lzma"
+ },
+#endif
+};
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+ struct page **pagepool)
{
- if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
- return z_erofs_shifted_transform(rq, pagepool);
- return z_erofs_decompress_generic(rq, pagepool);
+ return decompressors[rq->alg].decompress(rq, pagepool);
}
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644
index 000000000000..50045510a1f4
--- /dev/null
+++ b/fs/erofs/decompressor_lzma.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+struct z_erofs_lzma {
+ struct z_erofs_lzma *next;
+ struct xz_dec_microlzma *state;
+ struct xz_buf buf;
+ u8 bounce[PAGE_SIZE];
+};
+
+/* considering the LZMA performance, no need to use a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+void z_erofs_lzma_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_lzma_avail_strms) {
+ struct z_erofs_lzma *strm;
+
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ DBG_BUGON(1);
+ return;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ while (strm) {
+ struct z_erofs_lzma *n = strm->next;
+
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ kfree(strm);
+ --z_erofs_lzma_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int z_erofs_lzma_init(void)
+{
+ unsigned int i;
+
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_lzma_nstrms)
+ z_erofs_lzma_nstrms = num_possible_cpus();
+
+ for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
+ struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+
+ if (!strm) {
+ z_erofs_lzma_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ ++z_erofs_lzma_avail_strms;
+ }
+ return 0;
+}
+
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size)
+{
+ static DEFINE_MUTEX(lzma_resize_mutex);
+ unsigned int dict_size, i;
+ struct z_erofs_lzma *strm, *head = NULL;
+ int err;
+
+ if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+ erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ if (lzma->format) {
+ erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+ le16_to_cpu(lzma->format));
+ return -EINVAL;
+ }
+ dict_size = le32_to_cpu(lzma->dict_size);
+ if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+ erofs_err(sb, "unsupported lzma dictionary size %u",
+ dict_size);
+ return -EINVAL;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+ /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */
+ mutex_lock(&lzma_resize_mutex);
+
+ if (z_erofs_lzma_max_dictsize >= dict_size) {
+ mutex_unlock(&lzma_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+ struct z_erofs_lzma *last;
+
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq,
+ READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ for (last = strm; last->next; last = last->next)
+ ++i;
+ last->next = head;
+ head = strm;
+ }
+
+ err = 0;
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ for (strm = head; strm; strm = strm->next) {
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+ if (!strm->state)
+ err = -ENOMEM;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_lzma_lock);
+ DBG_BUGON(z_erofs_lzma_head);
+ z_erofs_lzma_head = head;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ z_erofs_lzma_max_dictsize = dict_size;
+ mutex_unlock(&lzma_resize_mutex);
+ return err;
+}
+
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ unsigned int inputmargin, inlen, outlen, pageofs;
+ struct z_erofs_lzma *strm;
+ u8 *kin;
+ bool bounced = false;
+ int no, ni, j, err = 0;
+
+ /* 1. get the exact LZMA compressed size */
+ kin = kmap(*rq->in);
+ inputmargin = 0;
+ while (!kin[inputmargin & ~PAGE_MASK])
+ if (!(++inputmargin & ~PAGE_MASK))
+ break;
+
+ if (inputmargin >= PAGE_SIZE) {
+ kunmap(*rq->in);
+ return -EFSCORRUPTED;
+ }
+ rq->inputsize -= inputmargin;
+
+ /* 2. get an available lzma context */
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = strm->next;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ /* 3. multi-call decompress */
+ inlen = rq->inputsize;
+ outlen = rq->outputsize;
+ xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ !rq->partial_decoding);
+ pageofs = rq->pageofs_out;
+ strm->buf.in = kin + inputmargin;
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin);
+ inlen -= strm->buf.in_size;
+ strm->buf.out = NULL;
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = 0;
+
+ for (ni = 0, no = -1;;) {
+ enum xz_ret xz_err;
+
+ if (strm->buf.out_pos == strm->buf.out_size) {
+ if (strm->buf.out) {
+ kunmap(rq->out[no]);
+ strm->buf.out = NULL;
+ }
+
+ if (++no >= nrpages_out || !outlen) {
+ erofs_err(rq->sb, "decompressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = min_t(u32, outlen,
+ PAGE_SIZE - pageofs);
+ outlen -= strm->buf.out_size;
+ if (rq->out[no])
+ strm->buf.out = kmap(rq->out[no]) + pageofs;
+ pageofs = 0;
+ } else if (strm->buf.in_pos == strm->buf.in_size) {
+ kunmap(rq->in[ni]);
+
+ if (++ni >= nrpages_in || !inlen) {
+ erofs_err(rq->sb, "compressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+ inlen -= strm->buf.in_size;
+ kin = kmap(rq->in[ni]);
+ strm->buf.in = kin;
+ bounced = false;
+ }
+
+ /*
+ * Handle overlapping: Use bounced buffer if the compressed
+ * data is under processing; Otherwise, Use short-lived pages
+ * from the on-stack pagepool where pages share with the same
+ * request.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+ strm->buf.in = strm->bounce;
+ bounced = true;
+ }
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+ DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+ DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+ if (xz_err != XZ_OK) {
+ if (xz_err == XZ_STREAM_END && !outlen)
+ break;
+ erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ xz_err, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+ if (no < nrpages_out && strm->buf.out)
+ kunmap(rq->in[no]);
+ if (ni < nrpages_in)
+ kunmap(rq->in[ni]);
+ /* 4. push back LZMA stream context to the global list */
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ wake_up(&z_erofs_lzma_wq);
+ return err;
+}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b0b23f41abc3..083997a034e5 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -21,14 +21,29 @@
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
#define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2)
#define EROFS_SB_EXTSLOT_SIZE 16
+struct erofs_deviceslot {
+ union {
+ u8 uuid[16]; /* used for device manager later */
+ u8 userdata[64]; /* digest(sha256), etc. */
+ } u;
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 reserved[56];
+};
+#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
+
/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
@@ -54,7 +69,9 @@ struct erofs_super_block {
/* customized sliding window size instead of 64k by default */
__le16 lz4_max_distance;
} __packed u1;
- __u8 reserved2[42];
+ __le16 extra_devices; /* # of devices besides the primary device */
+ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
+ __u8 reserved2[38];
};
/*
@@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 8-byte inode chunk indexes */
struct erofs_inode_chunk_index {
__le16 advise; /* always 0, don't care for now */
- __le16 device_id; /* back-end storage id, always 0 for now */
+ __le16 device_id; /* back-end storage id (with bits masked) */
__le32 blkaddr; /* start block address of this inode chunk */
};
@@ -247,10 +264,11 @@ struct erofs_inode_chunk_index {
/* available compression algorithm types (for h_algorithmtype) */
enum {
- Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_MAX
};
-#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
/* 14 bytes (+ length field = 16 bytes) */
struct z_erofs_lz4_cfgs {
@@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs {
u8 reserved[10];
} __packed;
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+ __le32 dict_size;
+ __le16 format;
+ u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
@@ -288,35 +315,34 @@ struct z_erofs_map_header {
#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
/*
- * Fixed-sized output compression ondisk Logical Extent cluster type:
- * 0 - literal (uncompressed) cluster
- * 1 - compressed cluster (for the head logical cluster)
- * 2 - compressed cluster (for the other logical clusters)
+ * Fixed-sized output compression on-disk logical cluster type:
+ * 0 - literal (uncompressed) lcluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
*
* In detail,
- * 0 - literal (uncompressed) cluster,
+ * 0 - literal (uncompressed) lcluster,
* di_advise = 0
- * di_clusterofs = the literal data offset of the cluster
- * di_blkaddr = the blkaddr of the literal cluster
+ * di_clusterofs = the literal data offset of the lcluster
+ * di_blkaddr = the blkaddr of the literal pcluster
*
- * 1 - compressed cluster (for the head logical cluster)
- * di_advise = 1
- * di_clusterofs = the decompressed data offset of the cluster
- * di_blkaddr = the blkaddr of the compressed cluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * di_advise = 1 or 3
+ * di_clusterofs = the decompressed data offset of the lcluster
+ * di_blkaddr = the blkaddr of the compressed pcluster
*
- * 2 - compressed cluster (for the other logical clusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
* di_advise = 2
* di_clusterofs =
- * the decompressed data offset in its own head cluster
- * di_u.delta[0] = distance to its corresponding head cluster
- * di_u.delta[1] = distance to its corresponding tail cluster
- * (di_advise could be 0, 1 or 2)
+ * the decompressed data offset in its own HEAD lcluster
+ * di_u.delta[0] = distance to this HEAD lcluster
+ * di_u.delta[1] = distance to the next HEAD lcluster
*/
enum {
Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
- Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1,
Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
- Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3,
Z_EROFS_VLE_CLUSTER_TYPE_MAX
};
@@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
/* keep in sync between 2 index structures for better extendibility */
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
sizeof(struct z_erofs_vle_decompressed_index));
+ BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 31ac3a73b390..2345f1de438e 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -176,7 +176,7 @@ static struct page *erofs_read_inode(struct inode *inode,
}
if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
- if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_ALL)) {
+ if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
erofs_err(inode->i_sb,
"unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
@@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_flags &= ~S_DAX;
- if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
vi->datalayout == EROFS_INODE_FLAT_PLAIN)
inode->i_flags |= S_DAX;
if (!nblks)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 9524e155b38f..3265688af7f9 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -47,7 +47,16 @@ typedef u64 erofs_off_t;
/* data type for filesystem-wide blocks number */
typedef u32 erofs_blk_t;
-struct erofs_fs_context {
+struct erofs_device_info {
+ char *path;
+ struct block_device *bdev;
+ struct dax_device *dax_dev;
+
+ u32 blocks;
+ u32 mapped_blkaddr;
+};
+
+struct erofs_mount_opts {
#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
@@ -60,6 +69,18 @@ struct erofs_fs_context {
unsigned int mount_opt;
};
+struct erofs_dev_context {
+ struct idr tree;
+ struct rw_semaphore rwsem;
+
+ unsigned int extra_devices;
+};
+
+struct erofs_fs_context {
+ struct erofs_mount_opts opt;
+ struct erofs_dev_context *devs;
+};
+
/* all filesystem-wide lz4 configurations */
struct erofs_sb_lz4_info {
/* # of pages needed for EROFS lz4 rolling decompression */
@@ -69,6 +90,7 @@ struct erofs_sb_lz4_info {
};
struct erofs_sb_info {
+ struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
struct list_head list;
@@ -85,12 +107,16 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
+ struct erofs_dev_context *devs;
struct dax_device *dax_dev;
- u32 blocks;
+ u64 total_blocks;
+ u32 primarydevice_blocks;
+
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
u32 xattr_blkaddr;
#endif
+ u16 device_id_mask; /* valid bits of device id to be used */
/* inode slot unit size in bit shift */
unsigned char islotbits;
@@ -108,8 +134,6 @@ struct erofs_sb_info {
u8 volume_name[16]; /* volume name */
u32 feature_compat;
u32 feature_incompat;
-
- struct erofs_fs_context ctx; /* options */
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -121,9 +145,9 @@ struct erofs_sb_info {
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
-#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
-#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option)
-#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option)
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
enum {
EROFS_ZIP_CACHE_DISABLED,
@@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
/* atomic flag definitions */
@@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
EROFS_I_DATALAYOUT_BITS);
}
+/*
+ * Different from grab_cache_page_nowait(), reclaiming is never triggered
+ * when allocating new pages.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+}
+
extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
@@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops;
* of the corresponding uncompressed data in the file.
*/
enum {
- BH_Zipped = BH_PrivateStart,
+ BH_Encoded = BH_PrivateStart,
BH_FullMapped,
};
@@ -346,8 +384,8 @@ enum {
#define EROFS_MAP_MAPPED (1 << BH_Mapped)
/* Located in metadata (could be copied from bd_inode) */
#define EROFS_MAP_META (1 << BH_Meta)
-/* The extent has been compressed */
-#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED (1 << BH_Encoded)
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
@@ -355,6 +393,8 @@ struct erofs_map_blocks {
erofs_off_t m_pa, m_la;
u64 m_plen, m_llen;
+ unsigned short m_deviceid;
+ char m_algorithmformat;
unsigned int m_flags;
struct page *mpage;
@@ -367,6 +407,13 @@ struct erofs_map_blocks {
* approach instead if possible since it's more metadata lightweight.)
*/
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE 0x0004
+
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
/* zmap.c */
extern const struct iomap_ops z_erofs_iomap_report_ops;
@@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+struct erofs_map_dev {
+ struct block_device *m_bdev;
+ struct dax_device *m_daxdev;
+
+ erofs_off_t m_pa;
+ unsigned int m_deviceid;
+};
+
/* data.c */
extern const struct file_operations erofs_file_fops;
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool,
+ struct page *page)
+{
+ set_page_private(page, (unsigned long)*pagepool);
+ *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+static inline int z_erofs_lzma_exit(void) { return 0; }
+static inline int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size) {
+ if (lzma) {
+ erofs_err(sb, "lzma algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
index 6c885575128a..a2efd833d1b6 100644
--- a/fs/erofs/pcpubuf.c
+++ b/fs/erofs/pcpubuf.c
@@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
{
static DEFINE_MUTEX(pcb_resize_mutex);
static unsigned int pcb_nrpages;
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
int delta, cpu, ret, i;
mutex_lock(&pcb_resize_mutex);
@@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
vunmap(old_ptr);
free_pagearray:
while (i)
- list_add(&oldpages[--i]->lru, &pagepool);
+ erofs_pagepool_add(&pagepool, oldpages[--i]);
kfree(oldpages);
if (ret)
break;
}
pcb_nrpages = nrpages;
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
out:
mutex_unlock(&pcb_resize_mutex);
return ret;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 11b88559f8bf..6a969b1e0ee6 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
case Z_EROFS_COMPRESSION_LZ4:
ret = z_erofs_load_lz4_config(sb, dsb, data, size);
break;
+ case Z_EROFS_COMPRESSION_LZMA:
+ ret = z_erofs_load_lzma_config(sb, dsb, data, size);
+ break;
default:
DBG_BUGON(1);
ret = -EFAULT;
@@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
#endif
+static int erofs_init_devices(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int ondisk_extradevs;
+ erofs_off_t pos;
+ struct page *page = NULL;
+ struct erofs_device_info *dif;
+ struct erofs_deviceslot *dis;
+ void *ptr;
+ int id, err = 0;
+
+ sbi->total_blocks = sbi->primarydevice_blocks;
+ if (!erofs_sb_has_device_table(sbi))
+ ondisk_extradevs = 0;
+ else
+ ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+ if (ondisk_extradevs != sbi->devs->extra_devices) {
+ erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+ ondisk_extradevs, sbi->devs->extra_devices);
+ return -EINVAL;
+ }
+ if (!ondisk_extradevs)
+ return 0;
+
+ sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+ pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+ down_read(&sbi->devs->rwsem);
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ erofs_blk_t blk = erofs_blknr(pos);
+ struct block_device *bdev;
+
+ if (!page || page->index != blk) {
+ if (page) {
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ page = erofs_get_meta_page(sb, blk);
+ if (IS_ERR(page)) {
+ up_read(&sbi->devs->rwsem);
+ return PTR_ERR(page);
+ }
+ ptr = kmap(page);
+ }
+ dis = ptr + erofs_blkoff(pos);
+
+ bdev = blkdev_get_by_path(dif->path,
+ FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev)) {
+ err = PTR_ERR(bdev);
+ goto err_out;
+ }
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev);
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ pos += EROFS_DEVT_SLOT_SIZE;
+ }
+err_out:
+ up_read(&sbi->devs->rwsem);
+ if (page) {
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ return err;
+}
+
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
@@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->blocks = le32_to_cpu(dsb->blocks);
+ sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb)
ret = erofs_load_compr_cfgs(sb, dsb);
else
ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ if (ret < 0)
+ goto out;
+
+ /* handle multiple devices */
+ ret = erofs_init_devices(sb, dsb);
out:
kunmap(page);
put_page(page);
@@ -340,15 +421,15 @@ out:
static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- ctx->max_sync_decompress_pages = 3;
- ctx->readahead_sync_decompress = false;
+ ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ ctx->opt.max_sync_decompress_pages = 3;
+ ctx->opt.readahead_sync_decompress = false;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
#endif
}
@@ -358,6 +439,7 @@ enum {
Opt_cache_strategy,
Opt_dax,
Opt_dax_enum,
+ Opt_device,
Opt_err
};
@@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
erofs_param_cache_strategy),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
+ fsparam_string("device", Opt_device),
{}
};
@@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(ctx, DAX_ALWAYS);
- clear_opt(ctx, DAX_NEVER);
+ set_opt(&ctx->opt, DAX_ALWAYS);
+ clear_opt(&ctx->opt, DAX_NEVER);
return true;
case EROFS_MOUNT_DAX_NEVER:
- set_opt(ctx, DAX_NEVER);
- clear_opt(ctx, DAX_ALWAYS);
+ set_opt(&ctx->opt, DAX_NEVER);
+ clear_opt(&ctx->opt, DAX_ALWAYS);
return true;
default:
DBG_BUGON(1);
@@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
- struct erofs_fs_context *ctx __maybe_unused = fc->fs_private;
+ struct erofs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ struct erofs_device_info *dif;
+ int opt, ret;
opt = fs_parse(fc, erofs_fs_parameters, param, &result);
if (opt < 0)
@@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
if (result.boolean)
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
else
- clear_opt(ctx, XATTR_USER);
+ clear_opt(&ctx->opt, XATTR_USER);
#else
errorfc(fc, "{,no}user_xattr options not supported");
#endif
@@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
if (result.boolean)
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
else
- clear_opt(ctx, POSIX_ACL);
+ clear_opt(&ctx->opt, POSIX_ACL);
#else
errorfc(fc, "{,no}acl options not supported");
#endif
break;
case Opt_cache_strategy:
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = result.uint_32;
+ ctx->opt.cache_strategy = result.uint_32;
#else
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
@@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
if (!erofs_fc_set_dax_mode(fc, result.uint_32))
return -EINVAL;
break;
+ case Opt_device:
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif)
+ return -ENOMEM;
+ dif->path = kstrdup(param->string, GFP_KERNEL);
+ if (!dif->path) {
+ kfree(dif);
+ return -ENOMEM;
+ }
+ down_write(&ctx->devs->rwsem);
+ ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&ctx->devs->rwsem);
+ if (ret < 0) {
+ kfree(dif->path);
+ kfree(dif);
+ return ret;
+ }
+ ++ctx->devs->extra_devices;
+ break;
default:
return -ENOPARAM;
}
@@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sbi->opt = ctx->opt;
sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+ sbi->devs = ctx->devs;
+ ctx->devs = NULL;
+
err = erofs_read_superblock(sb);
if (err)
return err;
- if (test_opt(ctx, DAX_ALWAYS) &&
+ if (test_opt(&sbi->opt, DAX_ALWAYS) &&
!dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(ctx, DAX_ALWAYS);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
}
sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &erofs_sops;
sb->s_xattr = erofs_xattr_handlers;
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
else
sb->s_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
-
#ifdef CONFIG_EROFS_FS_ZIP
xa_init(&sbi->managed_pslots);
#endif
@@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
DBG_BUGON(!sb_rdonly(sb));
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&ctx->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
fc->sb_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
+ sbi->opt = ctx->opt;
fc->sb_flags |= SB_RDONLY;
return 0;
}
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+ struct erofs_device_info *dif = ptr;
+
+ fs_put_dax(dif->dax_dev);
+ if (dif->bdev)
+ blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ kfree(dif->path);
+ kfree(dif);
+ return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+ if (!devs)
+ return;
+ idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+ idr_destroy(&devs->tree);
+ kfree(devs);
+}
+
static void erofs_fc_free(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ erofs_free_dev_context(ctx->devs);
+ kfree(ctx);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = {
static int erofs_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL);
- if (!fc->fs_private)
- return -ENOMEM;
+ struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- /* set default mount options */
- erofs_default_options(fc->fs_private);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!ctx->devs) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ fc->fs_private = ctx;
+ idr_init(&ctx->devs->tree);
+ init_rwsem(&ctx->devs->rwsem);
+ erofs_default_options(ctx);
fc->ops = &erofs_context_ops;
-
return 0;
}
@@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb)
sbi = EROFS_SB(sb);
if (!sbi)
return;
+
+ erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
kfree(sbi);
sb->s_fs_info = NULL;
@@ -706,6 +843,10 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
+ err = z_erofs_lzma_init();
+ if (err)
+ goto lzma_err;
+
erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
@@ -720,6 +861,8 @@ static int __init erofs_module_init(void)
fs_err:
z_erofs_exit_zip_subsystem();
zip_err:
+ z_erofs_lzma_exit();
+lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -730,11 +873,13 @@ icache_err:
static void __exit erofs_module_exit(void)
{
unregister_filesystem(&erofs_fs_type);
- z_erofs_exit_zip_subsystem();
- erofs_exit_shrinker();
- /* Ensure all RCU free inodes are safe before cache is destroyed. */
+ /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+
+ z_erofs_exit_zip_subsystem();
+ z_erofs_lzma_exit();
+ erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
erofs_pcpubuf_exit();
}
@@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
- buf->f_blocks = sbi->blocks;
+ buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
buf->f_files = ULLONG_MAX;
@@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int erofs_show_options(struct seq_file *seq, struct dentry *root)
{
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
- struct erofs_fs_context *ctx = &sbi->ctx;
+ struct erofs_mount_opts *opt = &sbi->opt;
#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(ctx, XATTR_USER))
+ if (test_opt(opt, XATTR_USER))
seq_puts(seq, ",user_xattr");
else
seq_puts(seq, ",nouser_xattr");
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(opt, POSIX_ACL))
seq_puts(seq, ",acl");
else
seq_puts(seq, ",noacl");
#endif
#ifdef CONFIG_EROFS_FS_ZIP
- if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+ if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
seq_puts(seq, ",cache_strategy=disabled");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
seq_puts(seq, ",cache_strategy=readahead");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
seq_puts(seq, ",cache_strategy=readaround");
#endif
- if (test_opt(ctx, DAX_ALWAYS))
+ if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
- if (test_opt(ctx, DAX_NEVER))
+ if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
return 0;
}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index bd86067a63f7..84da2c280012 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -6,20 +6,29 @@
#include "internal.h"
#include <linux/pagevec.h>
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
- struct page *page;
+ struct page *page = *pagepool;
- if (!list_empty(pool)) {
- page = lru_to_page(pool);
+ if (page) {
DBG_BUGON(page_ref_count(page) != 1);
- list_del(&page->lru);
+ *pagepool = (struct page *)page_private(page);
} else {
page = alloc_page(gfp);
}
return page;
}
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ put_page(page);
+ }
+}
+
#ifdef CONFIG_EROFS_FS_ZIP
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 778f2c52295d..01c581e93c5f 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
static bool erofs_xattr_user_list(struct dentry *dentry)
{
- return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER);
+ return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
}
static bool erofs_xattr_trusted_list(struct dentry *dentry)
@@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
switch (handler->flags) {
case EROFS_XATTR_INDEX_USER:
- if (!test_opt(&sbi->ctx, XATTR_USER))
+ if (!test_opt(&sbi->opt, XATTR_USER))
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 11c7a1aaebad..bcb1b91b234f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -236,7 +236,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+ struct page **pagepool)
{
struct z_erofs_pcluster *pcl = clt->pcl;
bool standalone = true;
@@ -287,12 +287,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
continue;
- if (page) {
+ if (page)
put_page(page);
- } else if (newpage) {
- set_page_private(newpage, 0);
- list_add(&newpage->lru, pagepool);
- }
+ else if (newpage)
+ erofs_pagepool_add(pagepool, newpage);
}
/*
@@ -476,6 +474,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
struct erofs_workgroup *grp;
int err;
+ if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
/* no available pcluster, let's allocate one */
pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
if (IS_ERR(pcl))
@@ -483,16 +486,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
atomic_set(&pcl->obj.refcount, 1);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
+ pcl->algorithmformat = map->m_algorithmformat;
pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
(map->m_flags & EROFS_MAP_FULL_MAPPED ?
Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
- if (map->m_flags & EROFS_MAP_ZIPPED)
- pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
- else
- pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = clt->owned_head;
clt->mode = COLLECT_PRIMARY_FOLLOWED;
@@ -643,7 +641,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct list_head *pagepool)
+ struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -695,7 +693,7 @@ restart_now:
goto err_out;
/* preload all compressed pages (maybe downgrade role if necessary) */
- if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
+ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la))
cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
@@ -796,7 +794,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
/* Use workqueue and sync decompression for atomic contexts only */
if (in_atomic() || irqs_disabled()) {
queue_work(z_erofs_workqueue, &io->u.work);
- sbi->ctx.readahead_sync_decompress = true;
+ sbi->opt.readahead_sync_decompress = true;
return;
}
z_erofs_decompressqueue_work(&io->u.work);
@@ -836,7 +834,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
static int z_erofs_decompress_pcluster(struct super_block *sb,
struct z_erofs_pcluster *pcl,
- struct list_head *pagepool)
+ struct page **pagepool)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
struct z_erofs_pagevec_ctor ctor;
@@ -1036,7 +1034,7 @@ out:
}
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct list_head *pagepool)
+ struct page **pagepool)
{
z_erofs_next_pcluster_t owned = io->head;
@@ -1060,18 +1058,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
{
struct z_erofs_decompressqueue *bgq =
container_of(work, struct z_erofs_decompressqueue, u.work);
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
z_erofs_decompress_queue(bgq, &pagepool);
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
- struct list_head *pagepool,
+ struct page **pagepool,
struct address_space *mc,
gfp_t gfp)
{
@@ -1173,7 +1171,7 @@ repeat:
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
cond_resched();
goto repeat;
}
@@ -1257,7 +1255,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool,
+ struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
{
@@ -1266,8 +1264,9 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
- /* since bio will be NULL, no need to initialize last_index */
+ /* bio is NULL initially, so no need to initialize last_{index,bdev} */
pgoff_t last_index;
+ struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
@@ -1279,6 +1278,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
q[JQ_SUBMIT]->head = owned_head;
do {
+ struct erofs_map_dev mdev;
struct z_erofs_pcluster *pcl;
pgoff_t cur, end;
unsigned int i = 0;
@@ -1290,7 +1290,13 @@ static void z_erofs_submit_queue(struct super_block *sb,
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- cur = pcl->obj.index;
+ /* no device id here, thus it will always succeed */
+ mdev = (struct erofs_map_dev) {
+ .m_pa = blknr_to_addr(pcl->obj.index),
+ };
+ (void)erofs_map_dev(sb, &mdev);
+
+ cur = erofs_blknr(mdev.m_pa);
end = cur + pcl->pclusterpages;
/* close the main owned chain at first */
@@ -1306,7 +1312,8 @@ static void z_erofs_submit_queue(struct super_block *sb,
if (!page)
continue;
- if (bio && cur != last_index + 1) {
+ if (bio && (cur != last_index + 1 ||
+ last_bdev != mdev.m_bdev)) {
submit_bio_retry:
submit_bio(bio);
bio = NULL;
@@ -1314,9 +1321,10 @@ submit_bio_retry:
if (!bio) {
bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
-
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, sb->s_bdev);
+
+ bio_set_dev(bio, mdev.m_bdev);
+ last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
@@ -1355,7 +1363,7 @@ submit_bio_retry:
static void z_erofs_runqueue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool, bool force_fg)
+ struct page **pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
@@ -1377,18 +1385,87 @@ static void z_erofs_runqueue(struct super_block *sb,
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
+/*
+ * Since partial uptodate is still unimplemented for now, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+ struct readahead_control *rac,
+ erofs_off_t end,
+ struct page **pagepool,
+ bool backmost)
+{
+ struct inode *inode = f->inode;
+ struct erofs_map_blocks *map = &f->map;
+ erofs_off_t cur;
+ int err;
+
+ if (backmost) {
+ map->m_la = end;
+ err = z_erofs_map_blocks_iter(inode, map,
+ EROFS_GET_BLOCKS_READMORE);
+ if (err)
+ return;
+
+ /* expend ra for the trailing edge if readahead */
+ if (rac) {
+ loff_t newstart = readahead_pos(rac);
+
+ cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+ readahead_expand(rac, newstart, cur - newstart);
+ return;
+ }
+ end = round_up(end, PAGE_SIZE);
+ } else {
+ end = round_up(map->m_la, PAGE_SIZE);
+
+ if (!map->m_llen)
+ return;
+ }
+
+ cur = map->m_la + map->m_llen - 1;
+ while (cur >= end) {
+ pgoff_t index = cur >> PAGE_SHIFT;
+ struct page *page;
+
+ page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+ if (!page)
+ goto skip;
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto skip;
+ }
+
+ err = z_erofs_do_read_page(f, page, pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readmore error at page %lu @ nid %llu",
+ index, EROFS_I(inode)->nid);
+ put_page(page);
+skip:
+ if (cur < PAGE_SIZE)
+ break;
+ cur = (index << PAGE_SHIFT) - 1;
+ }
+}
+
static int z_erofs_readpage(struct file *file, struct page *page)
{
struct inode *const inode = page->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct page *pagepool = NULL;
int err;
- LIST_HEAD(pagepool);
trace_erofs_readpage(page, false);
-
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+ z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+ &pagepool, true);
err = z_erofs_do_read_page(&f, page, &pagepool);
+ z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
@@ -1400,8 +1477,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
if (f.map.mpage)
put_page(f.map.mpage);
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
return err;
}
@@ -1409,29 +1485,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
- unsigned int nr_pages = readahead_count(rac);
- bool sync = (sbi->ctx.readahead_sync_decompress &&
- nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *page, *head = NULL;
- LIST_HEAD(pagepool);
-
- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ struct page *pagepool = NULL, *head = NULL, *page;
+ unsigned int nr_pages;
f.readahead = true;
f.headoffset = readahead_pos(rac);
- while ((page = readahead_page(rac))) {
- prefetchw(&page->flags);
-
- /*
- * A pure asynchronous readahead is indicated if
- * a PG_readahead marked page is hitted at first.
- * Let's also do asynchronous decompression for this case.
- */
- sync &= !(PageReadahead(page) && !head);
+ z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+ readahead_length(rac) - 1, &pagepool, true);
+ nr_pages = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ while ((page = readahead_page(rac))) {
set_page_private(page, (unsigned long)head);
head = page;
}
@@ -1450,16 +1516,15 @@ static void z_erofs_readahead(struct readahead_control *rac)
page->index, EROFS_I(inode)->nid);
put_page(page);
}
-
+ z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
(void)z_erofs_collector_end(&f.clt);
- z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
-
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ sbi->opt.readahead_sync_decompress &&
+ nr_pages <= sbi->opt.max_sync_decompress_pages);
if (f.map.mpage)
put_page(f.map.mpage);
-
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 3a008f1b9f78..879df5362777 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -94,13 +94,6 @@ struct z_erofs_decompressqueue {
} u;
};
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
-{
- return page->mapping == MNGD_MAPPING(sbi);
-}
-
#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9fb98d85a3ce..660489a7fb64 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err;
+ int err, headnr;
erofs_off_t pos;
struct page *page;
void *kaddr;
@@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
- vi->z_algorithmtype[0], vi->nid);
+ headnr = 0;
+ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+ vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+ erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+ headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
goto unmap_done;
}
@@ -111,7 +113,7 @@ struct z_erofs_maprecorder {
unsigned long lcn;
/* compression extent information gathered */
- u8 type;
+ u8 type, headtype;
u16 clusterofs;
u16 delta[2];
erofs_blk_t pblk, compressedlcs;
@@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
- if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
@@ -369,7 +373,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
if (compacted_4b_initial == 32 / 4)
compacted_4b_initial = 0;
- if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B)
+ if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
+ compacted_4b_initial < totalidx)
compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
else
compacted_2b = 0;
@@ -445,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
}
return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
map->m_la = (lcn << lclusterbits) | m->clusterofs;
break;
default:
@@ -470,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
int err;
DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
- if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type != m->headtype);
+
+ if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
map->m_plen = 1 << lclusterbits;
return 0;
}
-
lcn = m->lcn + 1;
if (m->compressedlcs)
goto out;
@@ -498,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
switch (m->type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
* rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
@@ -553,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
DBG_BUGON(!m->delta[1] &&
m->clusterofs != 1 << lclusterbits);
} else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
/* go on until the next HEAD lcluster */
if (lcn != headlcn)
break;
@@ -608,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
if (err)
goto unmap_out;
- map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
switch (m.type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- if (endoff >= m.clusterofs)
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
if (endoff >= m.clusterofs) {
+ m.headtype = m.type;
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
break;
}
@@ -649,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode,
map->m_llen = end - map->m_la;
map->m_pa = blknr_to_addr(m.pblk);
- map->m_flags |= EROFS_MAP_MAPPED;
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto out;
- if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+ map->m_algorithmformat = vi->z_algorithmtype[1];
+ else
+ map->m_algorithmformat = vi->z_algorithmtype[0];
+
+ if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+ ((flags & EROFS_GET_BLOCKS_READMORE) &&
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
+ map->m_llen >= EROFS_BLKSIZ)) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
diff --git a/fs/exec.c b/fs/exec.c
index ac7b51b51f38..537d92c41105 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -987,16 +987,14 @@ static int exec_mmap(struct mm_struct *mm)
if (old_mm) {
/*
- * Make sure that if there is a core dump in progress
- * for the old mm, we get out and die instead of going
- * through with the exec. We must hold mmap_lock around
- * checking core_state and changing tsk->mm.
+ * If there is a pending fatal signal perhaps a signal
+ * whose default action is to create a coredump get
+ * out and die instead of going through with the exec.
*/
- mmap_read_lock(old_mm);
- if (unlikely(old_mm->core_state)) {
- mmap_read_unlock(old_mm);
+ ret = mmap_read_lock_killable(old_mm);
+ if (ret) {
up_write(&tsk->signal->exec_update_lock);
- return -EINTR;
+ return ret;
}
}
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index ca37d4344361..1c7aa1ea4724 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -604,7 +604,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
exfat_save_attr(inode, info->attr);
inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
- ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+ ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits;
inode->i_mtime = info->mtime;
inode->i_ctime = info->mtime;
ei->i_crtime = info->crtime;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1f3f4326bf3c..c17ccc19b938 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -48,10 +48,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
struct ext2_sb_info *sbi = EXT2_SB(sb);
if (block_group >= sbi->s_groups_count) {
- ext2_error (sb, "ext2_get_group_desc",
- "block_group >= groups_count - "
- "block_group = %d, groups_count = %lu",
- block_group, sbi->s_groups_count);
+ WARN(1, "block_group >= groups_count - "
+ "block_group = %d, groups_count = %lu",
+ block_group, sbi->s_groups_count);
return NULL;
}
@@ -59,10 +58,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1);
if (!sbi->s_group_desc[group_desc]) {
- ext2_error (sb, "ext2_get_group_desc",
- "Group descriptor not loaded - "
- "block_group = %d, group_desc = %lu, desc = %lu",
- block_group, group_desc, offset);
+ WARN(1, "Group descriptor not loaded - "
+ "block_group = %d, group_desc = %lu, desc = %lu",
+ block_group, group_desc, offset);
return NULL;
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ffb295aa891c..74b172a4adda 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -551,7 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
struct dir_private_info *info = file->private_data;
struct inode *inode = file_inode(file);
struct fname *fname;
- int ret;
+ int ret = 0;
if (!info) {
info = ext4_htree_create_dir_info(file, ctx->pos);
@@ -599,7 +599,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
- return ret;
+ goto finished;
if (ret == 0) {
ctx->pos = ext4_get_htree_eof(file);
break;
@@ -630,7 +630,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
}
finished:
info->last_pos = ctx->pos;
- return 0;
+ return ret < 0 ? ret : 0;
}
static int ext4_release_dir(struct inode *inode, struct file *filp)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 90ff5acaf11f..3825195539d7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3593,9 +3593,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
unsigned flags,
struct page **pagep,
void **fsdata);
-extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c0de30f25185..0e02571f2f82 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5916,7 +5916,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
}
/* Check if *cur is a hole and if it is, skip it */
-static void skip_hole(struct inode *inode, ext4_lblk_t *cur)
+static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
{
int ret;
struct ext4_map_blocks map;
@@ -5925,9 +5925,12 @@ static void skip_hole(struct inode *inode, ext4_lblk_t *cur)
map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
if (ret != 0)
- return;
+ return 0;
*cur = *cur + map.m_len;
+ return 0;
}
/* Count number of blocks used by this inode and update i_blocks */
@@ -5976,7 +5979,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
* iblocks by total number of differences found.
*/
cur = 0;
- skip_hole(inode, &cur);
+ ret = skip_hole(inode, &cur);
+ if (ret < 0)
+ goto out;
path = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path))
goto out;
@@ -5995,8 +6000,12 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
}
cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex));
- skip_hole(inode, &cur);
-
+ ret = skip_hole(inode, &cur);
+ if (ret < 0) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ break;
+ }
path2 = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path2)) {
ext4_ext_drop_refs(path);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 8e610a381862..8ea5a81e6554 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -892,6 +892,12 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
sizeof(lrange), (u8 *)&lrange, crc))
return -ENOSPC;
} else {
+ unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
+ EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
+
+ /* Limit the number of blocks in one extent */
+ map.m_len = min(max, map.m_len);
+
fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
ex = (struct ext4_extent *)&fc_ext.fc_ex;
ex->ee_block = cpu_to_le32(map.m_lblk);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ac0e11bbb445..4c5f41052351 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
+ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
+ 0);
if (ret == -ENOTBLK)
ret = 0;
@@ -915,7 +916,7 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 82bf4ff6be28..39a1ab129fdc 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -7,6 +7,7 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
#include <linux/iversion.h>
+#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -733,45 +734,83 @@ convert:
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct page *page)
{
- int ret, no_expand;
+ handle_t *handle = ext4_journal_current_handle();
+ int no_expand;
void *kaddr;
struct ext4_iloc iloc;
+ int ret = 0, ret2;
+
+ if (unlikely(copied < len) && !PageUptodate(page))
+ copied = 0;
- if (unlikely(copied < len)) {
- if (!PageUptodate(page)) {
- copied = 0;
+ if (likely(copied)) {
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ unlock_page(page);
+ put_page(page);
+ ext4_std_error(inode->i_sb, ret);
goto out;
}
- }
+ ext4_write_lock_xattr(inode, &no_expand);
+ BUG_ON(!ext4_has_inline_data(inode));
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret) {
- ext4_std_error(inode->i_sb, ret);
- copied = 0;
- goto out;
- }
+ /*
+ * ei->i_inline_off may have changed since
+ * ext4_write_begin() called
+ * ext4_try_to_write_inline_data()
+ */
+ (void) ext4_find_inline_data_nolock(inode);
- ext4_write_lock_xattr(inode, &no_expand);
- BUG_ON(!ext4_has_inline_data(inode));
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ /* clear page dirty so that writepages wouldn't work for us. */
+ ClearPageDirty(page);
- /*
- * ei->i_inline_off may have changed since ext4_write_begin()
- * called ext4_try_to_write_inline_data()
- */
- (void) ext4_find_inline_data_nolock(inode);
+ ext4_write_unlock_xattr(inode, &no_expand);
+ brelse(iloc.bh);
- kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
- kunmap_atomic(kaddr);
- SetPageUptodate(page);
- /* clear page dirty so that writepages wouldn't work for us. */
- ClearPageDirty(page);
+ /*
+ * It's important to update i_size while still holding page
+ * lock: page writeout could otherwise come in and zero
+ * beyond i_size.
+ */
+ ext4_update_inode_size(inode, pos + copied);
+ }
+ unlock_page(page);
+ put_page(page);
- ext4_write_unlock_xattr(inode, &no_expand);
- brelse(iloc.bh);
- mark_inode_dirty(inode);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (likely(copied))
+ mark_inode_dirty(inode);
out:
- return copied;
+ /*
+ * If we didn't copy as much data as expected, we need to trim back
+ * size of xattr containing inline data.
+ */
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ ext4_orphan_add(handle, inode);
+
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+ return ret ? ret : copied;
}
struct buffer_head *
@@ -953,43 +992,6 @@ out:
return ret;
}
-int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page)
-{
- int ret;
-
- ret = ext4_write_inline_data_end(inode, pos, len, copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ret;
- }
- copied = ret;
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
- *
- * But it's important to update i_size while still holding page lock:
- * page writeout could otherwise come in and zero beyond i_size.
- */
- if (pos+copied > inode->i_size)
- i_size_write(inode, pos+copied);
- unlock_page(page);
- put_page(page);
-
- /*
- * Don't mark the inode dirty under page lock. First, it unnecessarily
- * makes the holding time of page lock longer. Second, it forces lock
- * ordering of page lock and transaction start for journaling
- * filesystems.
- */
- mark_inode_dirty(inode);
-
- return copied;
-}
-
#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
void *inline_start, int inline_size)
@@ -1917,6 +1919,24 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
EXT4_I(inode)->i_disksize = i_size;
if (i_size < inline_size) {
+ /*
+ * if there's inline data to truncate and this file was
+ * converted to extents after that inline data was written,
+ * the extent status cache must be cleared to avoid leaving
+ * behind stale delayed allocated extent entries
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+retry:
+ err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ if (err == -ENOMEM) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ if (err)
+ goto out_error;
+ }
+
/* Clear the content in the xattr space. */
if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d18852d6029c..0f06305167d5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1284,22 +1284,14 @@ static int ext4_write_end(struct file *file,
loff_t old_size = inode->i_size;
int ret = 0, ret2;
int i_size_changed = 0;
- int inline_data = ext4_has_inline_data(inode);
bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
- if (inline_data) {
- ret = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto errout;
- }
- copied = ret;
- } else
- copied = block_write_end(file, mapping, pos,
- len, copied, page, fsdata);
+
+ if (ext4_has_inline_data(inode))
+ return ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
/*
* it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1320,7 +1312,7 @@ static int ext4_write_end(struct file *file,
* ordering of page lock and transaction start for journaling
* filesystems.
*/
- if (i_size_changed || inline_data)
+ if (i_size_changed)
ret = ext4_mark_inode_dirty(handle, inode);
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
@@ -1329,7 +1321,7 @@ static int ext4_write_end(struct file *file,
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-errout:
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1395,7 +1387,6 @@ static int ext4_journalled_write_end(struct file *file,
int partial = 0;
unsigned from, to;
int size_changed = 0;
- int inline_data = ext4_has_inline_data(inode);
bool verity = ext4_verity_in_progress(inode);
trace_ext4_journalled_write_end(inode, pos, len, copied);
@@ -1404,16 +1395,10 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
- if (inline_data) {
- ret = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- goto errout;
- }
- copied = ret;
- } else if (unlikely(copied < len) && !PageUptodate(page)) {
+ if (ext4_has_inline_data(inode))
+ return ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ if (unlikely(copied < len) && !PageUptodate(page)) {
copied = 0;
ext4_journalled_zero_new_buffers(handle, inode, page, from, to);
} else {
@@ -1436,7 +1421,7 @@ static int ext4_journalled_write_end(struct file *file,
if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
- if (size_changed || inline_data) {
+ if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
@@ -1449,7 +1434,6 @@ static int ext4_journalled_write_end(struct file *file,
*/
ext4_orphan_add(handle, inode);
-errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1644,6 +1628,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
bool allocated = false;
+ bool reserved = false;
/*
* If the cluster containing lblk is shared with a delayed,
@@ -1660,6 +1645,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else { /* bigalloc */
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
if (!ext4_es_scan_clu(inode,
@@ -1672,6 +1658,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else {
allocated = true;
}
@@ -1682,6 +1669,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
}
ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+ if (ret && reserved)
+ ext4_da_release_space(inode, 1);
errout:
return ret;
@@ -1722,13 +1711,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
}
/*
- * Delayed extent could be allocated by fallocate.
- * So we need to check it.
+ * the buffer head associated with a delayed and not unwritten
+ * block found in the extent status cache must contain an
+ * invalid block number and have its BH_New and BH_Delay bits
+ * set, reflecting the state assigned when the block was
+ * initially delayed allocated
*/
- if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
+ if (ext4_es_is_delonly(&es)) {
+ BUG_ON(bh->b_blocknr != invalid_block);
+ BUG_ON(!buffer_new(bh));
+ BUG_ON(!buffer_delay(bh));
return 0;
}
@@ -2932,19 +2924,6 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
-/* We always reserve for an inode update; the superblock could be there too */
-static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
-{
- if (likely(ext4_has_feature_large_file(inode->i_sb)))
- return 1;
-
- if (pos + len <= 0x7fffffffULL)
- return 1;
-
- /* We might need to update the superblock to set LARGE_FILE */
- return 2;
-}
-
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -2953,7 +2932,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
pgoff_t index;
struct inode *inode = mapping->host;
- handle_t *handle;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
@@ -2979,41 +2957,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
- /*
- * grab_cache_page_write_begin() can take a long time if the
- * system is thrashing due to memory pressure, or if the page
- * is being written back. So grab it first before we start
- * the transaction handle. This also allows us to allocate
- * the page (if needed) without using GFP_NOFS.
- */
-retry_grab:
+retry:
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
- unlock_page(page);
- /*
- * With delayed allocation, we don't log the i_disksize update
- * if there is delayed block allocation. But we still need
- * to journalling the i_disksize update if writes to the end
- * of file which has an already mapped buffer.
- */
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_da_write_credits(inode, pos, len));
- if (IS_ERR(handle)) {
- put_page(page);
- return PTR_ERR(handle);
- }
-
- lock_page(page);
- if (page->mapping != mapping) {
- /* The page got truncated from under us */
- unlock_page(page);
- put_page(page);
- ext4_journal_stop(handle);
- goto retry_grab;
- }
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
@@ -3025,20 +2973,18 @@ retry_journal:
#endif
if (ret < 0) {
unlock_page(page);
- ext4_journal_stop(handle);
+ put_page(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold inode lock.
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
-
- put_page(page);
+ goto retry;
return ret;
}
@@ -3075,8 +3021,6 @@ static int ext4_da_write_end(struct file *file,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- int ret = 0, ret2;
- handle_t *handle = ext4_journal_current_handle();
loff_t new_i_size;
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
@@ -3086,44 +3030,36 @@ static int ext4_da_write_end(struct file *file,
len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
+
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ return ext4_write_inline_data_end(inode, pos, len, copied, page);
+
start = pos & (PAGE_SIZE - 1);
end = start + copied - 1;
/*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range upto i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize upto i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block (ext4_da_should_update_i_disksize()
+ * check), we need to update i_disksize here as neither
+ * ext4_writepage() nor certain ext4_writepages() paths not
+ * allocating blocks update i_disksize.
+ *
+ * Note that we defer inode dirtying to generic_write_end() /
+ * ext4_da_write_inline_data_end().
*/
new_i_size = pos + copied;
- if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_has_inline_data(inode) ||
- ext4_da_should_update_i_disksize(page, end)) {
- ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ret = ext4_mark_inode_dirty(handle, inode);
- }
- }
+ if (copied && new_i_size > inode->i_size &&
+ ext4_da_should_update_i_disksize(page, end))
+ ext4_update_i_disksize(inode, new_i_size);
- if (write_mode != CONVERT_INLINE_DATA &&
- ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
- ext4_has_inline_data(inode))
- ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
- page);
- else
- ret2 = generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
-
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
- ret2 = ext4_journal_stop(handle);
- if (unlikely(ret2 && !ret))
- ret = ret2;
-
- return ret ? ret : copied;
+ return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
}
/*
@@ -4340,6 +4276,12 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
goto has_buffer;
lock_buffer(bh);
+ if (ext4_buffer_uptodate(bh)) {
+ /* Someone brought it uptodate while we waited */
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+
/*
* If we have all information of the inode in memory and this
* is the only valid inode in the block, we need not read the
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0775950ee84e..a320c54202d9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -46,6 +46,7 @@
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/fsnotify.h>
#include "ext4.h"
#include "ext4_extents.h" /* Needed for trace points definition */
@@ -658,7 +659,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
* constraints, it may not be safe to do it right here so we
* defer superblock flushing to a workqueue.
*/
- if (continue_fs)
+ if (continue_fs && journal)
schedule_work(&EXT4_SB(sb)->s_error_work);
else
ext4_commit_super(sb);
@@ -759,6 +760,8 @@ void __ext4_error(struct super_block *sb, const char *function,
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
}
+ fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
+
ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}
@@ -789,6 +792,8 @@ void __ext4_error_inode(struct inode *inode, const char *function,
current->comm, &vaf);
va_end(args);
}
+ fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
+
ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
function, line);
}
@@ -827,6 +832,8 @@ void __ext4_error_file(struct file *file, const char *function,
current->comm, path, &vaf);
va_end(args);
}
+ fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
+
ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
function, line);
}
@@ -894,6 +901,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
}
+ fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}
@@ -1350,6 +1358,12 @@ static void ext4_destroy_inode(struct inode *inode)
true);
dump_stack();
}
+
+ if (EXT4_I(inode)->i_reserved_data_blocks)
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
+ inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_reserved_data_blocks);
}
static void init_once(void *foo)
@@ -1566,7 +1580,6 @@ static const struct fscrypt_operations ext4_cryptops = {
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
- .max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
.get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
@@ -3021,17 +3034,17 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- loff_t res = EXT4_NDIR_BLOCKS;
+ unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
- /* This is calculated to be the largest file size for a dense, block
+
+ /*
+ * This is calculated to be the largest file size for a dense, block
* mapped file such that the file's total number of 512-byte sectors,
* including data and all indirect blocks, does not exceed (2^48 - 1).
*
* __u32 i_blocks_lo and _u16 i_blocks_high represent the total
* number of 512-byte sectors of the file.
*/
-
if (!has_huge_files) {
/*
* !has_huge_files or implies that the inode i_block field
@@ -3074,7 +3087,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
- return res;
+ return (loff_t)res;
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
@@ -4468,7 +4481,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto cantfind_ext4;
/* check blocks count against device size */
- blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+ blocks_count = sb_bdev_nr_blocks(sb);
if (blocks_count && ext4_blocks_count(es) > blocks_count) {
ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
"exceeds size of device (%llu blocks)",
@@ -5042,12 +5055,15 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
+ /* flush s_error_work before journal destroy. */
+ flush_work(&sbi->s_error_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
+ /* flush s_error_work before sbi destroy */
flush_work(&sbi->s_error_work);
del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index c1bf9ad4c220..20a083dc9042 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <linux/moduleparam.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/lzo.h>
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9c8ef33bd8d3..eb971e1e7227 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4276,7 +4276,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
size_t target_size = 0;
int err;
- if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
+ if (fault_in_iov_iter_readable(from, iov_iter_count(from)))
set_inode_flag(inode, FI_NO_PREALLOC);
if ((iocb->ki_flags & IOCB_NOWAIT)) {
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 78ebc306ee2b..cf049a042482 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2976,7 +2976,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
.set_context = f2fs_set_context,
.get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
- .max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
.get_num_devices = f2fs_get_num_devices,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0c9b013a85..a6f1c6d426d1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb,
struct fat_bios_param_block *bpb)
{
static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
-
+ sector_t bd_sects = bdev_nr_sectors(sb->s_bdev);
struct fat_floppy_defaults *fdefaults = NULL;
int error = -EINVAL;
- sector_t bd_sects;
unsigned i;
- bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
-
/* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
if (!silent)
@@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
ret = writeback_inode(i1);
if (!ret && i2)
ret = writeback_inode(i2);
- if (!ret) {
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
- ret = filemap_flush(mapping);
- }
+ if (!ret)
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
EXPORT_SYMBOL_GPL(fat_flush_inodes);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81ec192ce067..67f0e88eed01 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -566,7 +566,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
- isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
+ isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
if (!isw)
return;
@@ -624,8 +624,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
int nr;
bool restart = false;
- isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
- sizeof(struct inode *), GFP_KERNEL);
+ isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
+ GFP_KERNEL);
if (!isw)
return restart;
@@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- blk_flush_plug(current);
+ if (current->plug)
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
* If we are expecting writeback progress we must submit plugged IO.
*/
if (blk_needs_flush_plug(current))
- blk_schedule_flush_plug(current);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index f346a78f4bd6..6a675652129b 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -77,7 +77,6 @@ static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
-static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object);
static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
@@ -907,6 +906,7 @@ static void fscache_dequeue_object(struct fscache_object *object)
* @object: The object to ask about
* @data: The auxiliary data for the object
* @datalen: The size of the auxiliary data
+ * @object_size: The size of the object according to the server.
*
* This function consults the netfs about the coherency state of an object.
* The caller must be holding a ref on cookie->n_active (held by
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 433877107700..e002cdfaf3cc 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -22,7 +22,10 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op)
/**
* fscache_operation_init - Do basic initialisation of an operation
+ * @cookie: The cookie to operate on
* @op: The operation to initialise
+ * @processor: The function to perform the operation
+ * @cancel: A function to handle operation cancellation
* @release: The release function to assign
*
* Do basic initialisation of an operation. The caller must still set flags,
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 281d79f8b3d3..713818d74de6 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -732,11 +732,8 @@ static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
- if (ret < 0)
- return ret;
- fuse_invalidate_attr(inode);
- fuse_write_update_size(inode, iocb->ki_pos);
+ fuse_write_update_attr(inode, iocb->ki_pos, ret);
return ret;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index dde341a6388a..79f7eda49e06 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -756,7 +756,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
{
unsigned ncpy = min(*size, cs->len);
if (val) {
- void *pgaddr = kmap_atomic(cs->pg);
+ void *pgaddr = kmap_local_page(cs->pg);
void *buf = pgaddr + cs->offset;
if (cs->write)
@@ -764,7 +764,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
else
memcpy(*val, buf, ncpy);
- kunmap_atomic(pgaddr);
+ kunmap_local(pgaddr);
*val += ncpy;
}
*size -= ncpy;
@@ -847,6 +847,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
replace_page_cache_page(oldpage, newpage);
+ /*
+ * Release while we have extra ref on stolen page. Otherwise
+ * anon_pipe_buf_release() might think the page can be reused.
+ */
+ pipe_buf_release(cs->pipe, buf);
+
get_page(newpage);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
@@ -949,10 +955,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
}
}
if (page) {
- void *mapaddr = kmap_atomic(page);
+ void *mapaddr = kmap_local_page(page);
void *buf = mapaddr + offset;
offset += fuse_copy_do(cs, &buf, &count);
- kunmap_atomic(mapaddr);
+ kunmap_local(mapaddr);
} else
offset += fuse_copy_do(cs, NULL, &count);
}
@@ -1591,7 +1597,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
end = outarg.offset + outarg.size;
if (end > file_size) {
file_size = end;
- fuse_write_update_size(inode, file_size);
+ fuse_write_update_attr(inode, file_size, outarg.size);
}
num = outarg.size;
@@ -2031,8 +2037,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
out_free:
- for (idx = 0; idx < nbuf; idx++)
- pipe_buf_release(pipe, &bufs[idx]);
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+
+ if (buf->ops)
+ pipe_buf_release(pipe, buf);
+ }
pipe_unlock(pipe);
kvfree(bufs);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d9b977c0f38d..0654bfedcbb0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -116,7 +116,7 @@ u64 entry_attr_timeout(struct fuse_entry_out *o)
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
-static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
{
set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
}
@@ -738,14 +738,51 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir,
return create_new_entry(fm, &args, dir, entry, S_IFLNK);
}
-void fuse_update_ctime(struct inode *inode)
+void fuse_flush_time_update(struct inode *inode)
+{
+ int err = sync_inode_metadata(inode, 1);
+
+ mapping_set_error(inode->i_mapping, err);
+}
+
+static void fuse_update_ctime_in_cache(struct inode *inode)
{
if (!IS_NOCMTIME(inode)) {
inode->i_ctime = current_time(inode);
mark_inode_dirty_sync(inode);
+ fuse_flush_time_update(inode);
}
}
+void fuse_update_ctime(struct inode *inode)
+{
+ fuse_invalidate_attr_mask(inode, STATX_CTIME);
+ fuse_update_ctime_in_cache(inode);
+}
+
+static void fuse_entry_unlinked(struct dentry *entry)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (S_ISDIR(inode->i_mode))
+ clear_nlink(inode);
+ else if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&fi->lock);
+ fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
+}
+
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
@@ -762,24 +799,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.in_args[0].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
- struct inode *inode = d_inode(entry);
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
- /*
- * If i_nlink == 0 then unlink doesn't make sense, yet this can
- * happen if userspace filesystem is careless. It would be
- * difficult to enforce correct nlink usage so just ignore this
- * condition here
- */
- if (inode->i_nlink > 0)
- drop_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
- fuse_update_ctime(inode);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -801,9 +822,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.in_args[0].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
- clear_nlink(d_inode(entry));
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -833,24 +853,18 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
err = fuse_simple_request(fm, &args);
if (!err) {
/* ctime changes */
- fuse_invalidate_attr(d_inode(oldent));
fuse_update_ctime(d_inode(oldent));
- if (flags & RENAME_EXCHANGE) {
- fuse_invalidate_attr(d_inode(newent));
+ if (flags & RENAME_EXCHANGE)
fuse_update_ctime(d_inode(newent));
- }
fuse_dir_changed(olddir);
if (olddir != newdir)
fuse_dir_changed(newdir);
/* newent will end up negative */
- if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
- fuse_invalidate_attr(d_inode(newent));
- fuse_invalidate_entry_cache(newent);
- fuse_update_ctime(d_inode(newent));
- }
+ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent))
+ fuse_entry_unlinked(newent);
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
@@ -916,25 +930,11 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
- /* Contrary to "normal" filesystems it can happen that link
- makes two "logical" inodes point to the same "physical"
- inode. We invalidate the attributes of the old one, so it
- will reflect changes in the backing inode (link count,
- etc.)
- */
- if (!err) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
- if (likely(inode->i_nlink < UINT_MAX))
- inc_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
- fuse_update_ctime(inode);
- } else if (err == -EINTR) {
+ if (!err)
+ fuse_update_ctime_in_cache(inode);
+ else if (err == -EINTR)
fuse_invalidate_attr(inode);
- }
+
return err;
}
@@ -944,15 +944,6 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
unsigned int blkbits;
struct fuse_conn *fc = get_fuse_conn(inode);
- /* see the comment in fuse_change_attributes() */
- if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
- attr->size = i_size_read(inode);
- attr->mtime = inode->i_mtime.tv_sec;
- attr->mtimensec = inode->i_mtime.tv_nsec;
- attr->ctime = inode->i_ctime.tv_sec;
- attr->ctimensec = inode->i_ctime.tv_nsec;
- }
-
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -1030,12 +1021,14 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
bool sync;
+ u32 inval_mask = READ_ONCE(fi->inval_mask);
+ u32 cache_mask = fuse_get_cache_mask(inode);
if (flags & AT_STATX_FORCE_SYNC)
sync = true;
else if (flags & AT_STATX_DONT_SYNC)
sync = false;
- else if (request_mask & READ_ONCE(fi->inval_mask))
+ else if (request_mask & inval_mask & ~cache_mask)
sync = true;
else
sync = time_before64(fi->i_time, get_jiffies_64());
@@ -1052,11 +1045,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
return err;
}
-int fuse_update_attributes(struct inode *inode, struct file *file)
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
{
- /* Do *not* need to get atime for internal purposes */
- return fuse_update_get_attr(inode, file, NULL,
- STATX_BASIC_STATS & ~STATX_ATIME, 0);
+ return fuse_update_get_attr(inode, file, NULL, mask, 0);
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
@@ -1071,7 +1062,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- inode_lock(parent);
+ inode_lock_nested(parent, I_MUTEX_PARENT);
if (!S_ISDIR(parent->i_mode))
goto unlock;
@@ -1561,10 +1552,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
bool is_truncate = false;
- bool is_wb = fc->writeback_cache;
+ bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode);
loff_t oldsize;
int err;
- bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+ bool trust_local_cmtime = is_wb;
bool fault_blocked = false;
if (!fc->default_permissions)
@@ -1608,7 +1599,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
/* Flush dirty data/metadata before non-truncate SETATTR */
- if (is_wb && S_ISREG(inode->i_mode) &&
+ if (is_wb &&
attr->ia_valid &
(ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
ATTR_TIMES_SET)) {
@@ -1676,10 +1667,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
fuse_change_attributes_common(inode, &outarg.attr,
- attr_timeout(&outarg));
+ attr_timeout(&outarg),
+ fuse_get_cache_mask(inode));
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
- if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+ if (!is_wb || is_truncate)
i_size_write(inode, outarg.attr.size);
if (is_truncate) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 11404f8c21c7..9d6c5f6361f7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -211,9 +211,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
i_size_write(inode, 0);
spin_unlock(&fi->lock);
truncate_pagecache(inode, 0);
- fuse_invalidate_attr(inode);
- if (fc->writeback_cache)
- file_update_time(file);
+ file_update_time(file);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
invalidate_inode_pages2(inode->i_mapping);
}
@@ -339,12 +338,6 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
-
- /* see fuse_vma_close() for !writeback_cache case */
- if (fc->writeback_cache)
- write_inode_now(inode, 1);
-
fuse_release_common(file, false);
/* return value is ignored by VFS */
@@ -483,6 +476,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fuse_is_bad(inode))
return -EIO;
+ if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
+ return 0;
+
err = write_inode_now(inode, 1);
if (err)
return err;
@@ -521,7 +517,7 @@ inval_attr_out:
* enabled, i_blocks from cached attr may not be accurate.
*/
if (!err && fm->fc->writeback_cache)
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
return err;
}
@@ -687,7 +683,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
spin_unlock(&fi->lock);
}
- io->iocb->ki_complete(io->iocb, res, 0);
+ io->iocb->ki_complete(io->iocb, res);
}
kref_put(&io->refcnt, fuse_io_release);
@@ -793,7 +789,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- if (attr_ver == fi->attr_version && size < inode->i_size &&
+ if (attr_ver >= fi->attr_version && size < inode->i_size &&
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
@@ -1003,7 +999,7 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (fc->auto_inval_data ||
(iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
- err = fuse_update_attributes(inode, iocb->ki_filp);
+ err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
if (err)
return err;
}
@@ -1072,7 +1068,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
return err ?: ia->write.out.size;
}
-bool fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1080,12 +1076,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
- if (pos > inode->i_size) {
+ if (written > 0 && pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
spin_unlock(&fi->lock);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+
return ret;
}
@@ -1164,7 +1162,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
again:
err = -EFAULT;
- if (iov_iter_fault_in_readable(ii, bytes))
+ if (fault_in_iov_iter_readable(ii, bytes))
break;
err = -ENOMEM;
@@ -1268,11 +1266,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
kfree(ap->pages);
} while (!err && iov_iter_count(ii));
- if (res > 0)
- fuse_write_update_size(inode, pos);
-
+ fuse_write_update_attr(inode, pos, res);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
- fuse_invalidate_attr(inode);
return res > 0 ? res : err;
}
@@ -1290,7 +1285,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
- err = fuse_update_attributes(mapping->host, file);
+ err = fuse_update_attributes(mapping->host, file,
+ STATX_SIZE | STATX_MODE);
if (err)
return err;
@@ -1451,7 +1447,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!ia)
return -ENOMEM;
- ia->io = io;
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
inode_lock(inode);
@@ -1561,11 +1556,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
} else {
res = fuse_direct_io(&io, from, &iocb->ki_pos,
FUSE_DIO_WRITE);
+ fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- fuse_invalidate_attr(inode);
- if (res > 0)
- fuse_write_update_size(inode, iocb->ki_pos);
inode_unlock(inode);
return res;
@@ -1776,7 +1769,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
* is enabled, we trust local ctime/mtime.
*/
if (!fc->writeback_cache)
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
@@ -1822,14 +1815,13 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
{
- struct fuse_file *ff = NULL;
+ struct fuse_file *ff;
spin_lock(&fi->lock);
- if (!list_empty(&fi->write_files)) {
- ff = list_entry(fi->write_files.next, struct fuse_file,
- write_entry);
+ ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
+ write_entry);
+ if (ff)
fuse_file_get(ff);
- }
spin_unlock(&fi->lock);
return ff;
@@ -1848,6 +1840,17 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
struct fuse_file *ff;
int err;
+ /*
+ * Inode is always written before the last reference is dropped and
+ * hence this should not be reached from reclaim.
+ *
+ * Writing back the inode from reclaim can deadlock if the request
+ * processing itself needs an allocation. Allocations triggering
+ * reclaim while serving a request can't be prevented, because it can
+ * involve any number of unrelated userspace processes.
+ */
+ WARN_ON(wbc->for_reclaim);
+
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
@@ -2306,15 +2309,18 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
if (!copied)
goto unlock;
+ pos += copied;
if (!PageUptodate(page)) {
/* Zero any unwritten bytes at the end of the page */
- size_t endoff = (pos + copied) & ~PAGE_MASK;
+ size_t endoff = pos & ~PAGE_MASK;
if (endoff)
zero_user_segment(page, endoff, PAGE_SIZE);
SetPageUptodate(page);
}
- fuse_write_update_size(inode, pos + copied);
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+
set_page_dirty(page);
unlock:
@@ -2340,12 +2346,15 @@ static int fuse_launder_page(struct page *page)
}
/*
- * Write back dirty pages now, because there may not be any suitable
- * open files later
+ * Write back dirty data/metadata now (there may not be any suitable
+ * open files later for data)
*/
static void fuse_vma_close(struct vm_area_struct *vma)
{
- filemap_write_and_wait(vma->vm_file->f_mapping);
+ int err;
+
+ err = write_inode_now(vma->vm_file->f_mapping->host, 1);
+ mapping_set_error(vma->vm_file->f_mapping, err);
}
/*
@@ -2628,7 +2637,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
fallback:
- err = fuse_update_attributes(inode, file);
+ err = fuse_update_attributes(inode, file, STATX_SIZE);
if (!err)
return generic_file_llseek(file, offset, whence);
else
@@ -2648,7 +2657,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
break;
case SEEK_END:
inode_lock(inode);
- retval = fuse_update_attributes(inode, file);
+ retval = fuse_update_attributes(inode, file, STATX_SIZE);
if (!retval)
retval = generic_file_llseek(file, offset, whence);
inode_unlock(inode);
@@ -2869,7 +2878,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
@@ -2891,9 +2900,8 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
- if (ret > 0)
- fuse_write_update_size(inode, pos);
- else if (ret < 0 && offset + count > i_size)
+ fuse_write_update_attr(inode, pos, ret);
+ if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
@@ -2981,16 +2989,14 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
/* we could have extended the file */
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- bool changed = fuse_write_update_size(inode, offset + length);
-
- if (changed && fm->fc->writeback_cache)
+ if (fuse_write_update_attr(inode, offset + length, length))
file_update_time(file);
}
if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
truncate_pagecache_range(inode, offset, offset + length - 1);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
@@ -3002,6 +3008,8 @@ out:
if (lock_inode)
inode_unlock(inode);
+ fuse_flush_time_update(inode);
+
return err;
}
@@ -3096,12 +3104,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
ALIGN_DOWN(pos_out, PAGE_SIZE),
ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
- if (fc->writeback_cache) {
- fuse_write_update_size(inode_out, pos_out + outarg.size);
- file_update_time(file_out);
- }
-
- fuse_invalidate_attr(inode_out);
+ file_update_time(file_out);
+ fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
err = outarg.size;
out:
@@ -3111,6 +3115,8 @@ out:
inode_unlock(inode_out);
file_accessed(file_in);
+ fuse_flush_time_update(inode_out);
+
return err;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 319596df5dc6..198637b41e19 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1031,7 +1031,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version);
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid);
+ u64 attr_valid, u32 cache_mask);
+
+u32 fuse_get_cache_mask(struct inode *inode);
/**
* Initialize the client device
@@ -1065,7 +1067,15 @@ void fuse_wait_aborted(struct fuse_conn *fc);
/**
* Invalidate inode attributes
*/
+
+/* Attributes possibly changed on data modification */
+#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS)
+
+/* Attributes possibly changed on data and/or size modification */
+#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE)
+
void fuse_invalidate_attr(struct inode *inode);
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask);
void fuse_invalidate_entry_cache(struct dentry *entry);
@@ -1121,6 +1131,9 @@ int fuse_init_fs_context_submount(struct fs_context *fsc);
*/
void fuse_conn_destroy(struct fuse_mount *fm);
+/* Drop the connection and free the fuse mount */
+void fuse_mount_destroy(struct fuse_mount *fm);
+
/**
* Add connection to control filesystem
*/
@@ -1145,9 +1158,10 @@ int fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+void fuse_flush_time_update(struct inode *inode);
void fuse_update_ctime(struct inode *inode);
-int fuse_update_attributes(struct inode *inode, struct file *file);
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask);
void fuse_flush_writepages(struct inode *inode);
@@ -1205,7 +1219,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
__poll_t fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
-bool fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written);
int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 36cd03114b6d..8b89e3ba7df3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -118,6 +118,9 @@ static void fuse_evict_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ /* Will write inode on close/munmap and in all other dirtiers */
+ WARN_ON(inode->i_state & I_DIRTY_INODE);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
@@ -161,7 +164,7 @@ static ino_t fuse_squash_ino(u64 ino64)
}
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid)
+ u64 attr_valid, u32 cache_mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -181,9 +184,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
- if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+ if (!(cache_mask & STATX_MTIME)) {
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
+ }
+ if (!(cache_mask & STATX_CTIME)) {
inode->i_ctime.tv_sec = attr->ctime;
inode->i_ctime.tv_nsec = attr->ctimensec;
}
@@ -215,16 +220,44 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_flags &= ~S_NOSEC;
}
+u32 fuse_get_cache_mask(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+ return 0;
+
+ return STATX_MTIME | STATX_CTIME | STATX_SIZE;
+}
+
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- bool is_wb = fc->writeback_cache;
+ u32 cache_mask;
loff_t oldsize;
struct timespec64 old_mtime;
spin_lock(&fi->lock);
+ /*
+ * In case of writeback_cache enabled, writes update mtime, ctime and
+ * may update i_size. In these cases trust the cached value in the
+ * inode.
+ */
+ cache_mask = fuse_get_cache_mask(inode);
+ if (cache_mask & STATX_SIZE)
+ attr->size = i_size_read(inode);
+
+ if (cache_mask & STATX_MTIME) {
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ }
+ if (cache_mask & STATX_CTIME) {
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
+
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fi->lock);
@@ -232,7 +265,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
}
old_mtime = inode->i_mtime;
- fuse_change_attributes_common(inode, attr, attr_valid);
+ fuse_change_attributes_common(inode, attr, attr_valid, cache_mask);
oldsize = inode->i_size;
/*
@@ -240,11 +273,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
* extend local i_size without keeping userspace server in sync. So,
* attr->size coming from server can be stale. We cannot trust it.
*/
- if (!is_wb || !S_ISREG(inode->i_mode))
+ if (!(cache_mask & STATX_SIZE))
i_size_write(inode, attr->size);
spin_unlock(&fi->lock);
- if (!is_wb && S_ISREG(inode->i_mode)) {
+ if (!cache_mask && S_ISREG(inode->i_mode)) {
bool inval = false;
if (oldsize != attr->size) {
@@ -457,14 +490,6 @@ static void fuse_send_destroy(struct fuse_mount *fm)
}
}
-static void fuse_put_super(struct super_block *sb)
-{
- struct fuse_mount *fm = get_fuse_mount_super(sb);
-
- fuse_conn_put(fm->fc);
- kfree(fm);
-}
-
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
{
stbuf->f_type = FUSE_SUPER_MAGIC;
@@ -1003,7 +1028,6 @@ static const struct super_operations fuse_super_operations = {
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
- .put_super = fuse_put_super,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
@@ -1424,20 +1448,17 @@ static int fuse_get_tree_submount(struct fs_context *fsc)
if (!fm)
return -ENOMEM;
+ fm->fc = fuse_conn_get(fc);
fsc->s_fs_info = fm;
sb = sget_fc(fsc, NULL, set_anon_super_fc);
- if (IS_ERR(sb)) {
- kfree(fm);
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (IS_ERR(sb))
return PTR_ERR(sb);
- }
- fm->fc = fuse_conn_get(fc);
/* Initialize superblock, making @mp_fi its root */
err = fuse_fill_super_submount(sb, mp_fi);
if (err) {
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
deactivate_locked_super(sb);
return err;
}
@@ -1569,8 +1590,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
int err;
- struct fuse_conn *fc;
- struct fuse_mount *fm;
if (!ctx->file || !ctx->rootmode_present ||
!ctx->user_id_present || !ctx->group_id_present)
@@ -1580,42 +1599,18 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
* Require mount to happen from the same user namespace which
* opened /dev/fuse to prevent potential attacks.
*/
- err = -EINVAL;
if ((ctx->file->f_op != &fuse_dev_operations) ||
(ctx->file->f_cred->user_ns != sb->s_user_ns))
- goto err;
+ return -EINVAL;
ctx->fudptr = &ctx->file->private_data;
- fc = kmalloc(sizeof(*fc), GFP_KERNEL);
- err = -ENOMEM;
- if (!fc)
- goto err;
-
- fm = kzalloc(sizeof(*fm), GFP_KERNEL);
- if (!fm) {
- kfree(fc);
- goto err;
- }
-
- fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
- fc->release = fuse_free_conn;
-
- sb->s_fs_info = fm;
-
err = fuse_fill_super_common(sb, ctx);
if (err)
- goto err_put_conn;
+ return err;
/* file->private_data shall be visible on all CPUs after this */
smp_mb();
fuse_send_init(get_fuse_mount_super(sb));
return 0;
-
- err_put_conn:
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
- err:
- return err;
}
/*
@@ -1637,22 +1632,40 @@ static int fuse_get_tree(struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ struct fuse_mount *fm;
struct super_block *sb;
int err;
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ if (!fc)
+ return -ENOMEM;
+
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL);
+ fc->release = fuse_free_conn;
+
+ fsc->s_fs_info = fm;
+
if (ctx->fd_present)
ctx->file = fget(ctx->fd);
if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) {
err = get_tree_bdev(fsc, fuse_fill_super);
- goto out_fput;
+ goto out;
}
/*
* While block dev mount can be initialized with a dummy device fd
* (found by device name), normal fuse mounts can't
*/
+ err = -EINVAL;
if (!ctx->file)
- return -EINVAL;
+ goto out;
/*
* Allow creating a fuse mount with an already initialized fuse
@@ -1668,7 +1681,9 @@ static int fuse_get_tree(struct fs_context *fsc)
} else {
err = get_tree_nodev(fsc, fuse_fill_super);
}
-out_fput:
+out:
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
if (ctx->file)
fput(ctx->file);
return err;
@@ -1747,17 +1762,25 @@ static void fuse_sb_destroy(struct super_block *sb)
struct fuse_mount *fm = get_fuse_mount_super(sb);
bool last;
- if (fm) {
+ if (sb->s_root) {
last = fuse_mount_remove(fm);
if (last)
fuse_conn_destroy(fm);
}
}
+void fuse_mount_destroy(struct fuse_mount *fm)
+{
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+}
+EXPORT_SYMBOL(fuse_mount_destroy);
+
static void fuse_kill_sb_anon(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_anon_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
static struct file_system_type fuse_fs_type = {
@@ -1775,6 +1798,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_block_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
static struct file_system_type fuseblk_fs_type = {
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 546ea3d58fb4..fbc09dab1f85 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -286,11 +286,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
goto out;
- vaddr = kmap_atomic(ap.pages[0]);
+ vaddr = kmap_local_page(ap.pages[0]);
err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
if (err)
goto out;
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index bc267832310c..b4e565711045 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -76,11 +76,11 @@ static void fuse_add_dirent_to_cache(struct file *file,
WARN_ON(fi->rdc.pos != pos))
goto unlock;
- addr = kmap_atomic(page);
+ addr = kmap_local_page(page);
if (!offset)
clear_page(addr);
memcpy(addr + offset, dirent, reclen);
- kunmap_atomic(addr);
+ kunmap_local(addr);
fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
fi->rdc.pos = dirent->off;
unlock:
@@ -454,7 +454,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
* cache; both cases require an up-to-date mtime value.
*/
if (!ctx->pos && fc->auto_inval_data) {
- int err = fuse_update_attributes(inode, file);
+ int err = fuse_update_attributes(inode, file, STATX_MTIME);
if (err)
return err;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 0ad89c6629d7..4cfa4bc1f579 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -649,7 +649,7 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
int vq_type)
{
- strncpy(fsvq->name, name, VQ_NAME_LEN);
+ strscpy(fsvq->name, name, VQ_NAME_LEN);
spin_lock_init(&fsvq->lock);
INIT_LIST_HEAD(&fsvq->queued_reqs);
INIT_LIST_HEAD(&fsvq->end_reqs);
@@ -1394,12 +1394,13 @@ static void virtio_kill_sb(struct super_block *sb)
bool last;
/* If mount failed, we can still be called without any fc */
- if (fm) {
+ if (sb->s_root) {
last = fuse_mount_remove(fm);
if (last)
virtio_fs_conn_destroy(fm);
}
kill_anon_super(sb);
+ fuse_mount_destroy(fm);
}
static int virtio_fs_test_super(struct super_block *sb,
@@ -1455,19 +1456,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
fsc->s_fs_info = fm;
sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
- if (fsc->s_fs_info) {
- fuse_conn_put(fc);
- kfree(fm);
- }
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
err = virtio_fs_fill_super(sb, fsc);
if (err) {
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
deactivate_locked_super(sb);
return err;
}
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 61dfaf7b7d20..0d3e7177fce0 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -42,10 +42,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
fm->fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
@@ -173,10 +172,9 @@ int fuse_removexattr(struct inode *inode, const char *name)
fm->fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5414c2c33580..7235d539e969 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -961,46 +961,6 @@ hole_found:
goto out;
}
-static int gfs2_write_lock(struct inode *inode)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
- error = gfs2_glock_nq(&ip->i_gh);
- if (error)
- goto out_uninit;
- if (&ip->i_inode == sdp->sd_rindex) {
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-
- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
- GL_NOCACHE, &m_ip->i_gh);
- if (error)
- goto out_unlock;
- }
- return 0;
-
-out_unlock:
- gfs2_glock_dq(&ip->i_gh);
-out_uninit:
- gfs2_holder_uninit(&ip->i_gh);
- return error;
-}
-
-static void gfs2_write_unlock(struct inode *inode)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
-
- if (&ip->i_inode == sdp->sd_rindex) {
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-
- gfs2_glock_dq_uninit(&m_ip->i_gh);
- }
- gfs2_glock_dq_uninit(&ip->i_gh);
-}
-
static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
unsigned len)
{
@@ -1118,11 +1078,6 @@ out_qunlock:
return ret;
}
-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
-{
- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
-}
-
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap *iomap,
struct iomap *srcmap)
@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
iomap->flags |= IOMAP_F_BUFFER_HEAD;
trace_gfs2_iomap_start(ip, pos, length, flags);
- if (gfs2_iomap_need_write_lock(flags)) {
- ret = gfs2_write_lock(inode);
- if (ret)
- goto out;
- }
-
ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
if (ret)
goto out_unlock;
@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
out_unlock:
- if (ret && gfs2_iomap_need_write_lock(flags))
- gfs2_write_unlock(inode);
release_metapath(&mp);
-out:
trace_gfs2_iomap_end(ip, iomap, ret);
return ret;
}
@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
}
if (unlikely(!written))
- goto out_unlock;
+ return 0;
if (iomap->flags & IOMAP_F_SIZE_CHANGED)
mark_inode_dirty(inode);
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
-
-out_unlock:
- if (gfs2_iomap_need_write_lock(flags))
- gfs2_write_unlock(inode);
return 0;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c559827cb6f9..adafaaf7d24d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -213,11 +213,9 @@ void gfs2_set_inode_flags(struct inode *inode)
* @inode: The inode
* @reqflags: The flags to set
* @mask: Indicates which flags are valid
- * @fsflags: The FS_* inode flags passed in
*
*/
-static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask,
- const u32 fsflags)
+static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -236,11 +234,6 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask,
if ((new_flags ^ flags) == 0)
goto out;
- error = -EPERM;
- if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
- goto out;
- if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
- goto out;
if (!IS_IMMUTABLE(inode)) {
error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
if (error)
@@ -313,7 +306,7 @@ int gfs2_fileattr_set(struct user_namespace *mnt_userns,
mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA);
}
- return do_gfs2_set_flags(inode, gfsflags, mask, fsflags);
+ return do_gfs2_set_flags(inode, gfsflags, mask);
}
static int gfs2_getlabel(struct file *filp, char __user *label)
@@ -776,27 +769,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
+static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+ size_t *prev_count,
+ size_t *window_size)
+{
+ char __user *p = i->iov[0].iov_base + i->iov_offset;
+ size_t count = iov_iter_count(i);
+ int pages = 1;
+
+ if (likely(!count))
+ return false;
+ if (ret <= 0 && ret != -EFAULT)
+ return false;
+ if (!iter_is_iovec(i))
+ return false;
+
+ if (*prev_count != count || !*window_size) {
+ int pages, nr_dirtied;
+
+ pages = min_t(int, BIO_MAX_VECS,
+ DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
+ nr_dirtied = max(current->nr_dirtied_pause -
+ current->nr_dirtied, 1);
+ pages = min(pages, nr_dirtied);
+ }
+
+ *prev_count = count;
+ *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
+ return true;
+}
+
static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
struct gfs2_holder *gh)
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
- size_t count = iov_iter_count(to);
+ size_t prev_count = 0, window_size = 0;
+ size_t written = 0;
ssize_t ret;
- if (!count)
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
+ * physical as well as manual page faults, and we need to disable both
+ * kinds.
+ *
+ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
+ * locking mode is compatible with other deferred holders, so multiple
+ * processes and nodes can do direct I/O to a file at the same time.
+ * There's no guarantee that reads or writes will be atomic. Any
+ * coordination among readers and writers needs to happen externally.
+ */
+
+ if (!iov_iter_count(to))
return 0; /* skip atime */
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
+retry_under_glock:
+ pagefault_disable();
+ to->nofault = true;
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, written);
+ to->nofault = false;
+ pagefault_enable();
+ if (ret > 0)
+ written = ret;
+
+ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+ size_t leftover;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
- gfs2_glock_dq(gh);
+ gfs2_holder_allow_demote(gh);
+ leftover = fault_in_iov_iter_writeable(to, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(gh))
+ goto retry;
+ goto retry_under_glock;
+ }
+ }
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return written;
}
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -805,11 +870,21 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- size_t len = iov_iter_count(from);
- loff_t offset = iocb->ki_pos;
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
ssize_t ret;
/*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * For writes, iomap_dio_rw only triggers manual page faults, so we
+ * don't need to disable physical ones.
+ */
+
+ /*
* Deferred lock, even if its a write, since we do no allocation on
* this path. All we need to change is the atime, and this lock mode
* ensures that other nodes have flushed their buffered read caches
@@ -818,31 +893,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
* VFS does.
*/
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-
+retry_under_glock:
/* Silently fall back to buffered I/O when writing beyond EOF */
- if (offset + len > i_size_read(&ip->i_inode))
+ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
goto out;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
+ from->nofault = true;
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, read);
+ from->nofault = false;
+
if (ret == -ENOTBLK)
ret = 0;
+ if (ret > 0)
+ read = ret;
+
+ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(gh);
+ leftover = fault_in_iov_iter_readable(from, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(gh))
+ goto retry;
+ goto retry_under_glock;
+ }
+ }
out:
- gfs2_glock_dq(gh);
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return read;
}
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct gfs2_inode *ip;
struct gfs2_holder gh;
+ size_t prev_count = 0, window_size = 0;
size_t written = 0;
ssize_t ret;
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ */
+
if (iocb->ki_flags & IOCB_DIRECT) {
ret = gfs2_file_direct_read(iocb, to, &gh);
if (likely(ret != -ENOTBLK))
@@ -864,18 +970,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
ip = GFS2_I(iocb->ki_filp->f_mapping->host);
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
+retry_under_glock:
+ pagefault_disable();
ret = generic_file_read_iter(iocb, to);
+ pagefault_enable();
if (ret > 0)
written += ret;
- gfs2_glock_dq(&gh);
+
+ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(&gh);
+ leftover = fault_in_iov_iter_writeable(to, window_size);
+ gfs2_holder_disallow_demote(&gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(&gh)) {
+ if (written)
+ goto out_uninit;
+ goto retry;
+ }
+ goto retry_under_glock;
+ }
+ }
+ if (gfs2_holder_queued(&gh))
+ gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
return written ? written : ret;
}
+static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from,
+ struct gfs2_holder *gh)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_holder *statfs_gh = NULL;
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
+ ssize_t ret;
+
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ */
+
+ if (inode == sdp->sd_rindex) {
+ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
+ if (!statfs_gh)
+ return -ENOMEM;
+ }
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
+retry:
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+retry_under_glock:
+ if (inode == sdp->sd_rindex) {
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, statfs_gh);
+ if (ret)
+ goto out_unlock;
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ pagefault_disable();
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ pagefault_enable();
+ current->backing_dev_info = NULL;
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ read += ret;
+ }
+
+ if (inode == sdp->sd_rindex)
+ gfs2_glock_dq_uninit(statfs_gh);
+
+ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(gh);
+ leftover = fault_in_iov_iter_readable(from, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(gh)) {
+ if (read)
+ goto out_uninit;
+ goto retry;
+ }
+ goto retry_under_glock;
+ }
+ }
+out_unlock:
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
+out_uninit:
+ gfs2_holder_uninit(gh);
+ if (statfs_gh)
+ kfree(statfs_gh);
+ return read ? read : ret;
+}
+
/**
* gfs2_file_write_iter - Perform a write to a file
* @iocb: The io context
@@ -927,9 +1133,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out_unlock;
iocb->ki_flags |= IOCB_DSYNC;
- current->backing_dev_info = inode_to_bdi(inode);
- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
+ buffered = gfs2_file_buffered_write(iocb, from, &gh);
if (unlikely(buffered <= 0)) {
if (!ret)
ret = buffered;
@@ -943,7 +1147,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* the direct I/O range as we don't know if the buffered pages
* made it to disk.
*/
- iocb->ki_pos += buffered;
ret2 = generic_write_sync(iocb, buffered);
invalidate_mapping_pages(mapping,
(iocb->ki_pos - buffered) >> PAGE_SHIFT,
@@ -951,13 +1154,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!ret || ret2 > 0)
ret += ret2;
} else {
- current->backing_dev_info = inode_to_bdi(inode);
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
- if (likely(ret > 0)) {
- iocb->ki_pos += ret;
+ ret = gfs2_file_buffered_write(iocb, from, &gh);
+ if (likely(ret > 0))
ret = generic_write_sync(iocb, ret);
- }
}
out_unlock:
@@ -1338,8 +1537,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
if (fl->fl_type == F_UNLCK) {
do_unflock(file, fl);
@@ -1353,7 +1550,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
@@ -1386,7 +1583,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e0eaa9cf9fb6..19f38aee1b61 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
+static void __gfs2_glock_dq(struct gfs2_holder *gh);
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
+ /*
+ * Note that demote_ok is used for the lru process of disposing of
+ * glocks. For this purpose, we don't care if the glock's holders
+ * have the HIF_MAY_DEMOTE flag set or not. If someone is using
+ * them, don't demote.
+ */
if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
@@ -294,6 +301,9 @@ void gfs2_glock_queue_put(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
+ /* last put could call sleepable dlm api */
+ might_sleep();
+
if (lockref_put_or_lock(&gl->gl_lockref))
return;
@@ -301,46 +311,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
}
/**
- * may_grant - check if its ok to grant a new lock
+ * may_grant - check if it's ok to grant a new lock
* @gl: The glock
+ * @current_gh: One of the current holders of @gl
* @gh: The lock request which we wish to grant
*
- * Returns: true if its ok to grant the lock
+ * With our current compatibility rules, if a glock has one or more active
+ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
+ * @current_gh; they are all the same as far as compatibility with the new @gh
+ * goes.
+ *
+ * Returns true if it's ok to grant the lock.
*/
-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
-{
- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
+static inline bool may_grant(struct gfs2_glock *gl,
+ struct gfs2_holder *current_gh,
+ struct gfs2_holder *gh)
+{
+ if (current_gh) {
+ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
+
+ switch(current_gh->gh_state) {
+ case LM_ST_EXCLUSIVE:
+ /*
+ * Here we make a special exception to grant holders
+ * who agree to share the EX lock with other holders
+ * who also have the bit set. If the original holder
+ * has the LM_FLAG_NODE_SCOPE bit set, we grant more
+ * holders with the bit set.
+ */
+ return gh->gh_state == LM_ST_EXCLUSIVE &&
+ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
+ (gh->gh_flags & LM_FLAG_NODE_SCOPE);
- if (gh != gh_head) {
- /**
- * Here we make a special exception to grant holders who agree
- * to share the EX lock with other holders who also have the
- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
- * is set, we grant more holders with the bit set.
- */
- if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
- gh->gh_state == LM_ST_EXCLUSIVE &&
- (gh->gh_flags & LM_FLAG_NODE_SCOPE))
- return 1;
- if ((gh->gh_state == LM_ST_EXCLUSIVE ||
- gh_head->gh_state == LM_ST_EXCLUSIVE))
- return 0;
+ case LM_ST_SHARED:
+ case LM_ST_DEFERRED:
+ return gh->gh_state == current_gh->gh_state;
+
+ default:
+ return false;
+ }
}
+
if (gl->gl_state == gh->gh_state)
- return 1;
+ return true;
if (gh->gh_flags & GL_EXACT)
- return 0;
+ return false;
if (gl->gl_state == LM_ST_EXCLUSIVE) {
- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
- return 1;
- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
- return 1;
+ return gh->gh_state == LM_ST_SHARED ||
+ gh->gh_state == LM_ST_DEFERRED;
}
- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
- return 1;
- return 0;
+ if (gh->gh_flags & LM_FLAG_ANY)
+ return gl->gl_state != LM_ST_UNLOCKED;
+ return false;
}
static void gfs2_holder_wake(struct gfs2_holder *gh)
@@ -366,7 +389,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
struct gfs2_holder *gh, *tmp;
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
continue;
if (ret & LM_OUT_ERROR)
gh->gh_error = -EIO;
@@ -381,6 +404,123 @@ static void do_error(struct gfs2_glock *gl, const int ret)
}
/**
+ * demote_incompat_holders - demote incompatible demoteable holders
+ * @gl: the glock we want to promote
+ * @new_gh: the new holder to be promoted
+ */
+static void demote_incompat_holders(struct gfs2_glock *gl,
+ struct gfs2_holder *new_gh)
+{
+ struct gfs2_holder *gh;
+
+ /*
+ * Demote incompatible holders before we make ourselves eligible.
+ * (This holder may or may not allow auto-demoting, but we don't want
+ * to demote the new holder before it's even granted.)
+ */
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ /*
+ * Since holders are at the front of the list, we stop when we
+ * find the first non-holder.
+ */
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return;
+ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
+ !may_grant(gl, new_gh, gh)) {
+ /*
+ * We should not recurse into do_promote because
+ * __gfs2_glock_dq only calls handle_callback,
+ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
+ */
+ __gfs2_glock_dq(gh);
+ }
+ }
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (!list_empty(&gl->gl_holders)) {
+ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
+ gh_list);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * find_first_strong_holder - find the first non-demoteable holder
+ * @gl: the glock
+ *
+ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
+ */
+static inline struct gfs2_holder *
+find_first_strong_holder(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return NULL;
+ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/*
+ * gfs2_instantiate - Call the glops instantiate function
+ * @gl: The glock
+ *
+ * Returns: 0 if instantiate was successful, 2 if type specific operation is
+ * underway, or error.
+ */
+int gfs2_instantiate(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ int ret;
+
+again:
+ if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
+ return 0;
+
+ /*
+ * Since we unlock the lockref lock, we set a flag to indicate
+ * instantiate is in progress.
+ */
+ if (test_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) {
+ wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG,
+ TASK_UNINTERRUPTIBLE);
+ /*
+ * Here we just waited for a different instantiate to finish.
+ * But that may not have been successful, as when a process
+ * locks an inode glock _before_ it has an actual inode to
+ * instantiate into. So we check again. This process might
+ * have an inode to instantiate, so might be successful.
+ */
+ goto again;
+ }
+
+ set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
+
+ ret = glops->go_instantiate(gh);
+ if (!ret)
+ clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
+ clear_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG);
+ return ret;
+}
+
+/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
@@ -392,44 +532,59 @@ static int do_promote(struct gfs2_glock *gl)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh, *tmp;
+ struct gfs2_holder *gh, *tmp, *first_gh;
+ bool incompat_holders_demoted = false;
+ bool lock_released;
int ret;
restart:
+ first_gh = find_first_strong_holder(gl);
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ lock_released = false;
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
- if (may_grant(gl, gh)) {
- if (gh->gh_list.prev == &gl->gl_holders &&
- glops->go_lock) {
- spin_unlock(&gl->gl_lockref.lock);
- /* FIXME: eliminate this eventually */
- ret = glops->go_lock(gh);
- spin_lock(&gl->gl_lockref.lock);
- if (ret) {
- if (ret == 1)
- return 2;
- gh->gh_error = ret;
- list_del_init(&gh->gh_list);
- trace_gfs2_glock_queue(gh, 0);
- gfs2_holder_wake(gh);
- goto restart;
- }
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_promote(gh, 1);
+ if (!may_grant(gl, first_gh, gh)) {
+ /*
+ * If we get here, it means we may not grant this holder for
+ * some reason. If this holder is the head of the list, it
+ * means we have a blocked holder at the head, so return 1.
+ */
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1;
+ do_error(gl, 0);
+ break;
+ }
+ if (!incompat_holders_demoted) {
+ demote_incompat_holders(gl, first_gh);
+ incompat_holders_demoted = true;
+ first_gh = gh;
+ }
+ if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) &&
+ !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) {
+ lock_released = true;
+ spin_unlock(&gl->gl_lockref.lock);
+ ret = gfs2_instantiate(gh);
+ spin_lock(&gl->gl_lockref.lock);
+ if (ret) {
+ if (ret == 1)
+ return 2;
+ gh->gh_error = ret;
+ list_del_init(&gh->gh_list);
+ trace_gfs2_glock_queue(gh, 0);
gfs2_holder_wake(gh);
goto restart;
}
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_promote(gh, 0);
- gfs2_holder_wake(gh);
- continue;
}
- if (gh->gh_list.prev == &gl->gl_holders)
- return 1;
- do_error(gl, 0);
- break;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_promote(gh);
+ gfs2_holder_wake(gh);
+ /*
+ * If we released the gl_lockref.lock the holders list may have
+ * changed. For that reason, we start again at the start of
+ * the holders queue.
+ */
+ if (lock_released)
+ goto restart;
}
return 0;
}
@@ -723,23 +878,6 @@ out:
}
/**
- * find_first_holder - find the first "holder" gh
- * @gl: the glock
- */
-
-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
-
- if (!list_empty(&gl->gl_holders)) {
- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- return gh;
- }
- return NULL;
-}
-
-/**
* run_queue - do all outstanding tasks related to a glock
* @gl: The glock in question
* @nonblock: True if we must not block in run_queue
@@ -822,7 +960,7 @@ static void gfs2_glock_poke(struct gfs2_glock *gl)
struct gfs2_holder gh;
int error;
- gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh);
+ __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_);
error = gfs2_glock_nq(&gh);
if (!error)
gfs2_glock_dq(&gh);
@@ -1057,7 +1195,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
atomic_inc(&sdp->sd_glock_disposal);
gl->gl_node.next = NULL;
- gl->gl_flags = 0;
+ gl->gl_flags = glops->go_instantiate ? BIT(GLF_INSTANTIATE_NEEDED) : 0;
gl->gl_name = name;
lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
gl->gl_lockref.count = 1;
@@ -1119,12 +1257,12 @@ out:
*
*/
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
- struct gfs2_holder *gh)
+void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
+ struct gfs2_holder *gh, unsigned long ip)
{
INIT_LIST_HEAD(&gh->gh_list);
gh->gh_gl = gl;
- gh->gh_ip = _RET_IP_;
+ gh->gh_ip = ip;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
@@ -1354,15 +1492,20 @@ __acquires(&gl->gl_lockref.lock)
GLOCK_BUG_ON(gl, true);
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- try_futile = !may_grant(gl, gh);
+ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
+ struct gfs2_holder *first_gh;
+
+ first_gh = find_first_strong_holder(gl);
+ try_futile = !may_grant(gl, first_gh, gh);
+ }
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
goto fail;
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
+ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
goto trap_recursive;
if (try_futile &&
!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
@@ -1458,51 +1601,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
-/**
- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
- * @gh: the glock holder
- *
- */
+static inline bool needs_demote(struct gfs2_glock *gl)
+{
+ return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
+ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
+}
-void gfs2_glock_dq(struct gfs2_holder *gh)
+static void __gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
- spin_lock(&gl->gl_lockref.lock);
/*
- * If we're in the process of file system withdraw, we cannot just
- * dequeue any glocks until our journal is recovered, lest we
- * introduce file system corruption. We need two exceptions to this
- * rule: We need to allow unlocking of nondisk glocks and the glock
- * for our own journal that needs recovery.
+ * This while loop is similar to function demote_incompat_holders:
+ * If the glock is due to be demoted (which may be from another node
+ * or even if this holder is GL_NOCACHE), the weak holders are
+ * demoted as well, allowing the glock to be demoted.
*/
- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
- glock_blocked_by_withdraw(gl) &&
- gh->gh_gl != sdp->sd_jinode_gl) {
- sdp->sd_glock_dqs_held++;
- spin_unlock(&gl->gl_lockref.lock);
- might_sleep();
- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&gl->gl_lockref.lock);
- }
- if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ while (gh) {
+ /*
+ * If we're in the process of file system withdraw, we cannot
+ * just dequeue any glocks until our journal is recovered, lest
+ * we introduce file system corruption. We need two exceptions
+ * to this rule: We need to allow unlocking of nondisk glocks
+ * and the glock for our own journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
+ /*
+ * This holder should not be cached, so mark it for demote.
+ * Note: this should be done before the check for needs_demote
+ * below.
+ */
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+
+ list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_glock_queue(gh, 0);
+
+ /*
+ * If there hasn't been a demote request we are done.
+ * (Let the remaining holders, if any, keep holding it.)
+ */
+ if (!needs_demote(gl)) {
+ if (list_empty(&gl->gl_holders))
+ fast_path = 1;
+ break;
+ }
+ /*
+ * If we have another strong holder (we cannot auto-demote)
+ * we are done. It keeps holding it until it is done.
+ */
+ if (find_first_strong_holder(gl))
+ break;
- list_del_init(&gh->gh_list);
- clear_bit(HIF_HOLDER, &gh->gh_iflags);
- if (list_empty(&gl->gl_holders) &&
- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
- !test_bit(GLF_DEMOTE, &gl->gl_flags))
- fast_path = 1;
+ /*
+ * If we have a weak holder at the head of the list, it
+ * (and all others like it) must be auto-demoted. If there
+ * are no more weak holders, we exit the while loop.
+ */
+ gh = find_first_holder(gl);
+ }
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
gfs2_glock_add_to_lru(gl);
- trace_gfs2_glock_queue(gh, 0);
if (unlikely(!fast_path)) {
gl->gl_lockref.count++;
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1511,6 +1686,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
delay = gl->gl_hold_time;
__gfs2_glock_queue_work(gl, delay);
}
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ */
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ __gfs2_glock_dq(gh);
spin_unlock(&gl->gl_lockref.lock);
}
@@ -1673,6 +1861,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
{
+ struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
unsigned long delay = 0;
unsigned long holdtime;
unsigned long now = jiffies;
@@ -1687,6 +1876,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
+ /*
+ * Note 1: We cannot call demote_incompat_holders from handle_callback
+ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
+ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
+ * Plus, we only want to demote the holders if the request comes from
+ * a remote cluster node because local holder conflicts are resolved
+ * elsewhere.
+ *
+ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
+ * request that we set our state to UNLOCKED. Here we mock up a holder
+ * to make it look like someone wants the lock EX locally. Any SH
+ * and DF requests should be able to share the lock without demoting.
+ *
+ * Note 3: We only want to demote the demoteable holders when there
+ * are no more strong holders. The demoteable holders might as well
+ * keep the glock until the last strong holder is done with it.
+ */
+ if (!find_first_strong_holder(gl)) {
+ if (state == LM_ST_UNLOCKED)
+ mock_gh.gh_state = LM_ST_EXCLUSIVE;
+ demote_incompat_holders(gl, &mock_gh);
+ }
handle_callback(gl, state, delay, true);
__gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
@@ -1893,10 +2104,10 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
do {
rhashtable_walk_start(&iter);
- while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl))
- if (gl->gl_name.ln_sbd == sdp &&
- lockref_get_not_dead(&gl->gl_lockref))
+ while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) {
+ if (gl->gl_name.ln_sbd == sdp)
examiner(gl);
+ }
rhashtable_walk_stop(&iter);
} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
@@ -1919,7 +2130,7 @@ bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay)
void gfs2_cancel_delete_work(struct gfs2_glock *gl)
{
- if (cancel_delayed_work_sync(&gl->gl_delete)) {
+ if (cancel_delayed_work(&gl->gl_delete)) {
clear_bit(GLF_PENDING_DELETE, &gl->gl_flags);
gfs2_glock_put(gl);
}
@@ -1938,7 +2149,6 @@ static void flush_delete_work(struct gfs2_glock *gl)
&gl->gl_delete, 0);
}
}
- gfs2_glock_queue_work(gl, 0);
}
void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
@@ -1955,10 +2165,10 @@ void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
static void thaw_glock(struct gfs2_glock *gl)
{
- if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) {
- gfs2_glock_put(gl);
+ if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ return;
+ if (!lockref_get_not_dead(&gl->gl_lockref))
return;
- }
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gfs2_glock_queue_work(gl, 0);
}
@@ -1974,9 +2184,12 @@ static void clear_glock(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_lock(&gl->gl_lockref.lock);
- if (gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- __gfs2_glock_queue_work(gl, 0);
+ if (!__lockref_is_dead(&gl->gl_lockref)) {
+ gl->gl_lockref.count++;
+ if (gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ __gfs2_glock_queue_work(gl, 0);
+ }
spin_unlock(&gl->gl_lockref.lock);
}
@@ -2076,6 +2289,10 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
+ if (test_bit(HIF_MAY_DEMOTE, &iflags))
+ *p++ = 'D';
+ if (flags & GL_SKIP)
+ *p++ = 's';
*p = 0;
return buf;
}
@@ -2144,6 +2361,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'P';
if (test_bit(GLF_FREEING, gflags))
*p++ = 'x';
+ if (test_bit(GLF_INSTANTIATE_NEEDED, gflags))
+ *p++ = 'n';
+ if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags))
+ *p++ = 'N';
*p = 0;
return buf;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 31a8f2f649b5..4f8642301801 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
break;
+ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+ continue;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -188,13 +190,21 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
extern void gfs2_glock_hold(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
-extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- u16 flags, struct gfs2_holder *gh);
+
+extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh,
+ unsigned long ip);
+static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh) {
+ __gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
+}
+
extern void gfs2_holder_reinit(unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
extern int gfs2_glock_poll(struct gfs2_holder *gh);
+extern int gfs2_instantiate(struct gfs2_holder *gh);
extern int gfs2_glock_wait(struct gfs2_holder *gh);
extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq(struct gfs2_holder *gh);
@@ -239,7 +249,7 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
{
int error;
- gfs2_holder_init(gl, state, flags, gh);
+ __gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
error = gfs2_glock_nq(gh);
if (error)
@@ -325,6 +335,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
spin_unlock(&gl->gl_lockref.lock);
}
+static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
+static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 79c621c7863d..650ad77c4d0b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -228,7 +228,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
truncate_inode_pages_range(mapping, start, end);
- rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+ set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
}
static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
@@ -356,7 +356,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
struct address_space *mapping = gfs2_glock2aspace(gl);
truncate_inode_pages(mapping, 0);
if (ip) {
- set_bit(GIF_INVALID, &ip->i_flags);
+ set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
forget_all_cached_acls(&ip->i_inode);
security_inode_invalidate_secctx(&ip->i_inode);
gfs2_dir_hash_inval(ip);
@@ -476,33 +476,29 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
error = gfs2_dinode_in(ip, dibh->b_data);
brelse(dibh);
- clear_bit(GIF_INVALID, &ip->i_flags);
-
return error;
}
/**
- * inode_go_lock - operation done after an inode lock is locked by a process
+ * inode_go_instantiate - read in an inode if necessary
* @gh: The glock holder
*
* Returns: errno
*/
-static int inode_go_lock(struct gfs2_holder *gh)
+static int inode_go_instantiate(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
- if (!ip || (gh->gh_flags & GL_SKIP))
- return 0;
+ if (!ip) /* no inode to populate - read it in later */
+ goto out;
- if (test_bit(GIF_INVALID, &ip->i_flags)) {
- error = gfs2_inode_refresh(ip);
- if (error)
- return error;
- }
+ error = gfs2_inode_refresh(ip);
+ if (error)
+ goto out;
if (gh->gh_state != LM_ST_DEFERRED)
inode_dio_wait(&ip->i_inode);
@@ -515,9 +511,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
spin_unlock(&sdp->sd_trunc_lock);
wake_up(&sdp->sd_quota_wait);
- return 1;
+ error = 1;
}
+out:
return error;
}
@@ -740,7 +737,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_sync = inode_go_sync,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
- .go_lock = inode_go_lock,
+ .go_instantiate = inode_go_instantiate,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
@@ -750,7 +747,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_sync = rgrp_go_sync,
.go_inval = rgrp_go_inval,
- .go_lock = gfs2_rgrp_go_lock,
+ .go_instantiate = gfs2_rgrp_go_instantiate,
.go_dump = gfs2_rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_LVB,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 0fe49770166e..8c00fb389ae5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -119,7 +119,6 @@ struct gfs2_rgrpd {
u32 rd_flags;
u32 rd_extfail_pt; /* extent failure point */
#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
-#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */
#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
@@ -220,7 +219,7 @@ struct gfs2_glock_operations {
int (*go_xmote_bh)(struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
- int (*go_lock) (struct gfs2_holder *gh);
+ int (*go_instantiate) (struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
@@ -252,6 +251,7 @@ struct gfs2_lkstats {
enum {
/* States */
+ HIF_MAY_DEMOTE = 1,
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_WAIT = 10,
};
@@ -315,6 +315,7 @@ struct gfs2_alloc_parms {
enum {
GLF_LOCK = 1,
+ GLF_INSTANTIATE_NEEDED = 2, /* needs instantiate */
GLF_DEMOTE = 3,
GLF_PENDING_DEMOTE = 4,
GLF_DEMOTE_IN_PROGRESS = 5,
@@ -324,6 +325,7 @@ enum {
GLF_REPLY_PENDING = 9,
GLF_INITIAL = 10,
GLF_FROZEN = 11,
+ GLF_INSTANTIATE_IN_PROG = 12, /* instantiate happening now */
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
@@ -370,7 +372,6 @@ struct gfs2_glock {
};
enum {
- GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
@@ -386,9 +387,8 @@ struct gfs2_inode {
u64 i_generation;
u64 i_eattr;
unsigned long i_flags; /* GIF_... */
- struct gfs2_glock *i_gl; /* Move into i_gh? */
+ struct gfs2_glock *i_gl;
struct gfs2_holder i_iopen_gh;
- struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_qadata *i_qadata; /* quota allocation data */
struct gfs2_holder i_rgd_gh;
struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3130f85d2b3f..6424b903e885 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
}
glock_set_object(ip->i_gl, ip);
- set_bit(GIF_INVALID, &ip->i_flags);
+ set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
goto fail;
@@ -196,7 +196,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (type == DT_UNKNOWN) {
/* Inode glock must be locked already */
- error = gfs2_inode_refresh(GFS2_I(inode));
+ error = gfs2_instantiate(&i_gh);
if (error)
goto fail;
} else {
@@ -225,6 +225,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
return inode;
fail:
+ if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
+ glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ }
if (io_gl)
gfs2_glock_put(io_gl);
if (gfs2_holder_initialized(&i_gh))
@@ -727,18 +731,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_free_inode;
flush_delayed_work(&ip->i_gl->gl_work);
- glock_set_object(ip->i_gl, ip);
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (error)
goto fail_free_inode;
gfs2_cancel_delete_work(io_gl);
- glock_set_object(io_gl, ip);
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
if (error)
goto fail_gunlock2;
+ glock_set_object(ip->i_gl, ip);
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
goto fail_gunlock2;
@@ -754,6 +757,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock2;
+ glock_set_object(io_gl, ip);
gfs2_set_iop(inode);
insert_inode_hash(inode);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c3b00ba92ed2..0fb3c01bc557 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -932,7 +932,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
goto fail;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
- rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
+ rgd->rd_flags &= ~GFS2_RDF_PREFERRED;
if (rgd->rd_data > sdp->sd_max_rg_data)
sdp->sd_max_rg_data = rgd->rd_data;
spin_lock(&sdp->sd_rindex_spin);
@@ -1185,8 +1185,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
}
/**
- * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
- * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
+ * @gh: the glock holder representing the rgrpd to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
* Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
@@ -1194,10 +1194,11 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
* Returns: errno
*/
-static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
- struct gfs2_glock *gl = rgd->rd_gl;
unsigned int length = rgd->rd_length;
struct gfs2_bitmap *bi;
unsigned int x, y;
@@ -1225,21 +1226,18 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
}
}
- if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
- gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
- rgrp_set_bitmap_flags(rgd);
- rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
- rgd->rd_free_clone = rgd->rd_free;
- BUG_ON(rgd->rd_reserved);
- /* max out the rgrp allocation failure point */
- rgd->rd_extfail_pt = rgd->rd_free;
- }
+ gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
+ rgrp_set_bitmap_flags(rgd);
+ rgd->rd_flags |= GFS2_RDF_CHECK;
+ rgd->rd_free_clone = rgd->rd_free;
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
+ /* max out the rgrp allocation failure point */
+ rgd->rd_extfail_pt = rgd->rd_free;
if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
rgd->rd_bits[0].bi_bh->b_data);
- }
- else if (sdp->sd_args.ar_rgrplvb) {
+ } else if (sdp->sd_args.ar_rgrplvb) {
if (!gfs2_rgrp_lvb_valid(rgd)){
gfs2_consist_rgrpd(rgd);
error = -EIO;
@@ -1257,19 +1255,18 @@ fail:
bi->bi_bh = NULL;
gfs2_assert_warn(sdp, !bi->bi_clone);
}
-
return error;
}
-static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
+static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
{
u32 rl_flags;
- if (rgd->rd_flags & GFS2_RDF_UPTODATE)
+ if (!test_bit(GLF_INSTANTIATE_NEEDED, &gh->gh_gl->gl_flags))
return 0;
if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
- return gfs2_rgrp_bh_get(rgd);
+ return gfs2_instantiate(gh);
rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
rl_flags &= ~GFS2_RDF_MASK;
@@ -1280,7 +1277,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
rgrp_set_bitmap_flags(rgd);
rgd->rd_free_clone = rgd->rd_free;
- BUG_ON(rgd->rd_reserved);
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
/* max out the rgrp allocation failure point */
rgd->rd_extfail_pt = rgd->rd_free;
rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
@@ -1288,16 +1285,6 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
return 0;
}
-int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
-{
- struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
- struct gfs2_sbd *sdp = rgd->rd_sbd;
-
- if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
- return 0;
- return gfs2_rgrp_bh_get(rgd);
-}
-
/**
* gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
* @rgd: The resource group
@@ -1315,6 +1302,7 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
bi->bi_bh = NULL;
}
}
+ set_bit(GLF_INSTANTIATE_NEEDED, &rgd->rd_gl->gl_flags);
}
int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
@@ -2113,7 +2101,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
gfs2_rgrp_congested(rs->rs_rgd, loops))
goto skip_rgrp;
if (sdp->sd_args.ar_rgrplvb) {
- error = update_rgrp_lvb(rs->rs_rgd);
+ error = update_rgrp_lvb(rs->rs_rgd,
+ &ip->i_rgd_gh);
if (unlikely(error)) {
rgrp_unlock_local(rs->rs_rgd);
gfs2_glock_dq_uninit(&ip->i_rgd_gh);
@@ -2128,8 +2117,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
(loops == 0 && target > rs->rs_rgd->rd_extfail_pt))
goto skip_rgrp;
- if (sdp->sd_args.ar_rgrplvb)
- gfs2_rgrp_bh_get(rs->rs_rgd);
+ if (sdp->sd_args.ar_rgrplvb) {
+ error = gfs2_instantiate(&ip->i_rgd_gh);
+ if (error)
+ goto skip_rgrp;
+ }
/* Get a reservation if we don't already have one */
if (!gfs2_rs_active(rs))
@@ -2215,7 +2207,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
struct gfs2_rgrpd *rgd = rs->rs_rgd;
spin_lock(&rgd->rd_rsspin);
- BUG_ON(rgd->rd_reserved < rs->rs_reserved);
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved < rs->rs_reserved);
rgd->rd_reserved -= rs->rs_reserved;
spin_unlock(&rgd->rd_rsspin);
rs->rs_reserved = 0;
@@ -2476,9 +2468,9 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
spin_unlock(&rbm.rgd->rd_rsspin);
goto rgrp_error;
}
- BUG_ON(rbm.rgd->rd_reserved < *nblocks);
- BUG_ON(rbm.rgd->rd_free_clone < *nblocks);
- BUG_ON(rbm.rgd->rd_free < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_reserved < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free_clone < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free < *nblocks);
rbm.rgd->rd_reserved -= *nblocks;
rbm.rgd->rd_free_clone -= *nblocks;
rbm.rgd->rd_free -= *nblocks;
@@ -2765,8 +2757,6 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
void rgrp_lock_local(struct gfs2_rgrpd *rgd)
{
- BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) &&
- !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags));
mutex_lock(&rgd->rd_mutex);
}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index a6855fd796e0..3e2ca1fb4305 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 6e00d15ef0a8..5b121371508a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1244,8 +1244,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
if (ret)
return SHOULD_NOT_DELETE_DINODE;
- if (test_bit(GIF_INVALID, &ip->i_flags)) {
- ret = gfs2_inode_refresh(ip);
+ if (test_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags)) {
+ ret = gfs2_instantiate(gh);
if (ret)
return SHOULD_NOT_DELETE_DINODE;
}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index bd6c8e9e49db..a5deb9f86831 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -197,15 +197,14 @@ TRACE_EVENT(gfs2_demote_rq,
/* Promotion/grant of a glock */
TRACE_EVENT(gfs2_promote,
- TP_PROTO(const struct gfs2_holder *gh, int first),
+ TP_PROTO(const struct gfs2_holder *gh),
- TP_ARGS(gh, first),
+ TP_ARGS(gh),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( u64, glnum )
__field( u32, gltype )
- __field( int, first )
__field( u8, state )
),
@@ -213,14 +212,12 @@ TRACE_EVENT(gfs2_promote,
__entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
- __entry->first = first;
__entry->state = glock_trace_state(gh->gh_state);
),
- TP_printk("%u,%u glock %u:%llu promote %s %s",
+ TP_printk("%u,%u glock %u:%llu promote %s",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
(unsigned long long)__entry->glnum,
- __entry->first ? "first": "other",
glock_trace_name(__entry->state))
);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index cf345a86ef67..8241029a2a5d 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -454,6 +454,7 @@ void gfs2_consist_inode_i(struct gfs2_inode *ip,
(unsigned long long)ip->i_no_formal_ino,
(unsigned long long)ip->i_no_addr,
function, file, line);
+ gfs2_dump_glock(NULL, ip->i_gl, 1);
gfs2_withdraw(sdp);
}
@@ -475,6 +476,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
" function = %s, file = %s, line = %u\n",
(unsigned long long)rgd->rd_addr,
function, file, line);
+ gfs2_dump_glock(NULL, rgd->rd_gl, 1);
gfs2_withdraw(sdp);
}
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 4a95a92546a0..2a5143246282 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -462,8 +462,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
goto out;
if (S_ISDIR(main_inode->i_mode)) {
- if (fd.entrylength < sizeof(struct hfs_cat_dir))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
if (rec.type != HFS_CDR_DIR ||
@@ -483,8 +482,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
} else {
- if (fd.entrylength < sizeof(struct hfs_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
if (rec.type != HFS_CDR_FIL ||
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index cdf0edeeb278..5beb82652435 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6fef67c2a9f0..d08a8d1d40a4 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -509,8 +509,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
if (type == HFSPLUS_FOLDER) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_folder));
hfsplus_get_perms(inode, &folder->permissions, 1);
@@ -530,8 +529,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
} else if (type == HFSPLUS_FILE) {
struct hfsplus_cat_file *file = &entry.file;
- if (fd->entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_file));
@@ -588,8 +586,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
if (S_ISDIR(main_inode->i_mode)) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
@@ -614,8 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
} else {
struct hfsplus_cat_file *file = &entry.file;
- if (fd.entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
hfsplus_inode_write_fork(inode, &file->data_fork);
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 0350dc7821bf..51ae6f1eb4a5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFSPLUS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index d92c4af3e1b4..281dec8f636b 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -409,10 +409,10 @@ struct bplus_header
__le16 first_free; /* offset from start of header to
first free node in array */
union {
- struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
- subtree pointers */
- struct bplus_leaf_node external[0]; /* (external) 3-word entries giving
- sector runs */
+ /* (internal) 2-word entries giving subtree pointers */
+ DECLARE_FLEX_ARRAY(struct bplus_internal_node, internal);
+ /* (external) 3-word entries giving sector runs */
+ DECLARE_FLEX_ARRAY(struct bplus_leaf_node, external);
} u;
};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cdfb1ae78a3f..49d2e686be74 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1446,8 +1446,8 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, struct ucounts **ucounts,
- int creat_flags, int page_size_log)
+ vm_flags_t acctflag, int creat_flags,
+ int page_size_log)
{
struct inode *inode;
struct vfsmount *mnt;
@@ -1458,22 +1458,19 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);
- *ucounts = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt)
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *ucounts = current_ucounts();
- if (user_shm_lock(size, *ucounts)) {
- task_lock(current);
- pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ struct ucounts *ucounts = current_ucounts();
+
+ if (user_shm_lock(size, ucounts)) {
+ pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
current->comm, current->pid);
- task_unlock(current);
- } else {
- *ucounts = NULL;
- return ERR_PTR(-EPERM);
+ user_shm_unlock(size, ucounts);
}
+ return ERR_PTR(-EPERM);
}
file = ERR_PTR(-ENOSPC);
@@ -1498,10 +1495,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
iput(inode);
out:
- if (*ucounts) {
- user_shm_unlock(size, *ucounts);
- *ucounts = NULL;
- }
return file;
}
diff --git a/fs/inode.c b/fs/inode.c
index 37710ca863b5..3eba0940ffcf 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -190,8 +190,10 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
mapping->writeback_index = 0;
- __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock",
- &sb->s_type->invalidate_lock_key);
+ init_rwsem(&mapping->invalidate_lock);
+ lockdep_set_class_and_name(&mapping->invalidate_lock,
+ &sb->s_type->invalidate_lock_key,
+ "mapping.invalidate_lock");
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -426,11 +428,20 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
-static void inode_lru_list_add(struct inode *inode)
+static void __inode_add_lru(struct inode *inode, bool rotate)
{
+ if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ return;
+ if (atomic_read(&inode->i_count))
+ return;
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return;
+ if (!mapping_shrinkable(&inode->i_data))
+ return;
+
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
- else
+ else if (rotate)
inode->i_state |= I_REFERENCED;
}
@@ -441,16 +452,11 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
- I_FREEING | I_WILL_FREE)) &&
- !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
- inode_lru_list_add(inode);
+ __inode_add_lru(inode, false);
}
-
static void inode_lru_list_del(struct inode *inode)
{
-
if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -726,10 +732,6 @@ again:
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
- * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. If the inode has metadata buffers attached to
- * mapping->private_list then try to remove them.
- *
* If the inode has the I_REFERENCED flag set, then it means that it has been
* used recently - the flag is set in iput_final(). When we encounter such an
* inode, clear the flag and move it to the back of the LRU so it gets another
@@ -745,31 +747,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
struct inode *inode = container_of(item, struct inode, i_lru);
/*
- * we are inverting the lru lock/inode->i_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock/inode->i_lock here, so use a
+ * trylock. If we fail to get the lock, just skip it.
*/
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
/*
- * Referenced or dirty inodes are still in use. Give them another pass
- * through the LRU as we canot reclaim them now.
+ * Inodes can get referenced, redirtied, or repopulated while
+ * they're already on the LRU, and this can make them
+ * unreclaimable for a while. Remove them lazily here; iput,
+ * sync, or the last page cache deletion will requeue them.
*/
if (atomic_read(&inode->i_count) ||
- (inode->i_state & ~I_REFERENCED)) {
+ (inode->i_state & ~I_REFERENCED) ||
+ !mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
- /* recently referenced inodes get one more pass */
+ /* Recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
+ /*
+ * On highmem systems, mapping_shrinkable() permits dropping
+ * page cache in order to free up struct inodes: lowmem might
+ * be under pressure before the cache inside the highmem zone.
+ */
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
@@ -1636,7 +1646,7 @@ static void iput_final(struct inode *inode)
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
- inode_add_lru(inode);
+ __inode_add_lru(inode, true);
spin_unlock(&inode->i_lock);
return;
}
@@ -1780,12 +1790,13 @@ EXPORT_SYMBOL(generic_update_time);
* This does the actual work of updating an inodes time or version. Must have
* had called mnt_want_write() before calling this.
*/
-static int update_time(struct inode *inode, struct timespec64 *time, int flags)
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
if (inode->i_op->update_time)
return inode->i_op->update_time(inode, time, flags);
return generic_update_time(inode, time, flags);
}
+EXPORT_SYMBOL(inode_update_time);
/**
* atime_needs_update - update the access time
@@ -1855,7 +1866,7 @@ void touch_atime(const struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
now = current_time(inode);
- update_time(inode, &now, S_ATIME);
+ inode_update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
sb_end_write(inode->i_sb);
@@ -2000,7 +2011,7 @@ int file_update_time(struct file *file)
if (__mnt_want_write_file(file))
return 0;
- ret = update_time(inode, &now, sync_it);
+ ret = inode_update_time(inode, &now, sync_it);
__mnt_drop_write_file(file);
return ret;
diff --git a/fs/internal.h b/fs/internal.h
index 3cd065c8a66b..7979ff8d168c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,22 +23,11 @@ struct pipe_inode_info;
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-void iterate_bdevs(void (*)(struct block_device *, void *), void *);
void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
- return 0;
-}
-static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
- void *arg)
-{
-}
static inline int emergency_thaw_bdev(struct super_block *sb)
{
return 0;
@@ -149,7 +138,6 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern void inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 6c55362c1f99..afd955d53db9 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -14,6 +14,8 @@
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <uapi/linux/io_uring.h>
#include "io-wq.h"
@@ -139,6 +141,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match);
+static void create_worker_cb(struct callback_head *cb);
static bool io_worker_get(struct io_worker *worker)
{
@@ -173,20 +176,52 @@ static void io_worker_ref_put(struct io_wq *wq)
complete(&wq->worker_done);
}
+static void io_worker_cancel_cb(struct io_worker *worker)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+
+ atomic_dec(&acct->nr_running);
+ raw_spin_lock(&worker->wqe->lock);
+ acct->nr_workers--;
+ raw_spin_unlock(&worker->wqe->lock);
+ io_worker_ref_put(wq);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
+}
+
+static bool io_task_worker_match(struct callback_head *cb, void *data)
+{
+ struct io_worker *worker;
+
+ if (cb->func != create_worker_cb)
+ return false;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker == data;
+}
+
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wq *wq = wqe->wq;
- if (refcount_dec_and_test(&worker->ref))
- complete(&worker->ref_done);
+ while (1) {
+ struct callback_head *cb = task_work_cancel_match(wq->task,
+ io_task_worker_match, worker);
+
+ if (!cb)
+ break;
+ io_worker_cancel_cb(worker);
+ }
+
+ io_worker_release(worker);
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wqe->lock);
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- acct->nr_workers--;
preempt_disable();
io_wqe_dec_running(worker);
worker->flags = 0;
@@ -246,8 +281,6 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
*/
static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
- bool do_create = false;
-
/*
* Most likely an attempt to queue unbounded work on an io_wq that
* wasn't setup with any unbounded workers.
@@ -256,18 +289,15 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
pr_warn_once("io-wq is not configured for unbound workers");
raw_spin_lock(&wqe->lock);
- if (acct->nr_workers < acct->max_workers) {
- acct->nr_workers++;
- do_create = true;
+ if (acct->nr_workers >= acct->max_workers) {
+ raw_spin_unlock(&wqe->lock);
+ return true;
}
+ acct->nr_workers++;
raw_spin_unlock(&wqe->lock);
- if (do_create) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- return create_io_worker(wqe->wq, wqe, acct->index);
- }
-
- return true;
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ return create_io_worker(wqe->wq, wqe, acct->index);
}
static void io_wqe_inc_running(struct io_worker *worker)
@@ -329,8 +359,10 @@ static bool io_queue_worker_create(struct io_worker *worker,
init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
- if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
+ clear_bit_unlock(0, &worker->create_state);
return true;
+ }
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
@@ -562,6 +594,8 @@ static int io_wqe_worker(void *data)
snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
+ audit_alloc_kernel(current);
+
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
long ret;
@@ -574,6 +608,7 @@ loop:
}
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
+ acct->nr_workers--;
raw_spin_unlock(&wqe->lock);
__set_current_state(TASK_RUNNING);
break;
@@ -589,9 +624,7 @@ loop:
if (!get_signal(&ksig))
continue;
- if (fatal_signal_pending(current))
- break;
- continue;
+ break;
}
last_timeout = !ret;
}
@@ -601,6 +634,7 @@ loop:
io_worker_handle_work(worker);
}
+ audit_free(current);
io_worker_exit(worker);
return 0;
}
@@ -723,11 +757,8 @@ static void io_workqueue_create(struct work_struct *work)
struct io_worker *worker = container_of(work, struct io_worker, work);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ if (!io_queue_worker_create(worker, acct, create_worker_cont))
kfree(worker);
- }
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -1157,17 +1188,9 @@ static void io_wq_exit_workers(struct io_wq *wq)
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
- struct io_wqe_acct *acct;
worker = container_of(cb, struct io_worker, create_work);
- acct = io_wqe_get_acct(worker);
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&worker->wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&worker->wqe->lock);
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ io_worker_cancel_cb(worker);
}
rcu_read_lock();
@@ -1285,26 +1308,43 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
*/
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
- int i, node, prev = 0;
+ int prev[IO_WQ_ACCT_NR];
+ bool first_node = true;
+ int i, node;
+
+ BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND);
+ BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
+ BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
for (i = 0; i < 2; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
+ for (i = 0; i < IO_WQ_ACCT_NR; i++)
+ prev[i] = 0;
+
rcu_read_lock();
for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
struct io_wqe_acct *acct;
+ raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- acct = &wq->wqes[node]->acct[i];
- prev = max_t(int, acct->max_workers, prev);
+ acct = &wqe->acct[i];
+ if (first_node)
+ prev[i] = max_t(int, acct->max_workers, prev[i]);
if (new_count[i])
acct->max_workers = new_count[i];
- new_count[i] = prev;
}
+ raw_spin_unlock(&wqe->lock);
+ first_node = false;
}
rcu_read_unlock();
+
+ for (i = 0; i < IO_WQ_ACCT_NR; i++)
+ new_count[i] = prev[i];
+
return 0;
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bf5c4c533760..41bf37674a49 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -29,6 +29,17 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_for_each_resume(pos, prv) \
+ for (; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+} while (0)
+
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
+static inline void wq_list_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ node->next = list->first;
+ if (!node->next)
+ list->last = node;
+ WRITE_ONCE(list->first, node);
+}
+
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
last->next = NULL;
}
+static inline void __wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ list->last->next = to->next;
+ to->next = list->first;
+ INIT_WQ_LIST(list);
+}
+
+static inline bool wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ if (!wq_list_empty(list)) {
+ __wq_list_splice(list, to);
+ return true;
+ }
+ return false;
+}
+
+static inline void wq_stack_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_node *stack)
+{
+ node->next = stack->next;
+ stack->next = node;
+}
+
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
wq_list_cut(list, node, prev);
}
-#define wq_list_for_each(pos, prv, head) \
- for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+static inline
+struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
+{
+ struct io_wq_work_node *node = stack->next;
-#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list) do { \
- (list)->first = NULL; \
- (list)->last = NULL; \
-} while (0)
+ stack->next = node->next;
+ return node;
+}
struct io_wq_work {
struct io_wq_work_node list;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 16fb7436043c..b07196b4511c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,8 @@
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/security.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -103,11 +105,14 @@
#define IORING_MAX_REG_BUFFERS (1U << 14)
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
+
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+ REQ_F_ASYNC_DATA)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -195,8 +200,10 @@ struct io_rings {
};
enum io_uring_cmd_flags {
- IO_URING_F_NONBLOCK = 1,
- IO_URING_F_COMPLETE_DEFER = 2,
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
};
struct io_mapped_ubuf {
@@ -305,26 +312,16 @@ struct io_submit_link {
};
struct io_submit_state {
- struct blk_plug plug;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct io_wq_work_node free_list;
+ /* batch completion logic */
+ struct io_wq_work_list compl_reqs;
struct io_submit_link link;
- /*
- * io_kiocb alloc cache
- */
- void *reqs[IO_REQ_CACHE_SIZE];
- unsigned int free_reqs;
-
bool plug_started;
-
- /*
- * Batch completion logic
- */
- struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
- unsigned int compl_nr;
- /* inline/task_work completion list, under ->uring_lock */
- struct list_head free_list;
-
- unsigned int ios_left;
+ bool need_plug;
+ unsigned short submit_nr;
+ struct blk_plug plug;
};
struct io_ring_ctx {
@@ -368,6 +365,7 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
+ int rsrc_cached_refs;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
@@ -384,7 +382,7 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
- struct list_head locked_free_list;
+ struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
@@ -399,11 +397,9 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct eventfd_ctx *cq_ev_fd;
- struct wait_queue_head poll_wait;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
- struct fasync_struct *cq_fasync;
unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
@@ -418,7 +414,7 @@ struct io_ring_ctx {
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head iopoll_list;
+ struct io_wq_work_list iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
@@ -457,6 +453,8 @@ struct io_ring_ctx {
struct work_struct exit_work;
struct list_head tctx_list;
struct completion ref_comp;
+ u32 iowq_limits[2];
+ bool iowq_limits_set;
};
};
@@ -502,6 +500,7 @@ struct io_poll_update {
struct io_close {
struct file *file;
int fd;
+ u32 file_slot;
};
struct io_timeout_data {
@@ -578,7 +577,6 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
- struct io_buffer *kbuf;
};
struct io_open {
@@ -690,11 +688,6 @@ struct io_hardlink {
int flags;
};
-struct io_completion {
- struct file *file;
- u32 cflags;
-};
-
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -708,10 +701,15 @@ struct io_async_msghdr {
struct sockaddr_storage addr;
};
-struct io_async_rw {
+struct io_rw_state {
+ struct iov_iter iter;
+ struct iov_iter_state iter_state;
struct iovec fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+ struct io_rw_state s;
const struct iovec *free_iovec;
- struct iov_iter iter;
size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -735,13 +733,12 @@ enum {
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
- REQ_F_DONT_REISSUE_BIT,
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_ASYNC_DATA_BIT,
/* keep async read/write and isreg together and in order */
- REQ_F_NOWAIT_READ_BIT,
- REQ_F_NOWAIT_WRITE_BIT,
+ REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
@@ -782,12 +779,8 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* don't attempt request reissue, see io_rw_reissue() */
- REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT),
- /* supports async reads */
- REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
- /* supports async writes */
- REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
+ /* supports async reads/writes */
+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
@@ -796,6 +789,8 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* ->async_data allocated */
+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
};
struct async_poll {
@@ -852,39 +847,41 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
- /* use only after cleaning per-op data, see io_clean_op() */
- struct io_completion compl;
};
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
-
u16 buf_index;
+ unsigned int flags;
+
+ u64 user_data;
u32 result;
+ u32 cflags;
struct io_ring_ctx *ctx;
- unsigned int flags;
- atomic_t refs;
struct task_struct *task;
- u64 user_data;
- struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
- /* used with ctx->iopoll_list with reads/writes */
- struct list_head inflight_entry;
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ atomic_t refs;
+ struct io_kiocb *link;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
+ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
struct io_wq_work work;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
-
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
};
struct io_tctx_node {
@@ -902,12 +899,12 @@ struct io_defer_entry {
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
@@ -915,8 +912,10 @@ struct io_op_def {
unsigned buffer_select : 1;
/* do prep async if is going to be punted */
unsigned needs_async_setup : 1;
- /* should block plug */
- unsigned plug : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
+ /* skip auditing */
+ unsigned audit_skip : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
@@ -930,6 +929,7 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -939,16 +939,19 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -957,15 +960,20 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_POLL_REMOVE] = {
+ .audit_skip = 1,
},
- [IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
@@ -983,18 +991,23 @@ static const struct io_op_def io_op_defs[] = {
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
+ .audit_skip = 1,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
},
- [IORING_OP_ASYNC_CANCEL] = {},
+ [IORING_OP_ASYNC_CANCEL] = {
+ .audit_skip = 1,
+ },
[IORING_OP_LINK_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_CONNECT] = {
@@ -1009,14 +1022,19 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {},
[IORING_OP_CLOSE] = {},
- [IORING_OP_FILES_UPDATE] = {},
- [IORING_OP_STATX] = {},
+ [IORING_OP_FILES_UPDATE] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_STATX] = {
+ .audit_skip = 1,
+ },
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1025,39 +1043,50 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_MADVISE] = {},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .audit_skip = 1,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .audit_skip = 1,
},
[IORING_OP_OPENAT2] = {
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_PROVIDE_BUFFERS] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_REMOVE_BUFFERS] = {
+ .audit_skip = 1,
},
- [IORING_OP_PROVIDE_BUFFERS] = {},
- [IORING_OP_REMOVE_BUFFERS] = {},
[IORING_OP_TEE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
@@ -1080,7 +1109,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags);
+ s32 res, u32 cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
@@ -1095,11 +1124,13 @@ static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index);
+static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
+
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static struct kmem_cache *req_cachep;
@@ -1165,6 +1196,12 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+ if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+ __io_submit_flush_completions(ctx);
+}
+
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1178,13 +1215,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
__io_req_set_refcount(req, 1);
}
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH 100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct percpu_ref *ref = req->fixed_rsrc_refs;
+
+ if (ref) {
+ if (ref == &ctx->rsrc_node->refs)
+ ctx->rsrc_cached_refs++;
+ else
+ percpu_ref_put(ref);
+ }
+}
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ if (req->fixed_rsrc_refs)
+ percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ if (ctx->rsrc_cached_refs) {
+ percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ ctx->rsrc_cached_refs = 0;
+ }
+}
+
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+ percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
+
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+{
if (!req->fixed_rsrc_refs) {
req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- percpu_ref_get(req->fixed_rsrc_refs);
+ ctx->rsrc_cached_refs--;
+ if (unlikely(ctx->rsrc_cached_refs < 0))
+ io_rsrc_refs_refill(ctx);
}
}
@@ -1217,6 +1293,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
return false;
}
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+ return req->flags & REQ_F_ASYNC_DATA;
+}
+
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
@@ -1228,7 +1309,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
req->result = res;
}
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1240,7 +1321,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
return !req->timeout.off;
}
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
fallback_work.work);
@@ -1253,15 +1334,13 @@ static void io_fallback_req_func(struct work_struct *work)
req->io_task_work.func(req, &locked);
if (locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
-
}
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
@@ -1298,7 +1377,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
@@ -1307,7 +1385,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
- INIT_LIST_HEAD(&ctx->iopoll_list);
+ INIT_WQ_LIST(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1316,9 +1394,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
- INIT_LIST_HEAD(&ctx->submit_state.free_list);
- INIT_LIST_HEAD(&ctx->locked_free_list);
+ ctx->submit_state.free_list.next = NULL;
+ INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+ INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -1346,21 +1425,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_ASYNC_READ 0x1UL
-#define FFS_ASYNC_WRITE 0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG 0x4UL
-#else
-#define FFS_ISREG 0x0UL
-#endif
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
- return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+ return req->flags & REQ_F_FIXED_FILE;
}
-static void io_req_track_inflight(struct io_kiocb *req)
+static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
@@ -1368,11 +1442,6 @@ static void io_req_track_inflight(struct io_kiocb *req)
}
}
-static inline void io_unprep_linked_timeout(struct io_kiocb *req)
-{
- req->flags &= ~REQ_F_LINK_TIMEOUT;
-}
-
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!req->link))
@@ -1443,15 +1512,19 @@ static void io_prep_async_link(struct io_kiocb *req)
}
}
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+ struct io_submit_state *state = &req->ctx->submit_state;
+
+ wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
- /* must not take the lock, NULL it as a precaution */
- locked = NULL;
-
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1492,7 +1565,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
}
}
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1506,7 +1579,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
@@ -1539,7 +1612,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
@@ -1609,14 +1682,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait)) {
- wake_up_interruptible(&ctx->poll_wait);
- kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
- }
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1630,10 +1697,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait)) {
- wake_up_interruptible(&ctx->poll_wait);
- kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
- }
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1729,7 +1792,7 @@ static inline void io_get_task_refs(int nr)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_overflow_cqe *ocqe;
@@ -1757,7 +1820,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_uring_cqe *cqe;
@@ -1780,13 +1843,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
/* not as hot to bloat with inlining */
static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
return __io_cqring_fill_event(ctx, user_data, res, cflags);
}
-static void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1805,40 +1868,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
req->link = NULL;
}
}
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
- } else {
- if (!percpu_ref_tryget(&ctx->refs))
- req = NULL;
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
-
- if (req) {
- io_cqring_ev_posted(ctx);
- percpu_ref_put(&ctx->refs);
- }
-}
-
-static inline bool io_req_needs_clean(struct io_kiocb *req)
-{
- return req->flags & IO_REQ_CLEAN_FLAGS;
+ io_cqring_ev_posted(ctx);
}
-static void io_req_complete_state(struct io_kiocb *req, long res,
- unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- if (io_req_needs_clean(req))
- io_clean_op(req);
req->result = res;
- req->compl.cflags = cflags;
+ req->cflags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- long res, unsigned cflags)
+ s32 res, u32 cflags)
{
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_state(req, res, cflags);
@@ -1846,12 +1896,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
__io_req_complete(req, 0, res, 0);
}
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
io_req_complete_post(req, res, 0);
@@ -1885,7 +1935,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->locked_free_list, &state->free_list);
+ wq_list_splice(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
spin_unlock(&ctx->completion_lock);
}
@@ -1894,7 +1944,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- int nr;
/*
* If we have more than a batch's worth of requests in our IRQ side
@@ -1903,20 +1952,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
*/
if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
io_flush_cached_locked_reqs(ctx, state);
-
- nr = state->free_reqs;
- while (!list_empty(&state->free_list)) {
- struct io_kiocb *req = list_first_entry(&state->free_list,
- struct io_kiocb, inflight_entry);
-
- list_del(&req->inflight_entry);
- state->reqs[nr++] = req;
- if (nr == ARRAY_SIZE(state->reqs))
- break;
- }
-
- state->free_reqs = nr;
- return nr != 0;
+ return !!state->free_list.next;
}
/*
@@ -1925,38 +1961,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ void *reqs[IO_REQ_ALLOC_BATCH];
+ struct io_kiocb *req;
int ret, i;
- BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
- if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
- goto got_req;
+ if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+ return true;
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
- state->reqs);
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
- state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
- return NULL;
+ reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!reqs[0])
+ return false;
ret = 1;
}
- for (i = 0; i < ret; i++)
- io_preinit_req(state->reqs[i], ctx);
- state->free_reqs = ret;
-got_req:
- state->free_reqs--;
- return state->reqs[state->free_reqs];
+ percpu_ref_get_many(&ctx->refs, ret);
+ for (i = 0; i < ret; i++) {
+ req = reqs[i];
+
+ io_preinit_req(req, ctx);
+ wq_stack_add_head(&req->comp_list, &state->free_list);
+ }
+ return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+ if (unlikely(!ctx->submit_state.free_list.next))
+ return __io_alloc_req_refill(ctx);
+ return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+ struct io_wq_work_node *node;
+
+ node = wq_stack_extract(&ctx->submit_state.free_list);
+ return container_of(node, struct io_kiocb, comp_list);
}
static inline void io_put_file(struct file *file)
@@ -1965,35 +2017,28 @@ static inline void io_put_file(struct file *file)
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
- if (io_req_needs_clean(req))
+ if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
if (!(flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
- if (req->async_data) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
}
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
spin_lock(&ctx->completion_lock);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
spin_unlock(&ctx->completion_lock);
-
- percpu_ref_put(&ctx->refs);
}
static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2079,47 +2124,45 @@ static bool io_disarm_next(struct io_kiocb *req)
return posted;
}
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ spin_lock(&ctx->completion_lock);
+ posted = io_disarm_next(req);
+ if (posted)
+ io_commit_cqring(req->ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (posted)
+ io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
+ if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+ return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & IO_DISARM_MASK) {
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(req->ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- }
+ if (unlikely(req->flags & IO_DISARM_MASK))
+ __io_req_find_next_prep(req);
nxt = req->link;
req->link = NULL;
return nxt;
}
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
- return __io_req_find_next(req);
-}
-
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
if (*locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
*locked = false;
}
@@ -2136,7 +2179,7 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node;
- if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+ if (!tctx->task_list.first && locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
@@ -2199,8 +2242,9 @@ static void io_req_task_work_add(struct io_kiocb *req)
* will do the job.
*/
notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (!task_work_add(tsk, &tctx->task_work, notify)) {
- wake_up_process(tsk);
+ if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+ if (notify == TWA_NONE)
+ wake_up_process(tsk);
return;
}
@@ -2278,77 +2322,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
io_free_req(req);
}
-struct req_batch {
- struct task_struct *task;
- int task_refs;
- int ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+ struct io_wq_work_node *node)
+ __must_hold(&ctx->uring_lock)
{
- rb->task_refs = 0;
- rb->ctx_refs = 0;
- rb->task = NULL;
-}
+ struct task_struct *task = NULL;
+ int task_refs = 0;
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
- struct req_batch *rb)
-{
- if (rb->ctx_refs)
- percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
-}
+ do {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
- struct io_submit_state *state)
-{
- io_queue_next(req);
- io_dismantle_req(req);
+ if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
- if (req->task != rb->task) {
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
- }
- rb->task_refs++;
- rb->ctx_refs++;
+ io_req_put_rsrc_locked(req, ctx);
+ io_queue_next(req);
+ io_dismantle_req(req);
- if (state->free_reqs != ARRAY_SIZE(state->reqs))
- state->reqs[state->free_reqs++] = req;
- else
- list_add(&req->inflight_entry, &state->free_list);
+ if (req->task != task) {
+ if (task)
+ io_put_task(task, task_refs);
+ task = req->task;
+ task_refs = 0;
+ }
+ task_refs++;
+ node = req->comp_list.next;
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ } while (node);
+
+ if (task)
+ io_put_task(task, task_refs);
}
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_submit_state *state = &ctx->submit_state;
- int i, nr = state->compl_nr;
- struct req_batch rb;
spin_lock(&ctx->completion_lock);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
+ wq_list_for_each(node, prev, &state->compl_reqs) {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
__io_cqring_fill_event(ctx, req->user_data, req->result,
- req->compl.cflags);
+ req->cflags);
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_init_req_batch(&rb);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_req_free_batch_finish(ctx, &rb);
- state->compl_nr = 0;
+ io_free_batch_list(ctx, state->compl_reqs.first);
+ INIT_WQ_LIST(&state->compl_reqs);
}
/*
@@ -2408,12 +2437,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
- struct io_buffer *kbuf;
-
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- return io_put_kbuf(req, kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static inline bool io_run_task_work(void)
@@ -2427,57 +2453,22 @@ static inline bool io_run_task_work(void)
return false;
}
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
- struct req_batch rb;
- struct io_kiocb *req;
-
- /* order with ->result store in io_complete_rw_iopoll() */
- smp_rmb();
-
- io_init_req_batch(&rb);
- while (!list_empty(done)) {
- req = list_first_entry(done, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
-
- if (READ_ONCE(req->result) == -EAGAIN &&
- !(req->flags & REQ_F_DONT_REISSUE)) {
- req->iopoll_completed = 0;
- io_req_task_queue_reissue(req);
- continue;
- }
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- io_put_rw_kbuf(req));
- (*nr_events)++;
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
-{
- struct io_kiocb *req, *tmp;
- LIST_HEAD(done);
- bool spin;
+ struct io_wq_work_node *pos, *start, *prev;
+ unsigned int poll_flags = BLK_POLL_NOSLEEP;
+ DEFINE_IO_COMP_BATCH(iob);
+ int nr_events = 0;
/*
* Only spin for completions if we don't have multiple devices hanging
- * off our complete list, and we're under the requested amount.
+ * off our complete list.
*/
- spin = !ctx->poll_multi_queue && *nr_events < min;
+ if (ctx->poll_multi_queue || force_nonspin)
+ poll_flags |= BLK_POLL_ONESHOT;
- list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+ wq_list_for_each(pos, start, &ctx->iopoll_list) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
struct kiocb *kiocb = &req->rw.kiocb;
int ret;
@@ -2486,47 +2477,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->inflight_entry, &done);
- continue;
- }
- if (!list_empty(&done))
+ if (READ_ONCE(req->iopoll_completed))
break;
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
if (unlikely(ret < 0))
return ret;
else if (ret)
- spin = false;
+ poll_flags |= BLK_POLL_ONESHOT;
/* iopoll may have completed current req */
- if (READ_ONCE(req->iopoll_completed))
- list_move_tail(&req->inflight_entry, &done);
+ if (!rq_list_empty(iob.req_list) ||
+ READ_ONCE(req->iopoll_completed))
+ break;
}
- if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ if (!rq_list_empty(iob.req_list))
+ iob.complete(&iob);
+ else if (!pos)
+ return 0;
- return 0;
+ prev = start;
+ wq_list_for_each_resume(pos, prev) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+
+ /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+ if (!smp_load_acquire(&req->iopoll_completed))
+ break;
+ __io_cqring_fill_event(ctx, req->user_data, req->result,
+ io_put_rw_kbuf(req));
+ nr_events++;
+ }
+
+ if (unlikely(!nr_events))
+ return 0;
+
+ io_commit_cqring(ctx);
+ io_cqring_ev_posted_iopoll(ctx);
+ pos = start ? start->next : ctx->iopoll_list.first;
+ wq_list_cut(&ctx->iopoll_list, prev, start);
+ io_free_batch_list(ctx, pos);
+ return nr_events;
}
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->iopoll_list)) {
- unsigned int nr_events = 0;
-
- io_do_iopoll(ctx, &nr_events, 0);
-
+ while (!wq_list_empty(&ctx->iopoll_list)) {
/* let it sleep and repeat later if can't complete a request */
- if (nr_events == 0)
+ if (io_do_iopoll(ctx, true) == 0)
break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
@@ -2573,7 +2579,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
u32 tail = ctx->cached_cq_tail;
mutex_unlock(&ctx->uring_lock);
@@ -2582,11 +2588,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail ||
- list_empty(&ctx->iopoll_list))
+ wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
- } while (!ret && nr_events < min && !need_resched());
+ ret = io_do_iopoll(ctx, !min);
+ if (ret < 0)
+ break;
+ nr_events += ret;
+ ret = 0;
+ } while (nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -2611,10 +2621,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *rw = req->async_data;
- if (!rw)
+ if (!req_has_async_data(req))
return !io_req_prep_async(req);
- /* may have left rw->iter inconsistent on -EIOCBQUEUED */
- iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
+ iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
return true;
}
@@ -2658,7 +2667,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (req->rw.kiocb.ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result) {
+ if (unlikely(res != req->result)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
@@ -2673,16 +2682,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
unsigned int cflags = io_put_rw_kbuf(req);
- long res = req->result;
+ int res = req->result;
if (*locked) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
io_req_complete_state(req, res, cflags);
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
+ io_req_add_compl_list(req);
} else {
io_req_complete_post(req, res, cflags);
}
@@ -2696,7 +2700,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
}
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2707,24 +2711,22 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
io_req_task_work_add(req);
}
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (unlikely(res != req->result)) {
- if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
- io_resubmit_prep(req))) {
- req_set_fail(req);
- req->flags |= REQ_F_DONT_REISSUE;
+ if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ req->flags |= REQ_F_REISSUE;
+ return;
}
+ req->result = res;
}
- WRITE_ONCE(req->result, res);
- /* order with io_iopoll_complete() checking ->result */
- smp_wmb();
- WRITE_ONCE(req->iopoll_completed, 1);
+ /* order with io_iopoll_complete() checking ->iopoll_completed */
+ smp_store_release(&req->iopoll_completed, 1);
}
/*
@@ -2733,13 +2735,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- const bool in_async = io_wq_current_is_worker();
+ const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(in_async))
+ if (unlikely(needs_lock))
mutex_lock(&ctx->uring_lock);
/*
@@ -2747,23 +2749,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
- unsigned int queue_num0, queue_num1;
- list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
- inflight_entry);
-
- if (list_req->file != req->file) {
+ list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+ comp_list);
+ if (list_req->file != req->file)
ctx->poll_multi_queue = true;
- } else {
- queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
- queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
- if (queue_num0 != queue_num1)
- ctx->poll_multi_queue = true;
- }
}
/*
@@ -2771,11 +2765,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
else
- list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
- if (unlikely(in_async)) {
+ if (unlikely(needs_lock)) {
/*
* If IORING_SETUP_SQPOLL is enabled, sqes are either handle
* in sq thread task context or in io worker task context. If
@@ -2800,10 +2794,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
- umode_t mode = file_inode(file)->i_mode;
-
if (S_ISBLK(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2823,24 +2815,29 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
return true;
+ return file->f_mode & FMODE_NOWAIT;
+}
- if (!(file->f_mode & FMODE_NOWAIT))
- return false;
-
- if (rw == READ)
- return file->f_op->read_iter != NULL;
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+ unsigned int res = 0;
- return file->f_op->write_iter != NULL;
+ if (S_ISREG(mode))
+ res |= FFS_ISREG;
+ if (__io_file_supports_nowait(file, mode))
+ res |= FFS_NOWAIT;
+ return res;
}
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
- if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
- return true;
- else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
- return true;
-
- return __io_file_supports_nowait(req->file, rw);
+ return req->flags & REQ_F_SUPPORT_NOWAIT;
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -2851,37 +2848,30 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
unsigned ioprio;
int ret;
- if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
- req->flags |= REQ_F_ISREG;
+ if (!io_req_ffs_set(req))
+ req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
kiocb->ki_pos = READ_ONCE(sqe->off);
if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = file->f_pos;
}
- kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
- kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
return ret;
- /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
- if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
+ /*
+ * If the file is marked O_NONBLOCK, still allow retry for it if it
+ * supports async. Otherwise it's impossible to use O_NONBLOCK files
+ * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
+ */
+ if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
req->flags |= REQ_F_NOWAIT;
- ioprio = READ_ONCE(sqe->ioprio);
- if (ioprio) {
- ret = ioprio_check_cap(ioprio);
- if (ret)
- return ret;
-
- kiocb->ki_ioprio = ioprio;
- } else
- kiocb->ki_ioprio = get_current_ioprio();
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) ||
- !kiocb->ki_filp->f_op->iopoll)
+ if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
@@ -2893,12 +2883,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kiocb->ki_complete = io_complete_rw;
}
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- req->imu = NULL;
- io_req_set_rsrc_node(req);
+ ioprio = READ_ONCE(sqe->ioprio);
+ if (ioprio) {
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ return ret;
+
+ kiocb->ki_ioprio = ioprio;
+ } else {
+ kiocb->ki_ioprio = get_current_ioprio();
}
+ req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2922,7 +2918,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- kiocb->ki_complete(kiocb, ret, 0);
+ kiocb->ki_complete(kiocb, ret);
}
}
@@ -2931,10 +2927,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_async_rw *io = req->async_data;
- bool check_reissue = kiocb->ki_complete == io_complete_rw;
/* add previously done IO, if any */
- if (io && io->bytes_done > 0) {
+ if (req_has_async_data(req) && io->bytes_done > 0) {
if (ret < 0)
ret = io->bytes_done;
else
@@ -2943,19 +2938,27 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
- if (ret >= 0 && check_reissue)
+ if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
__io_complete_rw(req, ret, 0, issue_flags);
else
io_rw_done(kiocb, ret);
- if (check_reissue && (req->flags & REQ_F_REISSUE)) {
+ if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
io_req_task_queue_reissue(req);
} else {
+ unsigned int cflags = io_put_rw_kbuf(req);
+ struct io_ring_ctx *ctx = req->ctx;
+
req_set_fail(req);
- __io_req_complete(req, issue_flags, ret,
- io_put_rw_kbuf(req));
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ mutex_lock(&ctx->uring_lock);
+ __io_req_complete(req, issue_flags, ret, cflags);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ __io_req_complete(req, issue_flags, ret, cflags);
+ }
}
}
}
@@ -3020,13 +3023,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_mapped_ubuf *imu = req->imu;
u16 index, buf_index = req->buf_index;
if (likely(!imu)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
+ io_req_set_rsrc_node(req, ctx);
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = READ_ONCE(ctx->user_bufs[index]);
req->imu = imu;
@@ -3053,10 +3058,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
}
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, struct io_buffer *kbuf,
- bool needs_lock)
+ int bgid, unsigned int issue_flags)
{
+ struct io_buffer *kbuf = req->kbuf;
struct io_buffer *head;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
@@ -3077,34 +3083,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
}
if (*len > kbuf->len)
*len = kbuf->len;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ req->kbuf = kbuf;
} else {
kbuf = ERR_PTR(-ENOBUFS);
}
io_ring_submit_unlock(req->ctx, needs_lock);
-
return kbuf;
}
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_buffer *kbuf;
u16 bgid;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ kbuf = io_buffer_select(req, len, bgid, issue_flags);
if (IS_ERR(kbuf))
return kbuf;
- req->rw.addr = (u64) (unsigned long) kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
return u64_to_user_ptr(kbuf->addr);
}
#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct compat_iovec __user *uiov;
compat_ssize_t clen;
@@ -3120,7 +3124,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3130,7 +3134,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
#endif
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
void __user *buf;
@@ -3142,7 +3146,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3151,12 +3155,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf;
+ struct io_buffer *kbuf = req->kbuf;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov[0].iov_len = kbuf->len;
return 0;
@@ -3166,52 +3169,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return io_compat_import(req, iov, needs_lock);
+ return io_compat_import(req, iov, issue_flags);
#endif
- return __io_iov_buffer_select(req, iov, needs_lock);
+ return __io_iov_buffer_select(req, iov, issue_flags);
}
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+ struct io_rw_state *s,
+ unsigned int issue_flags)
{
- void __user *buf = u64_to_user_ptr(req->rw.addr);
- size_t sqe_len = req->rw.len;
+ struct iov_iter *iter = &s->iter;
u8 opcode = req->opcode;
+ struct iovec *iovec;
+ void __user *buf;
+ size_t sqe_len;
ssize_t ret;
- if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- *iovec = NULL;
- return io_import_fixed(req, rw, iter);
- }
+ BUILD_BUG_ON(ERR_PTR(0) != NULL);
+
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
+ return ERR_PTR(io_import_fixed(req, rw, iter));
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
+ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+ return ERR_PTR(-EINVAL);
+
+ buf = u64_to_user_ptr(req->rw.addr);
+ sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
if (IS_ERR(buf))
- return PTR_ERR(buf);
+ return ERR_CAST(buf);
req->rw.len = sqe_len;
}
- ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
- *iovec = NULL;
- return ret;
+ ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+ return ERR_PTR(ret);
}
+ iovec = s->fast_iov;
if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, *iovec, needs_lock);
+ ret = io_iov_buffer_select(req, iovec, issue_flags);
if (!ret)
- iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
- *iovec = NULL;
- return ret;
+ iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+ return ERR_PTR(ret);
}
- return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
req->ctx->compat);
+ if (unlikely(ret < 0))
+ return ERR_PTR(ret);
+ return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct io_rw_state *s,
+ unsigned int issue_flags)
+{
+ *iovec = __io_import_iovec(rw, req, s, issue_flags);
+ if (unlikely(IS_ERR(*iovec)))
+ return PTR_ERR(*iovec);
+
+ iov_iter_save_state(&s->iter, &s->iter_state);
+ return 0;
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3236,7 +3259,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
*/
if (kiocb->ki_flags & IOCB_HIPRI)
return -EOPNOTSUPP;
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+ !(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
while (iov_iter_count(iter)) {
@@ -3263,12 +3287,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret = nr;
break;
}
+ if (!iov_iter_is_bvec(iter)) {
+ iov_iter_advance(iter, nr);
+ } else {
+ req->rw.len -= nr;
+ req->rw.addr += nr;
+ }
ret += nr;
if (nr != iovec.iov_len)
break;
- req->rw.len -= nr;
- req->rw.addr += nr;
- iov_iter_advance(iter, nr);
}
return ret;
@@ -3279,7 +3306,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
{
struct io_async_rw *rw = req->async_data;
- memcpy(&rw->iter, iter, sizeof(*iter));
+ memcpy(&rw->s.iter, iter, sizeof(*iter));
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
@@ -3288,39 +3315,47 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!iovec) {
unsigned iov_off = 0;
- rw->iter.iov = rw->fast_iov;
+ rw->s.iter.iov = rw->s.fast_iov;
if (iter->iov != fast_iov) {
iov_off = iter->iov - fast_iov;
- rw->iter.iov += iov_off;
+ rw->s.iter.iov += iov_off;
}
- if (rw->fast_iov != fast_iov)
- memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+ if (rw->s.fast_iov != fast_iov)
+ memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- return req->async_data == NULL;
+ if (req->async_data) {
+ req->flags |= REQ_F_ASYNC_DATA;
+ return false;
+ }
+ return true;
}
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov,
- struct iov_iter *iter, bool force)
+ struct io_rw_state *s, bool force)
{
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (!req->async_data) {
+ if (!req_has_async_data(req)) {
+ struct io_async_rw *iorw;
+
if (io_alloc_async_data(req)) {
kfree(iovec);
return -ENOMEM;
}
- io_req_map_rw(req, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
+ iorw = req->async_data;
+ /* we've copied and mapped the iter, ensure state is saved */
+ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
}
return 0;
}
@@ -3328,10 +3363,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
struct io_async_rw *iorw = req->async_data;
- struct iovec *iov = iorw->fast_iov;
+ struct iovec *iov;
int ret;
- ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ /* submission path, ->uring_lock should already be taken */
+ ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
if (unlikely(ret < 0))
return ret;
@@ -3350,7 +3386,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
/*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
* This gets called when the page is unlocked, and we generally expect that to
* happen when the page IO is completed and the page is now uptodate. This will
@@ -3422,7 +3458,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
- if (req->file->f_op->read_iter)
+ if (likely(req->file->f_op->read_iter))
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
return loop_rw_iter(READ, req, iter);
@@ -3438,43 +3474,49 @@ static bool need_read_all(struct io_kiocb *req)
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
- ssize_t io_size, ret, ret2;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ struct io_async_rw *rw;
+ ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- iovec = NULL;
- } else {
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
return ret;
+ } else {
+ rw = req->async_data;
+ s = &rw->s;
+ /*
+ * We come here from an earlier attempt, restore our state to
+ * match in case it doesn't. It's cheap enough that we don't
+ * need to make this conditional.
+ */
+ iov_iter_restore(&s->iter, &s->iter_state);
+ iovec = NULL;
}
- io_size = iov_iter_count(iter);
- req->result = io_size;
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req))) {
+ ret = io_setup_async_rw(req, iovec, s, true);
+ return ret ?: -EAGAIN;
+ }
kiocb->ki_flags |= IOCB_NOWAIT;
-
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, READ)) {
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
- return ret ?: -EAGAIN;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
}
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
@@ -3484,30 +3526,46 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
goto done;
- /* some cases will consume bytes even on error returns */
- iov_iter_reexpand(iter, iter->count + iter->truncated);
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret <= 0 || ret == io_size || !force_nonblock ||
+ } else if (ret == req->result || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
- ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ /*
+ * Don't depend on the iter state matching what was consumed, or being
+ * untouched in case of error. Restore it and we'll advance it
+ * manually if we need to.
+ */
+ iov_iter_restore(&s->iter, &s->iter_state);
+
+ ret2 = io_setup_async_rw(req, iovec, s, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
- /* now use our persistent iterator, if we aren't already */
- iter = &rw->iter;
+ s = &rw->s;
+ /*
+ * Now use our persistent iterator and state, if we aren't already.
+ * We've restored and mapped the iter to match.
+ */
do {
- io_size -= ret;
+ /*
+ * We end up here because of a partial read, either from
+ * above or inside this loop. Advance the iter by the bytes
+ * that were consumed.
+ */
+ iov_iter_advance(&s->iter, ret);
+ if (!iov_iter_count(&s->iter))
+ break;
rw->bytes_done += ret;
+ iov_iter_save_state(&s->iter, &s->iter_state);
+
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ;
@@ -3520,12 +3578,13 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* desired page gets unlocked. We can also get a partial read
* here, and if we do, then just retry at the new offset.
*/
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- } while (ret > 0 && ret < io_size);
+ iov_iter_restore(&s->iter, &s->iter_state);
+ } while (ret > 0);
done:
kiocb_done(kiocb, ret, issue_flags);
out_free:
@@ -3539,45 +3598,48 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
+ req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
return io_prep_rw(req, sqe);
}
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
- ssize_t ret, ret2, io_size;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- iovec = NULL;
- } else {
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
return ret;
+ } else {
+ struct io_async_rw *rw = req->async_data;
+
+ s = &rw->s;
+ iov_iter_restore(&s->iter, &s->iter_state);
+ iovec = NULL;
}
- io_size = iov_iter_count(iter);
- req->result = io_size;
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
- kiocb->ki_flags |= IOCB_NOWAIT;
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req)))
+ goto copy_iov;
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, WRITE))
- goto copy_iov;
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
+ goto copy_iov;
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ }
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret))
goto out_free;
@@ -3595,10 +3657,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
}
kiocb->ki_flags |= IOCB_WRITE;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, iter);
+ if (likely(req->file->f_op->write_iter))
+ ret2 = call_write_iter(req->file, kiocb, &s->iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, iter);
+ ret2 = loop_rw_iter(WRITE, req, &s->iter);
else
ret2 = -EINVAL;
@@ -3618,16 +3680,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
done:
kiocb_done(kiocb, ret2, issue_flags);
} else {
copy_iov:
- /* some cases will consume bytes even on error returns */
- iov_iter_reexpand(iter, iter->count + iter->truncated);
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ iov_iter_restore(&s->iter, &s->iter_state);
+ ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN;
}
out_free:
@@ -3763,7 +3823,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
return 0;
}
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = &req->mkdir;
int ret;
@@ -3812,7 +3872,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_symlink *sl = &req->symlink;
int ret;
@@ -3862,7 +3922,7 @@ static int io_linkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_hardlink *lnk = &req->hardlink;
int ret;
@@ -4281,9 +4341,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4296,7 +4356,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4342,7 +4402,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
- buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+ buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
if (!buf)
break;
@@ -4368,9 +4428,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head, *list;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4386,7 +4446,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4549,12 +4609,16 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
+ req->close.file_slot = READ_ONCE(sqe->file_index);
+ if (req->close.file_slot && req->close.fd)
+ return -EINVAL;
+
return 0;
}
@@ -4566,6 +4630,11 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
struct file *file = NULL;
int ret = -EBADF;
+ if (req->close.file_slot) {
+ ret = io_close_fixed(req, issue_flags);
+ goto err;
+ }
+
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (close->fd >= fdt->max_fds) {
@@ -4710,8 +4779,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4870,23 +4940,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_sr_msg *sr = &req->sr_msg;
- struct io_buffer *kbuf;
- kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
- if (IS_ERR(kbuf))
- return kbuf;
-
- sr->kbuf = kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
- return kbuf;
+ return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
}
static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
- return io_put_kbuf(req, req->sr_msg.kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4934,8 +4997,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_recvmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4943,7 +5007,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
}
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -4995,7 +5059,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
buf = u64_to_user_ptr(kbuf->addr);
@@ -5126,7 +5190,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req->async_data) {
+ if (req_has_async_data(req)) {
io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
@@ -5142,7 +5206,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->async_data)
+ if (req_has_async_data(req))
return -EAGAIN;
if (io_alloc_async_data(req)) {
ret = -ENOMEM;
@@ -5293,7 +5357,7 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
if (req->poll.events & EPOLLONESHOT)
flags = 0;
if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
- req->poll.done = true;
+ req->poll.events |= EPOLLONESHOT;
flags = 0;
}
if (flags & IORING_CQE_F_MORE)
@@ -5302,16 +5366,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
return !(flags & IORING_CQE_F_MORE);
}
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
-{
- bool done;
-
- done = __io_poll_complete(req, mask);
- io_commit_cqring(req->ctx);
- return done;
-}
-
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -5322,10 +5376,15 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
} else {
bool done;
+ if (req->poll.done) {
+ spin_unlock(&ctx->completion_lock);
+ return;
+ }
done = __io_poll_complete(req, req->result);
if (done) {
io_poll_remove_double(req);
hash_del(&req->hash_node);
+ req->poll.done = true;
} else {
req->result = 0;
add_wait_queue(req->poll.head, &req->poll.wait);
@@ -5428,7 +5487,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
req_ref_get(req);
poll->wait.private = req;
+
*poll_ptr = poll;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->flags |= REQ_F_ASYNC_DATA;
}
pt->nr_entries++;
@@ -5463,6 +5525,7 @@ static void io_async_task_func(struct io_kiocb *req, bool *locked)
hash_del(&req->hash_node);
io_poll_remove_double(req);
+ apoll->poll.done = true;
spin_unlock(&ctx->completion_lock);
if (!READ_ONCE(apoll->poll.canceled))
@@ -5551,17 +5614,13 @@ static int io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
- int rw;
- if (!req->file || !file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if (req->flags & REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
+ if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ return IO_APOLL_ABORTED;
if (def->pollin) {
- rw = READ;
mask |= POLLIN | POLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5569,14 +5628,9 @@ static int io_arm_poll_handler(struct io_kiocb *req)
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
mask &= ~POLLIN;
} else {
- rw = WRITE;
mask |= POLLOUT | POLLWRNORM;
}
- /* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_nowait(req, rw))
- return IO_APOLL_ABORTED;
-
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return IO_APOLL_ABORTED;
@@ -5637,8 +5691,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5783,6 +5837,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
__poll_t mask;
+ bool done;
ipt.pt._qproc = io_poll_queue_proc;
@@ -5791,13 +5846,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- io_poll_complete(req, mask);
+ done = __io_poll_complete(req, mask);
+ io_commit_cqring(req->ctx);
}
spin_unlock(&ctx->completion_lock);
if (mask) {
io_cqring_ev_posted(ctx);
- if (poll->events & EPOLLONESHOT)
+ if (done)
io_put_req(req);
}
return ipt.error;
@@ -5867,7 +5923,10 @@ err:
static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
- req_set_fail(req);
+ struct io_timeout_data *data = req->async_data;
+
+ if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+ req_set_fail(req);
io_req_complete_post(req, -ETIME, 0);
}
@@ -6073,7 +6132,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+ IORING_TIMEOUT_ETIME_SUCCESS))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6084,7 +6144,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
- if (!req->async_data && io_alloc_async_data(req))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
+ return -EFAULT;
+ if (io_alloc_async_data(req))
return -ENOMEM;
data = req->async_data;
@@ -6241,6 +6303,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
u64 sqe_addr = req->cancel.addr;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_tctx_node *node;
int ret;
@@ -6249,7 +6312,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
goto done;
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
@@ -6258,7 +6321,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
if (ret != -ENOENT)
break;
}
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
done:
if (ret < 0)
req_set_fail(req);
@@ -6285,22 +6348,20 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
up.offset = req->rsrc_update.offset;
up.data = req->rsrc_update.arg;
up.nr = 0;
up.tags = 0;
up.resv = 0;
- mutex_lock(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, needs_lock);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- mutex_unlock(&ctx->uring_lock);
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret < 0)
req_set_fail(req);
@@ -6396,7 +6457,7 @@ static int io_req_prep_async(struct io_kiocb *req)
{
if (!io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (WARN_ON_ONCE(req->async_data))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
return -EAGAIN;
@@ -6428,68 +6489,39 @@ static u32 io_get_sequence(struct io_kiocb *req)
return seq;
}
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
{
- struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
- u32 seq;
-
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
- return true;
- }
-
- /*
- * If we need to drain a request in the middle of a link, drain the
- * head request and the next request/link after the current link.
- * Considering sequential execution of links, IOSQE_IO_DRAIN will be
- * maintained for every request of our link.
- */
- if (ctx->drain_next) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = false;
- }
- /* not interested in head, start from the first linked */
- io_for_each_link(pos, req->link) {
- if (pos->flags & REQ_F_IO_DRAIN) {
- ctx->drain_next = true;
- req->flags |= REQ_F_IO_DRAIN;
- break;
- }
- }
+ u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
- if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN))) {
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+queue:
ctx->drain_active = false;
- return false;
+ io_req_task_queue(req);
+ return;
}
- seq = io_get_sequence(req);
- /* Still a chance to pass the sequence check */
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return false;
-
ret = io_req_prep_async(req);
- if (ret)
- goto fail;
+ if (ret) {
+fail:
+ io_req_complete_failed(req, ret);
+ return;
+ }
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
-fail:
- io_req_complete_failed(req, ret);
- return true;
+ goto fail;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req, NULL);
- return true;
+ goto queue;
}
trace_io_uring_defer(ctx, req, req->user_data);
@@ -6497,23 +6529,13 @@ fail:
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
- return true;
}
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- kfree((void *)(unsigned long)req->rw.addr);
- break;
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- kfree(req->sr_msg.kbuf);
- break;
- }
+ kfree(req->kbuf);
+ req->kbuf = NULL;
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6578,19 +6600,24 @@ static void io_clean_op(struct io_kiocb *req)
}
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
-
+ if (req->flags & REQ_F_ASYNC_DATA) {
+ kfree(req->async_data);
+ req->async_data = NULL;
+ }
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
const struct cred *creds = NULL;
int ret;
- if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_entry(req->opcode);
+
switch (req->opcode) {
case IORING_OP_NOP:
ret = io_nop(req, issue_flags);
@@ -6706,13 +6733,16 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
break;
}
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_exit(!ret, ret);
+
if (creds)
revert_creds(creds);
if (ret)
return ret;
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req);
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req, issue_flags);
return 0;
}
@@ -6728,6 +6758,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ unsigned int issue_flags = IO_URING_F_UNLOCKED;
+ bool needs_poll = false;
struct io_kiocb *timeout;
int ret = 0;
@@ -6742,23 +6774,42 @@ static void io_wq_submit_work(struct io_wq_work *work)
io_queue_linked_timeout(timeout);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL)
- ret = -ECANCELED;
+ if (work->flags & IO_WQ_WORK_CANCEL) {
+ io_req_task_queue_fail(req, -ECANCELED);
+ return;
+ }
- if (!ret) {
- do {
- ret = io_issue_sqe(req, 0);
- /*
- * We can get EAGAIN for polled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
+ if (req->flags & REQ_F_FORCE_ASYNC) {
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ bool opcode_poll = def->pollin || def->pollout;
+
+ if (opcode_poll && file_can_poll(req->file)) {
+ needs_poll = true;
+ issue_flags |= IO_URING_F_NONBLOCK;
+ }
}
+ do {
+ ret = io_issue_sqe(req, issue_flags);
+ if (ret != -EAGAIN)
+ break;
+ /*
+ * We can get EAGAIN for iopolled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (!needs_poll) {
+ cond_resched();
+ continue;
+ }
+
+ if (io_arm_poll_handler(req) == IO_APOLL_OK)
+ return;
+ /* aborted or ready, in either case retry blocking */
+ needs_poll = false;
+ issue_flags &= ~IO_URING_F_NONBLOCK;
+ } while (1);
+
/* avoid locking problems by failing it from a clean context */
if (ret)
io_req_task_queue_fail(req, ret);
@@ -6782,12 +6833,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
{
unsigned long file_ptr = (unsigned long) file;
- if (__io_file_supports_nowait(file, READ))
- file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_nowait(file, WRITE))
- file_ptr |= FFS_ASYNC_WRITE;
- if (S_ISREG(file_inode(file)->i_mode))
- file_ptr |= FFS_ISREG;
+ file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
@@ -6804,8 +6850,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
- io_req_set_rsrc_node(req);
+ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+ io_req_set_rsrc_node(req, ctx);
return file;
}
@@ -6897,67 +6943,62 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+ switch (io_arm_poll_handler(req)) {
+ case IO_APOLL_READY:
+ io_req_task_queue(req);
+ break;
+ case IO_APOLL_ABORTED:
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req, NULL);
+ break;
+ }
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout;
int ret;
-issue_sqe:
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (req->flags & REQ_F_COMPLETE_INLINE) {
+ io_req_add_compl_list(req);
+ return;
+ }
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret)) {
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
- return;
- }
-
linked_timeout = io_prep_linked_timeout(req);
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req)) {
- case IO_APOLL_READY:
- if (linked_timeout)
- io_unprep_linked_timeout(req);
- goto issue_sqe;
- case IO_APOLL_ABORTED:
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req, NULL);
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
+ io_queue_sqe_arm_apoll(req);
} else {
io_req_complete_failed(req, ret);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (unlikely(req->ctx->drain_active) && io_drain_req(req))
- return;
-
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
- __io_queue_sqe(req);
- } else if (req->flags & REQ_F_FAIL) {
+ if (req->flags & REQ_F_FAIL) {
io_req_complete_fail_submit(req);
+ } else if (unlikely(req->ctx->drain_active)) {
+ io_drain_req(req);
} else {
int ret = io_req_prep_async(req);
@@ -6968,6 +7009,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
}
}
+static inline void io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+ __io_queue_sqe(req);
+ else
+ io_queue_sqe_fallback(req);
+}
+
/*
* Check SQE restrictions (opcode and flags).
*
@@ -6977,9 +7027,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (likely(!ctx->restricted))
- return true;
-
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -6994,16 +7041,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
return true;
}
+static void io_init_req_drain(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *head = ctx->submit_state.link.head;
+
+ ctx->drain_active = true;
+ if (head) {
+ /*
+ * If we need to drain a request in the middle of a link, drain
+ * the head request and the next request/link after the current
+ * link. Considering sequential execution of links,
+ * IOSQE_IO_DRAIN will be maintained for every request of our
+ * link.
+ */
+ head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ ctx->drain_next = true;
+ }
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state;
unsigned int sqe_flags;
- int personality, ret = 0;
+ int personality;
+ u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = READ_ONCE(sqe->opcode);
+ req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
@@ -7011,49 +7077,70 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->fixed_rsrc_refs = NULL;
req->task = current;
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+ if (unlikely(opcode >= IORING_OP_LAST)) {
+ req->opcode = 0;
return -EINVAL;
- if (unlikely(req->opcode >= IORING_OP_LAST))
- return -EINVAL;
- if (!io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ }
+ if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+ /* enforce forwards compatibility on users */
+ if (sqe_flags & ~SQE_VALID_FLAGS)
+ return -EINVAL;
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_IO_DRAIN)
+ io_init_req_drain(req);
+ }
+ if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+ if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+ return -EACCES;
+ /* knock it to the slow queue path, will be drained there */
+ if (ctx->drain_active)
+ req->flags |= REQ_F_FORCE_ASYNC;
+ /* if there is no link, we're at "next" request and need to drain */
+ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+ ctx->drain_next = false;
+ ctx->drain_active = true;
+ req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ }
+ }
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
- if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
- ctx->drain_active = true;
+ if (io_op_defs[opcode].needs_file) {
+ struct io_submit_state *state = &ctx->submit_state;
+
+ /*
+ * Plug now if we have more than 2 IO left after this, and the
+ * target is potentially a read/write to block based storage.
+ */
+ if (state->need_plug && io_op_defs[opcode].plug) {
+ state->plug_started = true;
+ state->need_plug = false;
+ blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+ }
+
+ req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
+ (sqe_flags & IOSQE_FIXED_FILE));
+ if (unlikely(!req->file))
+ return -EBADF;
+ }
personality = READ_ONCE(sqe->personality);
if (personality) {
+ int ret;
+
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
return -EINVAL;
get_cred(req->creds);
+ ret = security_uring_override_creds(req->creds);
+ if (ret) {
+ put_cred(req->creds);
+ return ret;
+ }
req->flags |= REQ_F_CREDS;
}
- state = &ctx->submit_state;
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
-
- if (io_op_defs[req->opcode].needs_file) {
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- ret = -EBADF;
- }
-
- state->ios_left--;
- return ret;
+ return io_req_prep(req, sqe);
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7065,7 +7152,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
-fail_req:
+ trace_io_uring_req_failed(sqe, ret);
+
/* fail even hard links since we don't submit */
if (link->head) {
/*
@@ -7088,10 +7176,6 @@ fail_req:
return ret;
}
req_fail_link_node(req, ret);
- } else {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
}
/* don't need @sqe from now on */
@@ -7121,33 +7205,32 @@ fail_req:
link->last->link = req;
link->last = req;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ return 0;
/* last request of a link, enqueue the link */
- if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- link->head = NULL;
- io_queue_sqe(head);
- }
- } else {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
- } else {
- io_queue_sqe(req);
- }
+ link->head = NULL;
+ req = head;
+ } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ link->head = req;
+ link->last = req;
+ return 0;
}
+ io_queue_sqe(req);
return 0;
}
/*
* Batched submission is done, ensure local IO is flushed out.
*/
-static void io_submit_state_end(struct io_submit_state *state,
- struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+
if (state->link.head)
io_queue_sqe(state->link.head);
- if (state->compl_nr)
- io_submit_flush_completions(ctx);
+ /* flush only after queuing links as they can generate completions */
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
}
@@ -7159,7 +7242,8 @@ static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{
state->plug_started = false;
- state->ios_left = max_ios;
+ state->need_plug = max_ios > 2;
+ state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
}
@@ -7211,45 +7295,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
+ unsigned int entries = io_sqring_entries(ctx);
int submitted = 0;
+ if (unlikely(!entries))
+ return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
- if (!percpu_ref_tryget_many(&ctx->refs, nr))
- return -EAGAIN;
+ nr = min3(nr, ctx->sq_entries, entries);
io_get_task_refs(nr);
io_submit_state_start(&ctx->submit_state, nr);
- while (submitted < nr) {
+ do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- req = io_alloc_req(ctx);
- if (unlikely(!req)) {
+ if (unlikely(!io_alloc_req_refill(ctx))) {
if (!submitted)
submitted = -EAGAIN;
break;
}
+ req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (io_submit_sqe(ctx, req, sqe))
break;
- }
+ } while (submitted < nr);
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
int unused = nr - ref_used;
current->io_uring->cached_refs += unused;
- percpu_ref_put_many(&ctx->refs, unused);
}
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -7288,16 +7372,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!list_empty(&ctx->iopoll_list) || to_submit) {
- unsigned nr_events = 0;
+ if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ if (!wq_list_empty(&ctx->iopoll_list))
+ io_do_iopoll(ctx, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7317,7 +7400,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
return ret;
}
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
@@ -7360,6 +7443,8 @@ static int io_sq_thread(void *data)
set_cpus_allowed_ptr(current, cpu_online_mask);
current->flags |= PF_NO_SETAFFINITY;
+ audit_alloc_kernel(current);
+
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
@@ -7374,7 +7459,7 @@ static int io_sq_thread(void *data)
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
- if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
@@ -7395,7 +7480,7 @@ static int io_sq_thread(void *data)
io_ring_set_wakeup_flag(ctx);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
+ !wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
break;
}
@@ -7425,6 +7510,8 @@ static int io_sq_thread(void *data)
io_run_task_work();
mutex_unlock(&sqd->lock);
+ audit_free(current);
+
complete(&sqd->exited);
do_exit(0);
}
@@ -7515,6 +7602,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = timespec64_to_jiffies(&ts);
+ }
+
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7528,14 +7623,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -7571,7 +7658,7 @@ static void io_free_page_table(void **table, size_t size)
kfree(table);
}
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
{
unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
size_t init_size = size;
@@ -7600,7 +7687,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
@@ -7646,10 +7733,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
+ __must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+ io_rsrc_refs_drop(ctx);
+
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
@@ -7677,7 +7767,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+ struct io_ring_ctx *ctx)
{
int ret;
@@ -7733,9 +7824,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
kfree(data);
}
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
int ret = -ENOMEM;
@@ -8284,15 +8375,31 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
+static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+ struct io_rsrc_node *node, void *rsrc)
+{
+ struct io_rsrc_put *prsrc;
+
+ prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+ if (!prsrc)
+ return -ENOMEM;
+
+ prsrc->tag = *io_get_tag_slot(data, idx);
+ prsrc->rsrc = rsrc;
+ list_add(&prsrc->list, &node->rsrc_list);
+ return 0;
+}
+
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8304,9 +8411,22 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
- ret = -EBADF;
- if (file_slot->file_ptr)
- goto err;
+
+ if (file_slot->file_ptr) {
+ struct file *old_file;
+
+ ret = io_rsrc_node_switch_start(ctx);
+ if (ret)
+ goto err;
+
+ old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
+ ctx->rsrc_node, old_file);
+ if (ret)
+ goto err;
+ file_slot->file_ptr = 0;
+ needs_switch = true;
+ }
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
io_fixed_file_set(file_slot, file);
@@ -8318,25 +8438,51 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
ret = 0;
err:
- io_ring_submit_unlock(ctx, !force_nonblock);
+ if (needs_switch)
+ io_rsrc_node_switch(ctx, ctx->file_data);
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret)
fput(file);
return ret;
}
-static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
- struct io_rsrc_node *node, void *rsrc)
+static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_rsrc_put *prsrc;
+ unsigned int offset = req->close.file_slot - 1;
+ struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ struct io_fixed_file *file_slot;
+ struct file *file;
+ int ret, i;
- prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
- if (!prsrc)
- return -ENOMEM;
+ io_ring_submit_lock(ctx, needs_lock);
+ ret = -ENXIO;
+ if (unlikely(!ctx->file_data))
+ goto out;
+ ret = -EINVAL;
+ if (offset >= ctx->nr_user_files)
+ goto out;
+ ret = io_rsrc_node_switch_start(ctx);
+ if (ret)
+ goto out;
- prsrc->tag = *io_get_tag_slot(data, idx);
- prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &node->rsrc_list);
- return 0;
+ i = array_index_nospec(offset, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ ret = -EBADF;
+ if (!file_slot->file_ptr)
+ goto out;
+
+ file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
+ if (ret)
+ goto out;
+
+ file_slot->file_ptr = 0;
+ io_rsrc_node_switch(ctx, ctx->file_data);
+ ret = 0;
+out:
+ io_ring_submit_unlock(ctx, needs_lock);
+ return ret;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -8451,8 +8597,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8499,8 +8645,8 @@ void __io_uring_free(struct task_struct *tsk)
tsk->io_uring = NULL;
}
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
int ret;
@@ -8523,6 +8669,10 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_sq_data *sqd;
bool attached;
+ ret = security_uring_sqpoll();
+ if (ret)
+ return ret;
+
sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
@@ -9105,33 +9255,31 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
struct io_buffer *buf;
unsigned long index;
- xa_for_each(&ctx->io_buffers, index, buf)
+ xa_for_each(&ctx->io_buffers, index, buf) {
__io_remove_buffers(ctx, buf, index, -1U);
-}
-
-static void io_req_cache_free(struct list_head *list)
-{
- struct io_kiocb *req, *nxt;
-
- list_for_each_entry_safe(req, nxt, list, inflight_entry) {
- list_del(&req->inflight_entry);
- kmem_cache_free(req_cachep, req);
+ cond_resched();
}
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
+ int nr = 0;
mutex_lock(&ctx->uring_lock);
+ io_flush_cached_locked_reqs(ctx, state);
- if (state->free_reqs) {
- kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
- state->free_reqs = 0;
- }
+ while (state->free_list.next) {
+ struct io_wq_work_node *node;
+ struct io_kiocb *req;
- io_flush_cached_locked_reqs(ctx, state);
- io_req_cache_free(&state->free_list);
+ node = wq_stack_extract(&state->free_list);
+ req = container_of(node, struct io_kiocb, comp_list);
+ kmem_cache_free(req_cachep, req);
+ nr++;
+ }
+ if (nr)
+ percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -9141,7 +9289,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9150,6 +9298,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
@@ -9173,6 +9322,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
flush_delayed_work(&ctx->rsrc_put_work);
+ flush_delayed_work(&ctx->fallback_work);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9203,7 +9353,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->poll_wait, wait);
+ poll_wait(file, &ctx->cq_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -9231,13 +9381,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
return mask;
}
-static int io_uring_fasync(int fd, struct file *file, int on)
-{
- struct io_ring_ctx *ctx = file->private_data;
-
- return fasync_helper(fd, file, on, &ctx->cq_fasync);
-}
-
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *creds;
@@ -9257,7 +9400,7 @@ struct io_tctx_exit {
struct io_ring_ctx *ctx;
};
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_exit *work;
@@ -9272,14 +9415,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
complete(&work->completion);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
return req->ctx == data;
}
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9308,6 +9451,8 @@ static void io_ring_exit_work(struct work_struct *work)
io_sq_thread_unpark(sqd);
}
+ io_req_caches_free(ctx);
+
if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
/* there is little hope left, don't run it too often */
interval = HZ * 60;
@@ -9334,7 +9479,6 @@ static void io_ring_exit_work(struct work_struct *work)
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
- wake_up_process(node->task);
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
@@ -9348,8 +9492,8 @@ static void io_ring_exit_work(struct work_struct *work)
}
/* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
@@ -9371,7 +9515,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
return canceled != 0;
}
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
struct creds *creds;
@@ -9433,8 +9577,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
return ret;
}
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
@@ -9459,7 +9604,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
return true;
}
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
enum io_wq_cancel cret;
@@ -9483,9 +9628,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9509,7 +9654,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!list_empty_careful(&ctx->iopoll_list)) {
+ while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
@@ -9536,7 +9681,16 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
+
tctx = current->io_uring;
+ if (ctx->iowq_limits_set) {
+ unsigned int limits[2] = { ctx->iowq_limits[0],
+ ctx->iowq_limits[1], };
+
+ ret = io_wq_max_workers(tctx->io_wq, limits);
+ if (ret)
+ return ret;
+ }
}
if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
@@ -9575,7 +9729,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9598,14 +9752,16 @@ static void io_uring_del_tctx_node(unsigned long index)
kfree(node);
}
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
unsigned long index;
- xa_for_each(&tctx->xa, index, node)
+ xa_for_each(&tctx->xa, index, node) {
io_uring_del_tctx_node(index);
+ cond_resched();
+ }
if (wq) {
/*
* Must be after io_uring_del_task_file() (removes nodes under
@@ -9623,7 +9779,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_drop_tctx_refs(struct task_struct *task)
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
unsigned int refs = tctx->cached_refs;
@@ -9639,7 +9795,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
* Find any io_uring ctx that this task has registered or done IO on, and cancel
* requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
*/
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+ struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
@@ -9732,7 +9889,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
#ifdef CONFIG_MMU
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
@@ -9917,7 +10074,7 @@ out_fput:
}
#ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
@@ -9949,11 +10106,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
{
struct io_sq_data *sq = NULL;
+ struct io_overflow_cqe *ocqe;
+ struct io_rings *r = ctx->rings;
+ unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+ unsigned int sq_head = READ_ONCE(r->sq.head);
+ unsigned int sq_tail = READ_ONCE(r->sq.tail);
+ unsigned int cq_head = READ_ONCE(r->cq.head);
+ unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int sq_entries, cq_entries;
bool has_lock;
- int i;
+ unsigned int i;
+
+ /*
+ * we may get imprecise sqe and cqe info if uring is actively running
+ * since we get cached_sq_head and cached_cq_tail without uring_lock
+ * and sq_tail and cq_head are changed by userspace. But it's ok since
+ * we usually use these info when it is stuck.
+ */
+ seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
+ seq_printf(m, "SqHead:\t%u\n", sq_head);
+ seq_printf(m, "SqTail:\t%u\n", sq_tail);
+ seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+ seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+ seq_printf(m, "CqHead:\t%u\n", cq_head);
+ seq_printf(m, "CqTail:\t%u\n", cq_tail);
+ seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+ seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+ sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+ for (i = 0; i < sq_entries; i++) {
+ unsigned int entry = i + sq_head;
+ unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+ struct io_uring_sqe *sqe;
+
+ if (sq_idx > sq_mask)
+ continue;
+ sqe = &ctx->sq_sqes[sq_idx];
+ seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+ sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+ sqe->user_data);
+ }
+ seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+ cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+ for (i = 0; i < cq_entries; i++) {
+ unsigned int entry = i + cq_head;
+ struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags);
+ }
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -9995,7 +10200,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
- seq_printf(m, "PollList:\n");
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
+
+ seq_puts(m, "PollList:\n");
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10005,12 +10213,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
req->task->task_works != NULL);
}
+
+ seq_puts(m, "CqOverflowList:\n");
+ list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+ struct io_uring_cqe *cqe = &ocqe->cqe;
+
+ seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
+ cqe->user_data, cqe->res, cqe->flags);
+
+ }
+
spin_unlock(&ctx->completion_lock);
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
}
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
@@ -10029,14 +10245,13 @@ static const struct file_operations io_uring_fops = {
.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
.poll = io_uring_poll,
- .fasync = io_uring_fasync,
#ifdef CONFIG_PROC_FS
.show_fdinfo = io_uring_show_fdinfo,
#endif
};
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
struct io_rings *rings;
size_t size, sq_array_offset;
@@ -10112,8 +10327,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return ERR_PTR(ret);
#endif
- file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
- O_RDWR | O_CLOEXEC);
+ file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
if (IS_ERR(file)) {
sock_release(ctx->ring_sock);
@@ -10125,8 +10340,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return file;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
struct file *file;
@@ -10284,7 +10499,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
@@ -10340,8 +10556,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return id;
}
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
@@ -10475,7 +10691,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
@@ -10501,8 +10717,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
}
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
- unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
{
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
@@ -10528,7 +10744,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
@@ -10538,9 +10754,11 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
+ __must_hold(&ctx->uring_lock)
{
+ struct io_tctx_node *node;
struct io_uring_task *tctx = NULL;
struct io_sq_data *sqd = NULL;
__u32 new_count[2];
@@ -10560,33 +10778,62 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
* ordering. Fine to drop uring_lock here, we hold
* a ref to the ctx.
*/
+ refcount_inc(&sqd->refs);
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
- tctx = sqd->thread->io_uring;
+ if (sqd->thread)
+ tctx = sqd->thread->io_uring;
}
} else {
tctx = current->io_uring;
}
- ret = -EINVAL;
- if (!tctx || !tctx->io_wq)
- goto err;
+ BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i])
+ ctx->iowq_limits[i] = new_count[i];
+ ctx->iowq_limits_set = true;
- if (sqd)
+ if (tctx && tctx->io_wq) {
+ ret = io_wq_max_workers(tctx->io_wq, new_count);
+ if (ret)
+ goto err;
+ } else {
+ memset(new_count, 0, sizeof(new_count));
+ }
+
+ if (sqd) {
mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
+ /* that's it for SQPOLL, only the SQPOLL task creates requests */
+ if (sqd)
+ return 0;
+
+ /* now propagate the restriction to all registered users */
+ list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+ struct io_uring_task *tctx = node->task->io_uring;
+
+ if (WARN_ON_ONCE(!tctx->io_wq))
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ new_count[i] = ctx->iowq_limits[i];
+ /* ignore errors, it always returns zero anyway */
+ (void)io_wq_max_workers(tctx->io_wq, new_count);
+ }
return 0;
err:
- if (sqd)
+ if (sqd) {
mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
return ret;
}
@@ -10614,7 +10861,7 @@ static bool io_register_op_must_quiesce(int op)
}
}
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
long ret;
@@ -10629,10 +10876,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx)
*/
mutex_unlock(&ctx->uring_lock);
do {
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
- if (!ret)
+ ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
+ if (ret) {
+ ret = min(0L, ret);
break;
+ }
+
ret = io_run_task_work_sig();
+ io_req_caches_free(ctx);
} while (ret >= 0);
mutex_lock(&ctx->uring_lock);
@@ -10863,6 +11114,8 @@ static int __init io_uring_init(void)
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+ BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+ BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9cc5798423d1..1753c26c8e76 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -750,7 +750,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4ecd255e0511..b4dc51063d36 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -31,6 +31,7 @@ struct iomap_dio {
atomic_t ref;
unsigned flags;
int error;
+ size_t done_before;
bool wait_for_completion;
union {
@@ -38,8 +39,7 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct request_queue *last_queue;
- blk_qc_t cookie;
+ struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -49,29 +49,20 @@ struct iomap_dio {
};
};
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
- struct request_queue *q = READ_ONCE(kiocb->private);
-
- if (!q)
- return 0;
- return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI)
+ if (dio->iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, dio->iocb);
+ dio->submit.poll_bio = bio;
+ }
- dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
if (dio->dops && dio->dops->submit_io)
- dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
+ dio->dops->submit_io(iter, bio, pos);
else
- dio->submit.cookie = submit_bio(bio);
+ submit_bio(bio);
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -124,6 +115,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
ret = generic_write_sync(iocb, ret);
+ if (ret > 0)
+ ret += dio->done_before;
+
kfree(dio);
return ret;
@@ -135,7 +129,7 @@ static void iomap_dio_complete_work(struct work_struct *work)
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
- iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+ iocb->ki_complete(iocb, iomap_dio_complete(dio));
}
/*
@@ -164,9 +158,11 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
+ WRITE_ONCE(dio->iocb->private, NULL);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
} else {
+ WRITE_ONCE(dio->iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
}
}
@@ -282,6 +278,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (!iov_iter_count(dio->submit.iter))
goto out;
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
+
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
@@ -339,6 +342,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
BIO_MAX_VECS);
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (nr_pages)
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
iomap_dio_submit_bio(iter, dio, bio, pos);
pos += n;
} while (nr_pages);
@@ -371,6 +379,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
+ if (!length)
+ return -EFAULT;
return length;
}
@@ -402,6 +412,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
copied = copy_to_iter(inline_data, length, iter);
}
dio->size += copied;
+ if (!copied)
+ return -EFAULT;
return copied;
}
@@ -446,13 +458,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
* may be pure data writes. In that case, we still need to do a full data sync
* completion.
*
+ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
+ * __iomap_dio_rw can return a partial result if it encounters a non-resident
+ * page in @iter after preparing a transfer. In that case, the non-resident
+ * pages can be faulted in and the request resumed with @done_before set to the
+ * number of bytes previously transferred. The request will then complete with
+ * the correct total number of bytes transferred; this is essential for
+ * completing partial requests asynchronously.
+ *
* Returns -ENOTBLK In case of a page invalidation invalidation failure for
* writes. The callers needs to fall back to buffered I/O in this case.
*/
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags)
+ unsigned int dio_flags, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -482,11 +502,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->dops = dops;
dio->error = 0;
dio->flags = 0;
+ dio->done_before = done_before;
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.cookie = BLK_QC_T_NONE;
- dio->submit.last_queue = NULL;
+ dio->submit.poll_bio = NULL;
if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
@@ -565,8 +585,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
inode_dio_begin(inode);
blk_start_plug(&plug);
- while ((ret = iomap_iter(&iomi, ops)) > 0)
+ while ((ret = iomap_iter(&iomi, ops)) > 0) {
iomi.processed = iomap_dio_iter(&iomi, dio);
+
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ iocb->ki_flags &= ~IOCB_HIPRI;
+ }
+
blk_finish_plug(&plug);
/*
@@ -577,6 +604,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
iov_iter_revert(iter, iomi.pos - dio->i_size);
+ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
+ if (!(iocb->ki_flags & IOCB_NOWAIT))
+ wait_for_completion = true;
+ ret = 0;
+ }
+
/* magic error code to fall back to buffered I/O */
if (ret == -ENOTBLK) {
wait_for_completion = true;
@@ -592,8 +625,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
- WRITE_ONCE(iocb->private, dio->submit.last_queue);
+ WRITE_ONCE(iocb->private, dio->submit.poll_bio);
/*
* We are about to drop our additional submission reference, which
@@ -620,10 +652,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !dio->submit.last_queue ||
- !blk_poll(dio->submit.last_queue,
- dio->submit.cookie, true))
+ if (!dio->submit.poll_bio ||
+ !bio_poll(dio->submit.poll_bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -642,11 +672,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags)
+ unsigned int dio_flags, size_t done_before)
{
struct iomap_dio *dio;
- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 678e2c51b855..0c6eacfcbeef 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1322,6 +1322,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
de = (struct iso_directory_record *) (bh->b_data + offset);
de_len = *(unsigned char *) de;
+ if (de_len < sizeof(struct iso_directory_record))
+ goto fail;
if (offset + de_len > bufsize) {
int frag1 = bufsize - offset;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 176580f54af9..104ae698443e 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 5d7d7170c03c..aa4ff7bcaff2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -81,14 +81,14 @@ int jfs_mount(struct super_block *sb)
* (initialize mount inode from the superblock)
*/
if ((rc = chkSuper(sb))) {
- goto errout20;
+ goto out;
}
ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
if (ipaimap == NULL) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout20;
+ goto out;
}
sbi->ipaimap = ipaimap;
@@ -99,7 +99,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = diMount(ipaimap))) {
jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc);
- goto errout21;
+ goto err_ipaimap;
}
/*
@@ -108,7 +108,7 @@ int jfs_mount(struct super_block *sb)
ipbmap = diReadSpecial(sb, BMAP_I, 0);
if (ipbmap == NULL) {
rc = -EIO;
- goto errout22;
+ goto err_umount_ipaimap;
}
jfs_info("jfs_mount: ipbmap:0x%p", ipbmap);
@@ -120,7 +120,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = dbMount(ipbmap))) {
jfs_err("jfs_mount: dbMount failed w/rc = %d", rc);
- goto errout22;
+ goto err_ipbmap;
}
/*
@@ -139,7 +139,7 @@ int jfs_mount(struct super_block *sb)
if (!ipaimap2) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout35;
+ goto err_umount_ipbmap;
}
sbi->ipaimap2 = ipaimap2;
@@ -151,7 +151,7 @@ int jfs_mount(struct super_block *sb)
if ((rc = diMount(ipaimap2))) {
jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d",
rc);
- goto errout35;
+ goto err_ipaimap2;
}
} else
/* Secondary aggregate inode table is not valid */
@@ -168,7 +168,7 @@ int jfs_mount(struct super_block *sb)
jfs_err("jfs_mount: Failed to read FILESYSTEM_I");
/* open fileset secondary inode allocation map */
rc = -EIO;
- goto errout40;
+ goto err_umount_ipaimap2;
}
jfs_info("jfs_mount: ipimap:0x%p", ipimap);
@@ -178,41 +178,34 @@ int jfs_mount(struct super_block *sb)
/* initialize fileset inode allocation map */
if ((rc = diMount(ipimap))) {
jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
- goto errout41;
+ goto err_ipimap;
}
- goto out;
+ return rc;
/*
* unwind on error
*/
- errout41: /* close fileset inode allocation map inode */
+err_ipimap:
+ /* close fileset inode allocation map inode */
diFreeSpecial(ipimap);
-
- errout40: /* fileset closed */
-
+err_umount_ipaimap2:
/* close secondary aggregate inode allocation map */
- if (ipaimap2) {
+ if (ipaimap2)
diUnmount(ipaimap2, 1);
+err_ipaimap2:
+ /* close aggregate inodes */
+ if (ipaimap2)
diFreeSpecial(ipaimap2);
- }
-
- errout35:
-
- /* close aggregate block allocation map */
+err_umount_ipbmap: /* close aggregate block allocation map */
dbUnmount(ipbmap, 1);
+err_ipbmap: /* close aggregate inodes */
diFreeSpecial(ipbmap);
-
- errout22: /* close aggregate inode allocation map */
-
+err_umount_ipaimap: /* close aggregate inode allocation map */
diUnmount(ipaimap, 1);
-
- errout21: /* close aggregate inodes */
+err_ipaimap: /* close aggregate inodes */
diFreeSpecial(ipaimap);
- errout20: /* aggregate closed */
-
- out:
-
+out:
if (rc)
jfs_err("Mount JFS Failure: %d", rc);
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index bde787c354fc..8b9a72ae5efa 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
goto out;
}
- VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits;
-
+ VolumeSize = sb_bdev_nr_blocks(sb);
if (VolumeSize) {
if (newLVSize > VolumeSize) {
printk(KERN_WARNING "jfs_extendfs: invalid size\n");
@@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
txQuiesce(sb);
/* Reset size of direct inode */
- sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev);
if (sbi->mntflag & JFS_INLINELOG) {
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9030aeaf0f88..24cbc9946e01 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
}
case Opt_resize_nosize:
{
- *newLVSize = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ *newLVSize = sb_bdev_nr_blocks(sb);
if (*newLVSize == 0)
pr_err("JFS: Cannot determine volume size\n");
break;
@@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
ret = -ENOMEM;
goto out_unload;
}
- inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ inode->i_size = bdev_nr_bytes(sb->s_bdev);
inode->i_mapping->a_ops = &jfs_metapage_aops;
inode_fake_hash(inode);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 87aac4c72c37..1b07550485b9 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -178,7 +178,7 @@ int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
struct fd f = fdget(fd);
int ret = -EBADF;
- if (!f.file)
+ if (!f.file || !(f.file->f_mode & FMODE_READ))
goto out;
ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index ba581429bf7b..8e0a1378a4b1 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1111,13 +1111,25 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
/* attach dentry and inode */
- if (kn && kernfs_active(kn)) {
+ if (kn) {
+ /* Inactive nodes are invisible to the VFS so don't
+ * create a negative.
+ */
+ if (!kernfs_active(kn)) {
+ up_read(&kernfs_rwsem);
+ return NULL;
+ }
inode = kernfs_get_inode(dir->i_sb, kn);
if (!inode)
inode = ERR_PTR(-ENOMEM);
}
- /* Needed only for negative dentry validation */
- if (!inode)
+ /*
+ * Needed for negative dentry validation.
+ * The negative dentry can be created in kernfs_iop_lookup()
+ * or transforms from positive dentry in dentry_unlink_inode()
+ * called from vfs_rmdir().
+ */
+ if (!IS_ERR(inode))
kernfs_set_rev(parent, dentry);
up_read(&kernfs_rwsem);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index c8f8e41b8411..19a6c71c6ff5 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -36,8 +36,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
gid = target->iattr->ia_gid;
}
- kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid,
- KERNFS_LINK);
+ kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK);
if (!kn)
return ERR_PTR(-ENOMEM);
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index de36f12070bf..30a92ddc1817 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -68,125 +68,6 @@ void ksmbd_copy_gss_neg_header(void *buf)
memcpy(buf, NEGOTIATE_GSS_HEADER, AUTH_GSS_LENGTH);
}
-static void
-str_to_key(unsigned char *str, unsigned char *key)
-{
- int i;
-
- key[0] = str[0] >> 1;
- key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
- key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
- key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
- key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
- key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
- key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
- key[7] = str[6] & 0x7F;
- for (i = 0; i < 8; i++)
- key[i] = (key[i] << 1);
-}
-
-static int
-smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
-{
- unsigned char key2[8];
- struct des_ctx ctx;
-
- if (fips_enabled) {
- ksmbd_debug(AUTH, "FIPS compliance enabled: DES not permitted\n");
- return -ENOENT;
- }
-
- str_to_key(key, key2);
- des_expand_key(&ctx, key2, DES_KEY_SIZE);
- des_encrypt(&ctx, out, in);
- memzero_explicit(&ctx, sizeof(ctx));
- return 0;
-}
-
-static int ksmbd_enc_p24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
-{
- int rc;
-
- rc = smbhash(p24, c8, p21);
- if (rc)
- return rc;
- rc = smbhash(p24 + 8, c8, p21 + 7);
- if (rc)
- return rc;
- return smbhash(p24 + 16, c8, p21 + 14);
-}
-
-/* produce a md4 message digest from data of length n bytes */
-static int ksmbd_enc_md4(unsigned char *md4_hash, unsigned char *link_str,
- int link_len)
-{
- int rc;
- struct ksmbd_crypto_ctx *ctx;
-
- ctx = ksmbd_crypto_ctx_find_md4();
- if (!ctx) {
- ksmbd_debug(AUTH, "Crypto md4 allocation error\n");
- return -ENOMEM;
- }
-
- rc = crypto_shash_init(CRYPTO_MD4(ctx));
- if (rc) {
- ksmbd_debug(AUTH, "Could not init md4 shash\n");
- goto out;
- }
-
- rc = crypto_shash_update(CRYPTO_MD4(ctx), link_str, link_len);
- if (rc) {
- ksmbd_debug(AUTH, "Could not update with link_str\n");
- goto out;
- }
-
- rc = crypto_shash_final(CRYPTO_MD4(ctx), md4_hash);
- if (rc)
- ksmbd_debug(AUTH, "Could not generate md4 hash\n");
-out:
- ksmbd_release_crypto_ctx(ctx);
- return rc;
-}
-
-static int ksmbd_enc_update_sess_key(unsigned char *md5_hash, char *nonce,
- char *server_challenge, int len)
-{
- int rc;
- struct ksmbd_crypto_ctx *ctx;
-
- ctx = ksmbd_crypto_ctx_find_md5();
- if (!ctx) {
- ksmbd_debug(AUTH, "Crypto md5 allocation error\n");
- return -ENOMEM;
- }
-
- rc = crypto_shash_init(CRYPTO_MD5(ctx));
- if (rc) {
- ksmbd_debug(AUTH, "Could not init md5 shash\n");
- goto out;
- }
-
- rc = crypto_shash_update(CRYPTO_MD5(ctx), server_challenge, len);
- if (rc) {
- ksmbd_debug(AUTH, "Could not update with challenge\n");
- goto out;
- }
-
- rc = crypto_shash_update(CRYPTO_MD5(ctx), nonce, len);
- if (rc) {
- ksmbd_debug(AUTH, "Could not update with nonce\n");
- goto out;
- }
-
- rc = crypto_shash_final(CRYPTO_MD5(ctx), md5_hash);
- if (rc)
- ksmbd_debug(AUTH, "Could not generate md5 hash\n");
-out:
- ksmbd_release_crypto_ctx(ctx);
- return rc;
-}
-
/**
* ksmbd_gen_sess_key() - function to generate session key
* @sess: session of connection
@@ -325,43 +206,6 @@ out:
}
/**
- * ksmbd_auth_ntlm() - NTLM authentication handler
- * @sess: session of connection
- * @pw_buf: NTLM challenge response
- * @passkey: user password
- *
- * Return: 0 on success, error number on error
- */
-int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf)
-{
- int rc;
- unsigned char p21[21];
- char key[CIFS_AUTH_RESP_SIZE];
-
- memset(p21, '\0', 21);
- memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE);
- rc = ksmbd_enc_p24(p21, sess->ntlmssp.cryptkey, key);
- if (rc) {
- pr_err("password processing failed\n");
- return rc;
- }
-
- ksmbd_enc_md4(sess->sess_key, user_passkey(sess->user),
- CIFS_SMB1_SESSKEY_SIZE);
- memcpy(sess->sess_key + CIFS_SMB1_SESSKEY_SIZE, key,
- CIFS_AUTH_RESP_SIZE);
- sess->sequence_number = 1;
-
- if (strncmp(pw_buf, key, CIFS_AUTH_RESP_SIZE) != 0) {
- ksmbd_debug(AUTH, "ntlmv1 authentication failed\n");
- return -EINVAL;
- }
-
- ksmbd_debug(AUTH, "ntlmv1 authentication pass\n");
- return 0;
-}
-
-/**
* ksmbd_auth_ntlmv2() - NTLMv2 authentication handler
* @sess: session of connection
* @ntlmv2: NTLMv2 challenge response
@@ -442,44 +286,6 @@ out:
}
/**
- * __ksmbd_auth_ntlmv2() - NTLM2(extended security) authentication handler
- * @sess: session of connection
- * @client_nonce: client nonce from LM response.
- * @ntlm_resp: ntlm response data from client.
- *
- * Return: 0 on success, error number on error
- */
-static int __ksmbd_auth_ntlmv2(struct ksmbd_session *sess, char *client_nonce,
- char *ntlm_resp)
-{
- char sess_key[CIFS_SMB1_SESSKEY_SIZE] = {0};
- int rc;
- unsigned char p21[21];
- char key[CIFS_AUTH_RESP_SIZE];
-
- rc = ksmbd_enc_update_sess_key(sess_key,
- client_nonce,
- (char *)sess->ntlmssp.cryptkey, 8);
- if (rc) {
- pr_err("password processing failed\n");
- goto out;
- }
-
- memset(p21, '\0', 21);
- memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE);
- rc = ksmbd_enc_p24(p21, sess_key, key);
- if (rc) {
- pr_err("password processing failed\n");
- goto out;
- }
-
- if (memcmp(ntlm_resp, key, CIFS_AUTH_RESP_SIZE) != 0)
- rc = -EINVAL;
-out:
- return rc;
-}
-
-/**
* ksmbd_decode_ntlmssp_auth_blob() - helper function to construct
* authenticate blob
* @authblob: authenticate blob source pointer
@@ -492,8 +298,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
int blob_len, struct ksmbd_session *sess)
{
char *domain_name;
- unsigned int lm_off, nt_off;
- unsigned short nt_len;
+ unsigned int nt_off, dn_off;
+ unsigned short nt_len, dn_len;
int ret;
if (blob_len < sizeof(struct authenticate_message)) {
@@ -508,26 +314,17 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
return -EINVAL;
}
- lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset);
nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset);
nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length);
+ dn_off = le32_to_cpu(authblob->DomainName.BufferOffset);
+ dn_len = le16_to_cpu(authblob->DomainName.Length);
- /* process NTLM authentication */
- if (nt_len == CIFS_AUTH_RESP_SIZE) {
- if (le32_to_cpu(authblob->NegotiateFlags) &
- NTLMSSP_NEGOTIATE_EXTENDED_SEC)
- return __ksmbd_auth_ntlmv2(sess, (char *)authblob +
- lm_off, (char *)authblob + nt_off);
- else
- return ksmbd_auth_ntlm(sess, (char *)authblob +
- nt_off);
- }
+ if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len)
+ return -EINVAL;
/* TODO : use domain name that imported from configuration file */
- domain_name = smb_strndup_from_utf16((const char *)authblob +
- le32_to_cpu(authblob->DomainName.BufferOffset),
- le16_to_cpu(authblob->DomainName.Length), true,
- sess->conn->local_nls);
+ domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off,
+ dn_len, true, sess->conn->local_nls);
if (IS_ERR(domain_name))
return PTR_ERR(domain_name);
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index af086d35398a..b57a0d8a392f 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -61,6 +61,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->local_nls = load_nls_default();
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
+ conn->total_credits = 1;
+
init_waitqueue_head(&conn->req_running_q);
INIT_LIST_HEAD(&conn->conns_list);
INIT_LIST_HEAD(&conn->sessions);
@@ -296,10 +298,12 @@ int ksmbd_conn_handler_loop(void *p)
pdu_size = get_rfc1002_len(hdr_buf);
ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size);
- /* make sure we have enough to get to SMB header end */
- if (!ksmbd_pdu_size_has_room(pdu_size)) {
- ksmbd_debug(CONN, "SMB request too short (%u bytes)\n",
- pdu_size);
+ /*
+ * Check if pdu size is valid (min : smb header size,
+ * max : 0x00FFFFFF).
+ */
+ if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE ||
+ pdu_size > MAX_STREAM_PROT_LEN) {
continue;
}
diff --git a/fs/ksmbd/crypto_ctx.c b/fs/ksmbd/crypto_ctx.c
index 5f4b1008d17e..81488d04199d 100644
--- a/fs/ksmbd/crypto_ctx.c
+++ b/fs/ksmbd/crypto_ctx.c
@@ -81,12 +81,6 @@ static struct shash_desc *alloc_shash_desc(int id)
case CRYPTO_SHASH_SHA512:
tfm = crypto_alloc_shash("sha512", 0, 0);
break;
- case CRYPTO_SHASH_MD4:
- tfm = crypto_alloc_shash("md4", 0, 0);
- break;
- case CRYPTO_SHASH_MD5:
- tfm = crypto_alloc_shash("md5", 0, 0);
- break;
default:
return NULL;
}
@@ -214,16 +208,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void)
return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512);
}
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void)
-{
- return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD4);
-}
-
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void)
-{
- return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD5);
-}
-
static struct ksmbd_crypto_ctx *____crypto_aead_ctx_find(int id)
{
struct ksmbd_crypto_ctx *ctx;
diff --git a/fs/ksmbd/crypto_ctx.h b/fs/ksmbd/crypto_ctx.h
index ef11154b43df..4a367c62f653 100644
--- a/fs/ksmbd/crypto_ctx.h
+++ b/fs/ksmbd/crypto_ctx.h
@@ -15,8 +15,6 @@ enum {
CRYPTO_SHASH_CMACAES,
CRYPTO_SHASH_SHA256,
CRYPTO_SHASH_SHA512,
- CRYPTO_SHASH_MD4,
- CRYPTO_SHASH_MD5,
CRYPTO_SHASH_MAX,
};
@@ -43,8 +41,6 @@ struct ksmbd_crypto_ctx {
#define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES])
#define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256])
#define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512])
-#define CRYPTO_MD4(c) ((c)->desc[CRYPTO_SHASH_MD4])
-#define CRYPTO_MD5(c) ((c)->desc[CRYPTO_SHASH_MD5])
#define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm)
#define CRYPTO_HMACSHA256_TFM(c)\
@@ -52,8 +48,6 @@ struct ksmbd_crypto_ctx {
#define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm)
#define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm)
#define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm)
-#define CRYPTO_MD4_TFM(c) ((c)->desc[CRYPTO_SHASH_MD4]->tfm)
-#define CRYPTO_MD5_TFM(c) ((c)->desc[CRYPTO_SHASH_MD5]->tfm)
#define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM])
#define CRYPTO_CCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_CCM])
@@ -64,8 +58,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void);
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void);
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void);
void ksmbd_crypto_destroy(void);
diff --git a/fs/ksmbd/glob.h b/fs/ksmbd/glob.h
index 49a5a3afa118..5b8f3e0ebdb3 100644
--- a/fs/ksmbd/glob.h
+++ b/fs/ksmbd/glob.h
@@ -12,7 +12,7 @@
#include "unicode.h"
#include "vfs_cache.h"
-#define KSMBD_VERSION "3.1.9"
+#define KSMBD_VERSION "3.4.2"
extern int ksmbd_debug_types;
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index 2fbe2bc1e093..c6718a05d347 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -211,6 +211,7 @@ struct ksmbd_tree_disconnect_request {
*/
struct ksmbd_logout_request {
__s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
+ __u32 account_flags;
};
/*
@@ -317,6 +318,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_USER_FLAG_BAD_UID BIT(2)
#define KSMBD_USER_FLAG_BAD_USER BIT(3)
#define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4)
+#define KSMBD_USER_FLAG_DELAY_SESSION BIT(5)
/*
* Share config flags.
diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c
index d21629ae5c89..1019d3677d55 100644
--- a/fs/ksmbd/mgmt/user_config.c
+++ b/fs/ksmbd/mgmt/user_config.c
@@ -55,7 +55,7 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp)
void ksmbd_free_user(struct ksmbd_user *user)
{
- ksmbd_ipc_logout_request(user->name);
+ ksmbd_ipc_logout_request(user->name, user->flags);
kfree(user->name);
kfree(user->passkey);
kfree(user);
diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h
index b2bb074a0150..aff80b029579 100644
--- a/fs/ksmbd/mgmt/user_config.h
+++ b/fs/ksmbd/mgmt/user_config.h
@@ -18,6 +18,7 @@ struct ksmbd_user {
size_t passkey_sz;
char *passkey;
+ unsigned int failed_login_count;
};
static inline bool user_guest(struct ksmbd_user *user)
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index 0b307ca28a19..60e7ac62c917 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -158,25 +158,18 @@ out:
* Return : windows path string or error
*/
-char *convert_to_nt_pathname(char *filename, char *sharepath)
+char *convert_to_nt_pathname(char *filename)
{
char *ab_pathname;
- int len, name_len;
- name_len = strlen(filename);
- ab_pathname = kmalloc(name_len, GFP_KERNEL);
+ if (strlen(filename) == 0)
+ filename = "\\";
+
+ ab_pathname = kstrdup(filename, GFP_KERNEL);
if (!ab_pathname)
return NULL;
- ab_pathname[0] = '\\';
- ab_pathname[1] = '\0';
-
- len = strlen(sharepath);
- if (!strncmp(filename, sharepath, len) && name_len != len) {
- strscpy(ab_pathname, &filename[len], name_len);
- ksmbd_conv_path_to_windows(ab_pathname);
- }
-
+ ksmbd_conv_path_to_windows(ab_pathname);
return ab_pathname;
}
@@ -240,7 +233,7 @@ char *ksmbd_extract_sharename(char *treename)
*
* Return: converted name on success, otherwise NULL
*/
-char *convert_to_unix_name(struct ksmbd_share_config *share, char *name)
+char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name)
{
int no_slash = 0, name_len, path_len;
char *new_name;
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
index af8717d4d85b..253366bd0951 100644
--- a/fs/ksmbd/misc.h
+++ b/fs/ksmbd/misc.h
@@ -14,13 +14,13 @@ struct ksmbd_file;
int match_pattern(const char *str, size_t len, const char *pattern);
int ksmbd_validate_filename(char *filename);
int parse_stream_name(char *filename, char **stream_name, int *s_type);
-char *convert_to_nt_pathname(char *filename, char *sharepath);
+char *convert_to_nt_pathname(char *filename);
int get_nlink(struct kstat *st);
void ksmbd_conv_path_to_unix(char *path);
void ksmbd_strip_last_slash(char *path);
void ksmbd_conv_path_to_windows(char *path);
char *ksmbd_extract_sharename(char *treename);
-char *convert_to_unix_name(struct ksmbd_share_config *share, char *name);
+char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name);
#define KSMBD_DIR_INFO_ALIGNMENT 8
struct ksmbd_dir_info;
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 16b6236d1bd2..f9dae6ef2115 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -1451,26 +1451,47 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
*/
struct create_context *smb2_find_context_vals(void *open_req, const char *tag)
{
- char *data_offset;
struct create_context *cc;
unsigned int next = 0;
char *name;
struct smb2_create_req *req = (struct smb2_create_req *)open_req;
+ unsigned int remain_len, name_off, name_len, value_off, value_len,
+ cc_len;
- data_offset = (char *)req + 4 + le32_to_cpu(req->CreateContextsOffset);
- cc = (struct create_context *)data_offset;
+ /*
+ * CreateContextsOffset and CreateContextsLength are guaranteed to
+ * be valid because of ksmbd_smb2_check_message().
+ */
+ cc = (struct create_context *)((char *)req + 4 +
+ le32_to_cpu(req->CreateContextsOffset));
+ remain_len = le32_to_cpu(req->CreateContextsLength);
do {
- int val;
-
cc = (struct create_context *)((char *)cc + next);
- name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- val = le16_to_cpu(cc->NameLength);
- if (val < 4)
+ if (remain_len < offsetof(struct create_context, Buffer))
return ERR_PTR(-EINVAL);
- if (memcmp(name, tag, val) == 0)
- return cc;
next = le32_to_cpu(cc->Next);
+ name_off = le16_to_cpu(cc->NameOffset);
+ name_len = le16_to_cpu(cc->NameLength);
+ value_off = le16_to_cpu(cc->DataOffset);
+ value_len = le32_to_cpu(cc->DataLength);
+ cc_len = next ? next : remain_len;
+
+ if ((next & 0x7) != 0 ||
+ next > remain_len ||
+ name_off != offsetof(struct create_context, Buffer) ||
+ name_len < 4 ||
+ name_off + name_len > cc_len ||
+ (value_off & 0x7) != 0 ||
+ (value_off && (value_off < name_off + name_len)) ||
+ ((u64)value_off + value_len > cc_len))
+ return ERR_PTR(-EINVAL);
+
+ name = (char *)cc + name_off;
+ if (memcmp(name, tag, name_len) == 0)
+ return cc;
+
+ remain_len -= next;
} while (next != 0);
return NULL;
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index e6a9f6aa47eb..2a2b2135bfde 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -584,6 +584,9 @@ static int __init ksmbd_server_init(void)
ret = ksmbd_workqueue_init();
if (ret)
goto err_crypto_destroy;
+
+ pr_warn_once("The ksmbd server is experimental, use at your own risk.\n");
+
return 0;
err_crypto_destroy:
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index 9aa46bb3e10d..030ca57c3784 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -80,18 +80,21 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
};
/*
- * Returns the pointer to the beginning of the data area. Length of the data
- * area and the offset to it (from the beginning of the smb are also returned.
+ * Set length of the data area and the offset to arguments.
+ * if they are invalid, return error.
*/
-static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
+ struct smb2_hdr *hdr)
{
+ int ret = 0;
+
*off = 0;
*len = 0;
/* error reqeusts do not have data area */
if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
(((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE)
- return NULL;
+ return ret;
/*
* Following commands have data areas so we have to get the location
@@ -165,69 +168,60 @@ static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
case SMB2_IOCTL:
*off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset);
*len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount);
-
break;
default:
ksmbd_debug(SMB, "no length check for command\n");
break;
}
- /*
- * Invalid length or offset probably means data area is invalid, but
- * we have little choice but to ignore the data area in this case.
- */
if (*off > 4096) {
- ksmbd_debug(SMB, "offset %d too large, data area ignored\n",
- *off);
- *len = 0;
- *off = 0;
- } else if (*off < 0) {
- ksmbd_debug(SMB,
- "negative offset %d to data invalid ignore data area\n",
- *off);
- *off = 0;
- *len = 0;
- } else if (*len < 0) {
- ksmbd_debug(SMB,
- "negative data length %d invalid, data area ignored\n",
- *len);
- *len = 0;
- } else if (*len > 128 * 1024) {
- ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len);
- *len = 0;
+ ksmbd_debug(SMB, "offset %d too large\n", *off);
+ ret = -EINVAL;
+ } else if ((u64)*off + *len > MAX_STREAM_PROT_LEN) {
+ ksmbd_debug(SMB, "Request is larger than maximum stream protocol length(%u): %llu\n",
+ MAX_STREAM_PROT_LEN, (u64)*off + *len);
+ ret = -EINVAL;
}
- /* return pointer to beginning of data area, ie offset from SMB start */
- if ((*off != 0) && (*len != 0))
- return (char *)hdr + *off;
- else
- return NULL;
+ return ret;
}
/*
* Calculate the size of the SMB message based on the fixed header
* portion, the number of word parameters and the data portion of the message.
*/
-static unsigned int smb2_calc_size(void *buf)
+static int smb2_calc_size(void *buf, unsigned int *len)
{
struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
struct smb2_hdr *hdr = &pdu->hdr;
- int offset; /* the offset from the beginning of SMB to data area */
- int data_length; /* the length of the variable length data area */
+ unsigned int offset; /* the offset from the beginning of SMB to data area */
+ unsigned int data_length; /* the length of the variable length data area */
+ int ret;
+
/* Structure Size has already been checked to make sure it is 64 */
- int len = le16_to_cpu(hdr->StructureSize);
+ *len = le16_to_cpu(hdr->StructureSize);
/*
* StructureSize2, ie length of fixed parameter area has already
* been checked to make sure it is the correct length.
*/
- len += le16_to_cpu(pdu->StructureSize2);
+ *len += le16_to_cpu(pdu->StructureSize2);
+ /*
+ * StructureSize2 of smb2_lock pdu is set to 48, indicating
+ * the size of smb2 lock request with single smb2_lock_element
+ * regardless of number of locks. Subtract single
+ * smb2_lock_element for correct buffer size check.
+ */
+ if (hdr->Command == SMB2_LOCK)
+ *len -= sizeof(struct smb2_lock_element);
if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false)
goto calc_size_exit;
- smb2_get_data_area_len(&offset, &data_length, hdr);
- ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length,
+ ret = smb2_get_data_area_len(&offset, &data_length, hdr);
+ if (ret)
+ return ret;
+ ksmbd_debug(SMB, "SMB2 data length %u offset %u\n", data_length,
offset);
if (data_length > 0) {
@@ -237,16 +231,19 @@ static unsigned int smb2_calc_size(void *buf)
* for some commands, typically those with odd StructureSize,
* so we must add one to the calculation.
*/
- if (offset + 1 < len)
+ if (offset + 1 < *len) {
ksmbd_debug(SMB,
- "data area offset %d overlaps SMB2 header %d\n",
- offset + 1, len);
- else
- len = offset + data_length;
+ "data area offset %d overlaps SMB2 header %u\n",
+ offset + 1, *len);
+ return -EINVAL;
+ }
+
+ *len = offset + data_length;
}
+
calc_size_exit:
- ksmbd_debug(SMB, "SMB2 len %d\n", len);
- return len;
+ ksmbd_debug(SMB, "SMB2 len %u\n", *len);
+ return 0;
}
static inline int smb2_query_info_req_len(struct smb2_query_info_req *h)
@@ -287,11 +284,13 @@ static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h)
le32_to_cpu(h->MaxOutputResponse);
}
-static int smb2_validate_credit_charge(struct smb2_hdr *hdr)
+static int smb2_validate_credit_charge(struct ksmbd_conn *conn,
+ struct smb2_hdr *hdr)
{
- int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
- int credit_charge = le16_to_cpu(hdr->CreditCharge);
+ unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
+ unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge);
void *__hdr = hdr;
+ int ret;
switch (hdr->Command) {
case SMB2_QUERY_INFO:
@@ -313,21 +312,37 @@ static int smb2_validate_credit_charge(struct smb2_hdr *hdr)
req_len = smb2_ioctl_req_len(__hdr);
expect_resp_len = smb2_ioctl_resp_len(__hdr);
break;
- default:
+ case SMB2_CANCEL:
return 0;
+ default:
+ req_len = 1;
+ break;
}
- credit_charge = max(1, credit_charge);
- max_len = max(req_len, expect_resp_len);
+ credit_charge = max_t(unsigned short, credit_charge, 1);
+ max_len = max_t(unsigned int, req_len, expect_resp_len);
calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE);
if (credit_charge < calc_credit_num) {
- pr_err("Insufficient credit charge, given: %d, needed: %d\n",
- credit_charge, calc_credit_num);
+ ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n",
+ credit_charge, calc_credit_num);
+ return 1;
+ } else if (credit_charge > conn->max_credits) {
+ ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge);
return 1;
}
- return 0;
+ spin_lock(&conn->credits_lock);
+ if (credit_charge <= conn->total_credits) {
+ conn->total_credits -= credit_charge;
+ ret = 0;
+ } else {
+ ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n",
+ credit_charge, conn->total_credits);
+ ret = 1;
+ }
+ spin_unlock(&conn->credits_lock);
+ return ret;
}
int ksmbd_smb2_check_message(struct ksmbd_work *work)
@@ -385,24 +400,20 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
}
}
- if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
- smb2_validate_credit_charge(hdr)) {
- work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ if (smb2_calc_size(hdr, &clc_len))
return 1;
- }
- clc_len = smb2_calc_size(hdr);
if (len != clc_len) {
- /* server can return one byte more due to implied bcc[0] */
+ /* client can return one byte more due to implied bcc[0] */
if (clc_len == len + 1)
- return 0;
+ goto validate_credit;
/*
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
if (ALIGN(clc_len, 8) == len)
- return 0;
+ goto validate_credit;
/*
* windows client also pad up to 8 bytes when compounding.
@@ -415,12 +426,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
"cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
len, clc_len, command,
le64_to_cpu(hdr->MessageId));
- return 0;
+ goto validate_credit;
}
- if (command == SMB2_LOCK_HE && len == 88)
- return 0;
-
ksmbd_debug(SMB,
"cli req too short, len %d not %d. cmd:%d mid:%llu\n",
len, clc_len, command,
@@ -429,6 +437,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
return 1;
}
+validate_credit:
+ if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
+ smb2_validate_credit_charge(work->conn, hdr)) {
+ work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ return 1;
+ }
+
return 0;
}
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index 197473871aa4..fb6a65d23139 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -187,11 +187,6 @@ static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = {
[SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify},
};
-int init_smb2_0_server(struct ksmbd_conn *conn)
-{
- return -EOPNOTSUPP;
-}
-
/**
* init_smb2_1_server() - initialize a smb server connection with smb2.1
* command dispatcher
@@ -289,6 +284,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
void init_smb2_max_read_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_read_size = sz;
smb30_server_values.max_read_size = sz;
smb302_server_values.max_read_size = sz;
@@ -297,6 +293,7 @@ void init_smb2_max_read_size(unsigned int sz)
void init_smb2_max_write_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_write_size = sz;
smb30_server_values.max_write_size = sz;
smb302_server_values.max_write_size = sz;
@@ -305,6 +302,7 @@ void init_smb2_max_write_size(unsigned int sz)
void init_smb2_max_trans_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_trans_size = sz;
smb30_server_values.max_trans_size = sz;
smb302_server_values.max_trans_size = sz;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index c86164dc70bb..7e448df3f847 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -236,9 +236,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
if (conn->need_neg == false)
return -EINVAL;
- if (!(conn->dialect >= SMB20_PROT_ID &&
- conn->dialect <= SMB311_PROT_ID))
- return -EINVAL;
rsp_hdr = work->response_buf;
@@ -295,22 +292,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
return 0;
}
-static int smb2_consume_credit_charge(struct ksmbd_work *work,
- unsigned short credit_charge)
-{
- struct ksmbd_conn *conn = work->conn;
- unsigned int rsp_credits = 1;
-
- if (!conn->total_credits)
- return 0;
-
- if (credit_charge > 0)
- rsp_credits = credit_charge;
-
- conn->total_credits -= rsp_credits;
- return rsp_credits;
-}
-
/**
* smb2_set_rsp_credits() - set number of credits in response buffer
* @work: smb work containing smb response buffer
@@ -320,49 +301,43 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work);
struct smb2_hdr *hdr = ksmbd_resp_buf_next(work);
struct ksmbd_conn *conn = work->conn;
- unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest);
- unsigned short credit_charge = 1, credits_granted = 0;
- unsigned short aux_max, aux_credits, min_credits;
- int rsp_credit_charge;
+ unsigned short credits_requested;
+ unsigned short credit_charge, credits_granted = 0;
+ unsigned short aux_max, aux_credits;
- if (hdr->Command == SMB2_CANCEL)
- goto out;
+ if (work->send_no_response)
+ return 0;
- /* get default minimum credits by shifting maximum credits by 4 */
- min_credits = conn->max_credits >> 4;
+ hdr->CreditCharge = req_hdr->CreditCharge;
- if (conn->total_credits >= conn->max_credits) {
+ if (conn->total_credits > conn->max_credits) {
+ hdr->CreditRequest = 0;
pr_err("Total credits overflow: %d\n", conn->total_credits);
- conn->total_credits = min_credits;
- }
-
- rsp_credit_charge =
- smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge));
- if (rsp_credit_charge < 0)
return -EINVAL;
+ }
- hdr->CreditCharge = cpu_to_le16(rsp_credit_charge);
+ credit_charge = max_t(unsigned short,
+ le16_to_cpu(req_hdr->CreditCharge), 1);
+ credits_requested = max_t(unsigned short,
+ le16_to_cpu(req_hdr->CreditRequest), 1);
- if (credits_requested > 0) {
- aux_credits = credits_requested - 1;
- aux_max = 32;
- if (hdr->Command == SMB2_NEGOTIATE)
- aux_max = 0;
- aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max;
- credits_granted = aux_credits + credit_charge;
+ /* according to smb2.credits smbtorture, Windows server
+ * 2016 or later grant up to 8192 credits at once.
+ *
+ * TODO: Need to adjuct CreditRequest value according to
+ * current cpu load
+ */
+ aux_credits = credits_requested - 1;
+ if (hdr->Command == SMB2_NEGOTIATE)
+ aux_max = 0;
+ else
+ aux_max = conn->max_credits - credit_charge;
+ aux_credits = min_t(unsigned short, aux_credits, aux_max);
+ credits_granted = credit_charge + aux_credits;
- /* if credits granted per client is getting bigger than default
- * minimum credits then we should wrap it up within the limits.
- */
- if ((conn->total_credits + credits_granted) > min_credits)
- credits_granted = min_credits - conn->total_credits;
- /*
- * TODO: Need to adjuct CreditRequest value according to
- * current cpu load
- */
- } else if (conn->total_credits == 0) {
- credits_granted = 1;
- }
+ if (conn->max_credits - conn->total_credits < credits_granted)
+ credits_granted = conn->max_credits -
+ conn->total_credits;
conn->total_credits += credits_granted;
work->credits_granted += credits_granted;
@@ -371,7 +346,6 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
/* Update CreditRequest in last request */
hdr->CreditRequest = cpu_to_le16(work->credits_granted);
}
-out:
ksmbd_debug(SMB,
"credits: requested[%d] granted[%d] total_granted[%d]\n",
credits_requested, credits_granted,
@@ -433,7 +407,7 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
work->compound_pfid = KSMBD_NO_FID;
}
memset((char *)rsp_hdr + 4, 0, sizeof(struct smb2_hdr) + 2);
- rsp_hdr->ProtocolId = rcv_hdr->ProtocolId;
+ rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->Command = rcv_hdr->Command;
@@ -459,13 +433,28 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
bool is_chained_smb2_message(struct ksmbd_work *work)
{
struct smb2_hdr *hdr = work->request_buf;
- unsigned int len;
+ unsigned int len, next_cmd;
if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
return false;
hdr = ksmbd_req_buf_next(work);
- if (le32_to_cpu(hdr->NextCommand) > 0) {
+ next_cmd = le32_to_cpu(hdr->NextCommand);
+ if (next_cmd > 0) {
+ if ((u64)work->next_smb2_rcv_hdr_off + next_cmd +
+ __SMB2_HEADER_STRUCTURE_SIZE >
+ get_rfc1002_len(work->request_buf)) {
+ pr_err("next command(%u) offset exceeds smb msg size\n",
+ next_cmd);
+ return false;
+ }
+
+ if ((u64)get_rfc1002_len(work->response_buf) + MAX_CIFS_SMALL_BUFFER_SIZE >
+ work->response_sz) {
+ pr_err("next response offset exceeds response buffer size\n");
+ return false;
+ }
+
ksmbd_debug(SMB, "got SMB2 chained command\n");
init_chained_smb2_rsp(work);
return true;
@@ -535,7 +524,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
{
struct smb2_hdr *hdr = work->request_buf;
size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
- size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE;
+ size_t large_sz = small_sz + work->conn->vals->max_trans_size;
size_t sz = small_sz;
int cmd = le16_to_cpu(hdr->Command);
@@ -634,7 +623,7 @@ static char *
smb2_get_name(struct ksmbd_share_config *share, const char *src,
const int maxlen, struct nls_table *local_nls)
{
- char *name, *unixname;
+ char *name;
name = smb_strndup_from_utf16(src, maxlen, 1, local_nls);
if (IS_ERR(name)) {
@@ -642,19 +631,9 @@ smb2_get_name(struct ksmbd_share_config *share, const char *src,
return name;
}
- /* change it to absolute unix name */
ksmbd_conv_path_to_unix(name);
ksmbd_strip_last_slash(name);
-
- unixname = convert_to_unix_name(share, name);
- kfree(name);
- if (!unixname) {
- pr_err("can not convert absolute name\n");
- return ERR_PTR(-ENOMEM);
- }
-
- ksmbd_debug(SMB, "absolute name = %s\n", unixname);
- return unixname;
+ return name;
}
int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
@@ -1068,6 +1047,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
struct smb2_negotiate_req *req = work->request_buf;
struct smb2_negotiate_rsp *rsp = work->response_buf;
int rc = 0;
+ unsigned int smb2_buf_len, smb2_neg_size;
__le32 status;
ksmbd_debug(SMB, "Received negotiate request\n");
@@ -1085,6 +1065,44 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
goto err_out;
}
+ smb2_buf_len = get_rfc1002_len(work->request_buf);
+ smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects) - 4;
+ if (smb2_neg_size > smb2_buf_len) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ if (conn->dialect == SMB311_PROT_ID) {
+ unsigned int nego_ctxt_off = le32_to_cpu(req->NegotiateContextOffset);
+
+ if (smb2_buf_len < nego_ctxt_off) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ if (smb2_neg_size > nego_ctxt_off) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) >
+ nego_ctxt_off) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ rc = -EINVAL;
+ goto err_out;
+ }
+ } else {
+ if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) >
+ smb2_buf_len) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ rc = -EINVAL;
+ goto err_out;
+ }
+ }
+
conn->cli_cap = le32_to_cpu(req->Capabilities);
switch (conn->dialect) {
case SMB311_PROT_ID:
@@ -1128,13 +1146,6 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
case SMB21_PROT_ID:
init_smb2_1_server(conn);
break;
- case SMB20_PROT_ID:
- rc = init_smb2_0_server(conn);
- if (rc) {
- rsp->hdr.Status = STATUS_NOT_SUPPORTED;
- goto err_out;
- }
- break;
case SMB2X_PROT_ID:
case BAD_PROT_ID:
default:
@@ -1153,11 +1164,9 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size);
rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size);
- if (conn->dialect > SMB20_PROT_ID) {
- memcpy(conn->ClientGUID, req->ClientGUID,
- SMB2_CLIENT_GUID_SIZE);
- conn->cli_sec_mode = le16_to_cpu(req->SecurityMode);
- }
+ memcpy(conn->ClientGUID, req->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE);
+ conn->cli_sec_mode = le16_to_cpu(req->SecurityMode);
rsp->StructureSize = cpu_to_le16(65);
rsp->DialectRevision = cpu_to_le16(conn->dialect);
@@ -1248,19 +1257,13 @@ static int generate_preauth_hash(struct ksmbd_work *work)
return 0;
}
-static int decode_negotiation_token(struct ksmbd_work *work,
- struct negotiate_message *negblob)
+static int decode_negotiation_token(struct ksmbd_conn *conn,
+ struct negotiate_message *negblob,
+ size_t sz)
{
- struct ksmbd_conn *conn = work->conn;
- struct smb2_sess_setup_req *req;
- int sz;
-
if (!conn->use_spnego)
return -EINVAL;
- req = work->request_buf;
- sz = le16_to_cpu(req->SecurityBufferLength);
-
if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) {
if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) {
conn->auth_mechs |= KSMBD_AUTH_NTLMSSP;
@@ -1272,9 +1275,9 @@ static int decode_negotiation_token(struct ksmbd_work *work,
}
static int ntlm_negotiate(struct ksmbd_work *work,
- struct negotiate_message *negblob)
+ struct negotiate_message *negblob,
+ size_t negblob_len)
{
- struct smb2_sess_setup_req *req = work->request_buf;
struct smb2_sess_setup_rsp *rsp = work->response_buf;
struct challenge_message *chgblob;
unsigned char *spnego_blob = NULL;
@@ -1283,8 +1286,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
int sz, rc;
ksmbd_debug(SMB, "negotiate phase\n");
- sz = le16_to_cpu(req->SecurityBufferLength);
- rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess);
+ rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess);
if (rc)
return rc;
@@ -1352,12 +1354,23 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
struct authenticate_message *authblob;
struct ksmbd_user *user;
char *name;
- int sz;
+ unsigned int auth_msg_len, name_off, name_len, secbuf_len;
+ secbuf_len = le16_to_cpu(req->SecurityBufferLength);
+ if (secbuf_len < sizeof(struct authenticate_message)) {
+ ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len);
+ return NULL;
+ }
authblob = user_authblob(conn, req);
- sz = le32_to_cpu(authblob->UserName.BufferOffset);
- name = smb_strndup_from_utf16((const char *)authblob + sz,
- le16_to_cpu(authblob->UserName.Length),
+ name_off = le32_to_cpu(authblob->UserName.BufferOffset);
+ name_len = le16_to_cpu(authblob->UserName.Length);
+ auth_msg_len = le16_to_cpu(req->SecurityBufferOffset) + secbuf_len;
+
+ if (auth_msg_len < (u64)name_off + name_len)
+ return NULL;
+
+ name = smb_strndup_from_utf16((const char *)authblob + name_off,
+ name_len,
true,
conn->local_nls);
if (IS_ERR(name)) {
@@ -1499,11 +1512,9 @@ binding_session:
}
}
- if (conn->dialect > SMB20_PROT_ID) {
- if (!ksmbd_conn_lookup_dialect(conn)) {
- pr_err("fail to verify the dialect\n");
- return -ENOENT;
- }
+ if (!ksmbd_conn_lookup_dialect(conn)) {
+ pr_err("fail to verify the dialect\n");
+ return -ENOENT;
}
return 0;
}
@@ -1585,11 +1596,9 @@ static int krb5_authenticate(struct ksmbd_work *work)
}
}
- if (conn->dialect > SMB20_PROT_ID) {
- if (!ksmbd_conn_lookup_dialect(conn)) {
- pr_err("fail to verify the dialect\n");
- return -ENOENT;
- }
+ if (!ksmbd_conn_lookup_dialect(conn)) {
+ pr_err("fail to verify the dialect\n");
+ return -ENOENT;
}
return 0;
}
@@ -1607,6 +1616,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
struct smb2_sess_setup_rsp *rsp = work->response_buf;
struct ksmbd_session *sess;
struct negotiate_message *negblob;
+ unsigned int negblob_len, negblob_off;
int rc = 0;
ksmbd_debug(SMB, "Received request for session setup\n");
@@ -1687,10 +1697,16 @@ int smb2_sess_setup(struct ksmbd_work *work)
if (sess->state == SMB2_SESSION_EXPIRED)
sess->state = SMB2_SESSION_IN_PROGRESS;
+ negblob_off = le16_to_cpu(req->SecurityBufferOffset);
+ negblob_len = le16_to_cpu(req->SecurityBufferLength);
+ if (negblob_off < (offsetof(struct smb2_sess_setup_req, Buffer) - 4) ||
+ negblob_len < offsetof(struct negotiate_message, NegotiateFlags))
+ return -EINVAL;
+
negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId +
- le16_to_cpu(req->SecurityBufferOffset));
+ negblob_off);
- if (decode_negotiation_token(work, negblob) == 0) {
+ if (decode_negotiation_token(conn, negblob, negblob_len) == 0) {
if (conn->mechToken)
negblob = (struct negotiate_message *)conn->mechToken;
}
@@ -1714,7 +1730,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
sess->Preauth_HashValue = NULL;
} else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
if (negblob->MessageType == NtLmNegotiate) {
- rc = ntlm_negotiate(work, negblob);
+ rc = ntlm_negotiate(work, negblob, negblob_len);
if (rc)
goto out_err;
rsp->hdr.Status =
@@ -1774,9 +1790,30 @@ out_err:
conn->mechToken = NULL;
}
- if (rc < 0 && sess) {
- ksmbd_session_destroy(sess);
- work->sess = NULL;
+ if (rc < 0) {
+ /*
+ * SecurityBufferOffset should be set to zero
+ * in session setup error response.
+ */
+ rsp->SecurityBufferOffset = 0;
+
+ if (sess) {
+ bool try_delay = false;
+
+ /*
+ * To avoid dictionary attacks (repeated session setups rapidly sent) to
+ * connect to server, ksmbd make a delay of a 5 seconds on session setup
+ * failure to make it harder to send enough random connection requests
+ * to break into a server.
+ */
+ if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION)
+ try_delay = true;
+
+ ksmbd_session_destroy(sess);
+ work->sess = NULL;
+ if (try_delay)
+ ssleep(5);
+ }
}
return rc;
@@ -2103,16 +2140,22 @@ out:
* smb2_set_ea() - handler for setting extended attributes using set
* info command
* @eabuf: set info command buffer
+ * @buf_len: set info command buffer length
* @path: dentry path for get ea
*
* Return: 0 on success, otherwise error
*/
-static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path)
+static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
+ struct path *path)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
char *attr_name = NULL, *value;
int rc = 0;
- int next = 0;
+ unsigned int next = 0;
+
+ if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength +
+ le16_to_cpu(eabuf->EaValueLength))
+ return -EINVAL;
attr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL);
if (!attr_name)
@@ -2177,7 +2220,13 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path)
next:
next = le32_to_cpu(eabuf->NextEntryOffset);
+ if (next == 0 || buf_len < next)
+ break;
+ buf_len -= next;
eabuf = (struct smb2_ea_info *)((char *)eabuf + next);
+ if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength))
+ break;
+
} while (next != 0);
kfree(attr_name);
@@ -2348,7 +2397,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
return rc;
}
- rc = ksmbd_vfs_kern_path(name, 0, path, 0);
+ rc = ksmbd_vfs_kern_path(work, name, 0, path, 0);
if (rc) {
pr_err("cannot get linux path (%s), err = %d\n",
name, rc);
@@ -2377,6 +2426,10 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work,
ksmbd_debug(SMB,
"Set ACLs using SMB2_CREATE_SD_BUFFER context\n");
sd_buf = (struct create_sd_buf_req *)context;
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_sd_buf_req))
+ return -EINVAL;
return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd,
le32_to_cpu(sd_buf->ccontext.DataLength), true);
}
@@ -2423,7 +2476,7 @@ int smb2_open(struct ksmbd_work *work)
struct oplock_info *opinfo;
__le32 *next_ptr = NULL;
int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
- int rc = 0, len = 0;
+ int rc = 0;
int contxt_cnt = 0, query_disk_id = 0;
int maximal_access_ctxt = 0, posix_ctxt = 0;
int s_type = 0;
@@ -2495,17 +2548,11 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
} else {
- len = strlen(share->path);
- ksmbd_debug(SMB, "share path len %d\n", len);
- name = kmalloc(len + 1, GFP_KERNEL);
+ name = kstrdup("", GFP_KERNEL);
if (!name) {
- rsp->hdr.Status = STATUS_NO_MEMORY;
rc = -ENOMEM;
goto err_out1;
}
-
- memcpy(name, share->path, len);
- *(name + len) = '\0';
}
req_op_level = req->RequestedOplockLevel;
@@ -2577,6 +2624,12 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
} else if (context) {
ea_buf = (struct create_ea_buf_req *)context;
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_ea_buf_req)) {
+ rc = -EINVAL;
+ goto err_out1;
+ }
if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) {
rsp->hdr.Status = STATUS_ACCESS_DENIED;
rc = -EACCES;
@@ -2615,6 +2668,12 @@ int smb2_open(struct ksmbd_work *work)
} else if (context) {
struct create_posix *posix =
(struct create_posix *)context;
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_posix)) {
+ rc = -EINVAL;
+ goto err_out1;
+ }
ksmbd_debug(SMB, "get posix context\n");
posix_mode = le32_to_cpu(posix->Mode);
@@ -2628,13 +2687,9 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
- if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) {
- /*
- * On delete request, instead of following up, need to
- * look the current entity
- */
- rc = ksmbd_vfs_kern_path(name, 0, &path, 1);
- if (!rc) {
+ rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
+ if (!rc) {
+ if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) {
/*
* If file exists with under flags, return access
* denied error.
@@ -2653,34 +2708,16 @@ int smb2_open(struct ksmbd_work *work)
path_put(&path);
goto err_out;
}
- }
- } else {
- if (test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) {
- /*
- * Use LOOKUP_FOLLOW to follow the path of
- * symlink in path buildup
- */
- rc = ksmbd_vfs_kern_path(name, LOOKUP_FOLLOW, &path, 1);
- if (rc) { /* Case for broken link ?*/
- rc = ksmbd_vfs_kern_path(name, 0, &path, 1);
- }
- } else {
- rc = ksmbd_vfs_kern_path(name, 0, &path, 1);
- if (!rc && d_is_symlink(path.dentry)) {
- rc = -EACCES;
- path_put(&path);
- goto err_out;
- }
+ } else if (d_is_symlink(path.dentry)) {
+ rc = -EACCES;
+ path_put(&path);
+ goto err_out;
}
}
if (rc) {
- if (rc == -EACCES) {
- ksmbd_debug(SMB,
- "User does not have right permission\n");
+ if (rc != -ENOENT)
goto err_out;
- }
ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n",
name, rc);
rc = 0;
@@ -2786,7 +2823,15 @@ int smb2_open(struct ksmbd_work *work)
created = true;
user_ns = mnt_user_ns(path.mnt);
if (ea_buf) {
- rc = smb2_set_ea(&ea_buf->ea, &path);
+ if (le32_to_cpu(ea_buf->ccontext.DataLength) <
+ sizeof(struct smb2_ea_info)) {
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ rc = smb2_set_ea(&ea_buf->ea,
+ le32_to_cpu(ea_buf->ccontext.DataLength),
+ &path);
if (rc == -EOPNOTSUPP)
rc = 0;
else if (rc)
@@ -3019,9 +3064,16 @@ int smb2_open(struct ksmbd_work *work)
rc = PTR_ERR(az_req);
goto err_out;
} else if (az_req) {
- loff_t alloc_size = le64_to_cpu(az_req->AllocationSize);
+ loff_t alloc_size;
int err;
+ if (le16_to_cpu(az_req->ccontext.DataOffset) +
+ le32_to_cpu(az_req->ccontext.DataLength) <
+ sizeof(struct create_alloc_size_req)) {
+ rc = -EINVAL;
+ goto err_out;
+ }
+ alloc_size = le64_to_cpu(az_req->AllocationSize);
ksmbd_debug(SMB,
"request smb2 create allocate size : %llu\n",
alloc_size);
@@ -3176,7 +3228,7 @@ err_out1:
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
else if (rc == -EOPNOTSUPP)
rsp->hdr.Status = STATUS_NOT_SUPPORTED;
- else if (rc == -EACCES || rc == -ESTALE)
+ else if (rc == -EACCES || rc == -ESTALE || rc == -EXDEV)
rsp->hdr.Status = STATUS_ACCESS_DENIED;
else if (rc == -ENOENT)
rsp->hdr.Status = STATUS_OBJECT_NAME_INVALID;
@@ -3742,6 +3794,24 @@ static int verify_info_level(int info_level)
return 0;
}
+static int smb2_calc_max_out_buf_len(struct ksmbd_work *work,
+ unsigned short hdr2_len,
+ unsigned int out_buf_len)
+{
+ int free_len;
+
+ if (out_buf_len > work->conn->vals->max_trans_size)
+ return -EINVAL;
+
+ free_len = (int)(work->response_sz -
+ (get_rfc1002_len(work->response_buf) + 4)) -
+ hdr2_len;
+ if (free_len < 0)
+ return -EINVAL;
+
+ return min_t(int, out_buf_len, free_len);
+}
+
int smb2_query_dir(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
@@ -3818,9 +3888,13 @@ int smb2_query_dir(struct ksmbd_work *work)
memset(&d_info, 0, sizeof(struct ksmbd_dir_info));
d_info.wptr = (char *)rsp->Buffer;
d_info.rptr = (char *)rsp->Buffer;
- d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4));
- d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) -
- sizeof(struct smb2_query_directory_rsp);
+ d_info.out_buf_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (d_info.out_buf_len < 0) {
+ rc = -EINVAL;
+ goto err_out;
+ }
d_info.flags = srch_flag;
/*
@@ -4041,6 +4115,10 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
path = &fp->filp->f_path;
/* single EA entry is requested with given user.* name */
if (req->InputBufferLength) {
+ if (le32_to_cpu(req->InputBufferLength) <
+ sizeof(struct smb2_ea_info_req))
+ return -EINVAL;
+
ea_req = (struct smb2_ea_info_req *)req->Buffer;
} else {
/* need to send all EAs, if no specific EA is requested*/
@@ -4050,12 +4128,11 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
le32_to_cpu(req->Flags));
}
- buf_free_len = work->response_sz -
- (get_rfc1002_len(rsp_org) + 4) -
- sizeof(struct smb2_query_info_rsp);
-
- if (le32_to_cpu(req->OutputBufferLength) < buf_free_len)
- buf_free_len = le32_to_cpu(req->OutputBufferLength);
+ buf_free_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (buf_free_len < 0)
+ return -EINVAL;
rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
if (rc < 0) {
@@ -4186,7 +4263,7 @@ static void get_file_access_info(struct smb2_query_info_rsp *rsp,
static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
struct ksmbd_file *fp, void *rsp_org)
{
- struct smb2_file_all_info *basic_info;
+ struct smb2_file_basic_info *basic_info;
struct kstat stat;
u64 time;
@@ -4196,7 +4273,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
return -EACCES;
}
- basic_info = (struct smb2_file_all_info *)rsp->Buffer;
+ basic_info = (struct smb2_file_basic_info *)rsp->Buffer;
generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
&stat);
basic_info->CreationTime = cpu_to_le64(fp->create_time);
@@ -4209,9 +4286,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
basic_info->Attributes = fp->f_ci->m_fattr;
basic_info->Pad1 = 0;
rsp->OutputBufferLength =
- cpu_to_le32(offsetof(struct smb2_file_all_info, AllocationSize));
- inc_rfc1001_len(rsp_org, offsetof(struct smb2_file_all_info,
- AllocationSize));
+ cpu_to_le32(sizeof(struct smb2_file_basic_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_basic_info));
return 0;
}
@@ -4288,8 +4364,7 @@ static int get_file_all_info(struct ksmbd_work *work,
return -EACCES;
}
- filename = convert_to_nt_pathname(fp->filename,
- work->tcon->share_conf->path);
+ filename = convert_to_nt_pathname(fp->filename);
if (!filename)
return -ENOMEM;
@@ -4368,6 +4443,8 @@ static void get_file_stream_info(struct ksmbd_work *work,
struct path *path = &fp->filp->f_path;
ssize_t xattr_list_len;
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
+ int buf_free_len;
+ struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
&stat);
@@ -4381,6 +4458,12 @@ static void get_file_stream_info(struct ksmbd_work *work,
goto out;
}
+ buf_free_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (buf_free_len < 0)
+ goto out;
+
while (idx < xattr_list_len) {
stream_name = xattr_list + idx;
streamlen = strlen(stream_name);
@@ -4405,6 +4488,10 @@ static void get_file_stream_info(struct ksmbd_work *work,
streamlen = snprintf(stream_buf, streamlen + 1,
":%s", &stream_name[XATTR_NAME_STREAM_LEN]);
+ next = sizeof(struct smb2_file_stream_info) + streamlen * 2;
+ if (next > buf_free_len)
+ break;
+
file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes];
streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
stream_buf, streamlen,
@@ -4415,22 +4502,21 @@ static void get_file_stream_info(struct ksmbd_work *work,
file_info->StreamSize = cpu_to_le64(stream_name_len);
file_info->StreamAllocationSize = cpu_to_le64(stream_name_len);
- next = sizeof(struct smb2_file_stream_info) + streamlen;
nbytes += next;
+ buf_free_len -= next;
file_info->NextEntryOffset = cpu_to_le32(next);
}
- if (nbytes) {
+ if (!S_ISDIR(stat.mode) &&
+ buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) {
file_info = (struct smb2_file_stream_info *)
&rsp->Buffer[nbytes];
streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
"::$DATA", 7, conn->local_nls, 0);
streamlen *= 2;
file_info->StreamNameLength = cpu_to_le32(streamlen);
- file_info->StreamSize = S_ISDIR(stat.mode) ? 0 :
- cpu_to_le64(stat.size);
- file_info->StreamAllocationSize = S_ISDIR(stat.mode) ? 0 :
- cpu_to_le64(stat.size);
+ file_info->StreamSize = 0;
+ file_info->StreamAllocationSize = 0;
nbytes += sizeof(struct smb2_file_stream_info) + streamlen;
}
@@ -4745,12 +4831,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
struct path path;
int rc = 0, len;
int fs_infoclass_size = 0;
- int lookup_flags = 0;
-
- if (test_share_config_flag(share, KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
- lookup_flags = LOOKUP_FOLLOW;
- rc = ksmbd_vfs_kern_path(share->path, lookup_flags, &path, 0);
+ rc = kern_path(share->path, LOOKUP_NO_SYMLINKS, &path);
if (rc) {
pr_err("cannot create vfs path\n");
return -EIO;
@@ -5299,7 +5381,7 @@ static int smb2_rename(struct ksmbd_work *work,
goto out;
len = strlen(new_name);
- if (new_name[len - 1] != '/') {
+ if (len > 0 && new_name[len - 1] != '/') {
pr_err("not allow base filename in rename\n");
rc = -ESHARE;
goto out;
@@ -5327,11 +5409,14 @@ static int smb2_rename(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "new name %s\n", new_name);
- rc = ksmbd_vfs_kern_path(new_name, 0, &path, 1);
- if (rc)
+ rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1);
+ if (rc) {
+ if (rc != -ENOENT)
+ goto out;
file_present = false;
- else
+ } else {
path_put(&path);
+ }
if (ksmbd_share_veto_filename(share, new_name)) {
rc = -ENOENT;
@@ -5371,7 +5456,7 @@ out:
static int smb2_create_link(struct ksmbd_work *work,
struct ksmbd_share_config *share,
struct smb2_file_link_info *file_info,
- struct file *filp,
+ unsigned int buf_len, struct file *filp,
struct nls_table *local_nls)
{
char *link_name = NULL, *target_name = NULL, *pathname = NULL;
@@ -5379,6 +5464,10 @@ static int smb2_create_link(struct ksmbd_work *work,
bool file_present = true;
int rc;
+ if (buf_len < (u64)sizeof(struct smb2_file_link_info) +
+ le32_to_cpu(file_info->FileNameLength))
+ return -EINVAL;
+
ksmbd_debug(SMB, "setting FILE_LINK_INFORMATION\n");
pathname = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathname)
@@ -5401,11 +5490,14 @@ static int smb2_create_link(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "target name is %s\n", target_name);
- rc = ksmbd_vfs_kern_path(link_name, 0, &path, 0);
- if (rc)
+ rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0);
+ if (rc) {
+ if (rc != -ENOENT)
+ goto out;
file_present = false;
- else
+ } else {
path_put(&path);
+ }
if (file_info->ReplaceIfExists) {
if (file_present) {
@@ -5435,12 +5527,11 @@ out:
return rc;
}
-static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
+static int set_file_basic_info(struct ksmbd_file *fp,
+ struct smb2_file_basic_info *file_info,
struct ksmbd_share_config *share)
{
- struct smb2_file_all_info *file_info;
struct iattr attrs;
- struct timespec64 ctime;
struct file *filp;
struct inode *inode;
struct user_namespace *user_ns;
@@ -5449,7 +5540,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE))
return -EACCES;
- file_info = (struct smb2_file_all_info *)buf;
attrs.ia_valid = 0;
filp = fp->filp;
inode = file_inode(filp);
@@ -5463,13 +5553,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
}
- if (file_info->ChangeTime) {
+ attrs.ia_valid |= ATTR_CTIME;
+ if (file_info->ChangeTime)
attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
- ctime = attrs.ia_ctime;
- attrs.ia_valid |= ATTR_CTIME;
- } else {
- ctime = inode->i_ctime;
- }
+ else
+ attrs.ia_ctime = inode->i_ctime;
if (file_info->LastWriteTime) {
attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime);
@@ -5515,18 +5603,17 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
return -EACCES;
inode_lock(inode);
+ inode->i_ctime = attrs.ia_ctime;
+ attrs.ia_valid &= ~ATTR_CTIME;
rc = notify_change(user_ns, dentry, &attrs, NULL);
- if (!rc) {
- inode->i_ctime = ctime;
- mark_inode_dirty(inode);
- }
inode_unlock(inode);
}
return rc;
}
static int set_file_allocation_info(struct ksmbd_work *work,
- struct ksmbd_file *fp, char *buf)
+ struct ksmbd_file *fp,
+ struct smb2_file_alloc_info *file_alloc_info)
{
/*
* TODO : It's working fine only when store dos attributes
@@ -5534,7 +5621,6 @@ static int set_file_allocation_info(struct ksmbd_work *work,
* properly with any smb.conf option
*/
- struct smb2_file_alloc_info *file_alloc_info;
loff_t alloc_blks;
struct inode *inode;
int rc;
@@ -5542,7 +5628,6 @@ static int set_file_allocation_info(struct ksmbd_work *work,
if (!(fp->daccess & FILE_WRITE_DATA_LE))
return -EACCES;
- file_alloc_info = (struct smb2_file_alloc_info *)buf;
alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9;
inode = file_inode(fp->filp);
@@ -5565,7 +5650,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
* inode size is retained by backup inode size.
*/
size = i_size_read(inode);
- rc = ksmbd_vfs_truncate(work, NULL, fp, alloc_blks * 512);
+ rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512);
if (rc) {
pr_err("truncate failed! filename : %s, err %d\n",
fp->filename, rc);
@@ -5578,9 +5663,8 @@ static int set_file_allocation_info(struct ksmbd_work *work,
}
static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
- char *buf)
+ struct smb2_file_eof_info *file_eof_info)
{
- struct smb2_file_eof_info *file_eof_info;
loff_t newsize;
struct inode *inode;
int rc;
@@ -5588,7 +5672,6 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
if (!(fp->daccess & FILE_WRITE_DATA_LE))
return -EACCES;
- file_eof_info = (struct smb2_file_eof_info *)buf;
newsize = le64_to_cpu(file_eof_info->EndOfFile);
inode = file_inode(fp->filp);
@@ -5602,7 +5685,7 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n",
fp->filename, newsize);
- rc = ksmbd_vfs_truncate(work, NULL, fp, newsize);
+ rc = ksmbd_vfs_truncate(work, fp, newsize);
if (rc) {
ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n",
fp->filename, rc);
@@ -5615,7 +5698,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
}
static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
- char *buf)
+ struct smb2_file_rename_info *rename_info,
+ unsigned int buf_len)
{
struct user_namespace *user_ns;
struct ksmbd_file *parent_fp;
@@ -5628,6 +5712,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EACCES;
}
+ if (buf_len < (u64)sizeof(struct smb2_file_rename_info) +
+ le32_to_cpu(rename_info->FileNameLength))
+ return -EINVAL;
+
user_ns = file_mnt_user_ns(fp->filp);
if (ksmbd_stream_fd(fp))
goto next;
@@ -5650,14 +5738,13 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
}
}
next:
- return smb2_rename(work, fp, user_ns,
- (struct smb2_file_rename_info *)buf,
+ return smb2_rename(work, fp, user_ns, rename_info,
work->sess->conn->local_nls);
}
-static int set_file_disposition_info(struct ksmbd_file *fp, char *buf)
+static int set_file_disposition_info(struct ksmbd_file *fp,
+ struct smb2_file_disposition_info *file_info)
{
- struct smb2_file_disposition_info *file_info;
struct inode *inode;
if (!(fp->daccess & FILE_DELETE_LE)) {
@@ -5666,7 +5753,6 @@ static int set_file_disposition_info(struct ksmbd_file *fp, char *buf)
}
inode = file_inode(fp->filp);
- file_info = (struct smb2_file_disposition_info *)buf;
if (file_info->DeletePending) {
if (S_ISDIR(inode->i_mode) &&
ksmbd_vfs_empty_dir(fp) == -ENOTEMPTY)
@@ -5678,15 +5764,14 @@ static int set_file_disposition_info(struct ksmbd_file *fp, char *buf)
return 0;
}
-static int set_file_position_info(struct ksmbd_file *fp, char *buf)
+static int set_file_position_info(struct ksmbd_file *fp,
+ struct smb2_file_pos_info *file_info)
{
- struct smb2_file_pos_info *file_info;
loff_t current_byte_offset;
unsigned long sector_size;
struct inode *inode;
inode = file_inode(fp->filp);
- file_info = (struct smb2_file_pos_info *)buf;
current_byte_offset = le64_to_cpu(file_info->CurrentByteOffset);
sector_size = inode->i_sb->s_blocksize;
@@ -5702,12 +5787,11 @@ static int set_file_position_info(struct ksmbd_file *fp, char *buf)
return 0;
}
-static int set_file_mode_info(struct ksmbd_file *fp, char *buf)
+static int set_file_mode_info(struct ksmbd_file *fp,
+ struct smb2_file_mode_info *file_info)
{
- struct smb2_file_mode_info *file_info;
__le32 mode;
- file_info = (struct smb2_file_mode_info *)buf;
mode = file_info->Mode;
if ((mode & ~FILE_MODE_INFO_MASK) ||
@@ -5737,40 +5821,74 @@ static int set_file_mode_info(struct ksmbd_file *fp, char *buf)
* TODO: need to implement an error handling for STATUS_INFO_LENGTH_MISMATCH
*/
static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
- int info_class, char *buf,
+ struct smb2_set_info_req *req,
struct ksmbd_share_config *share)
{
- switch (info_class) {
+ unsigned int buf_len = le32_to_cpu(req->BufferLength);
+
+ switch (req->FileInfoClass) {
case FILE_BASIC_INFORMATION:
- return set_file_basic_info(fp, buf, share);
+ {
+ if (buf_len < sizeof(struct smb2_file_basic_info))
+ return -EINVAL;
+ return set_file_basic_info(fp, (struct smb2_file_basic_info *)req->Buffer, share);
+ }
case FILE_ALLOCATION_INFORMATION:
- return set_file_allocation_info(work, fp, buf);
+ {
+ if (buf_len < sizeof(struct smb2_file_alloc_info))
+ return -EINVAL;
+ return set_file_allocation_info(work, fp,
+ (struct smb2_file_alloc_info *)req->Buffer);
+ }
case FILE_END_OF_FILE_INFORMATION:
- return set_end_of_file_info(work, fp, buf);
+ {
+ if (buf_len < sizeof(struct smb2_file_eof_info))
+ return -EINVAL;
+ return set_end_of_file_info(work, fp,
+ (struct smb2_file_eof_info *)req->Buffer);
+ }
case FILE_RENAME_INFORMATION:
+ {
if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
ksmbd_debug(SMB,
"User does not have write permission\n");
return -EACCES;
}
- return set_rename_info(work, fp, buf);
+ if (buf_len < sizeof(struct smb2_file_rename_info))
+ return -EINVAL;
+
+ return set_rename_info(work, fp,
+ (struct smb2_file_rename_info *)req->Buffer,
+ buf_len);
+ }
case FILE_LINK_INFORMATION:
+ {
+ if (buf_len < sizeof(struct smb2_file_link_info))
+ return -EINVAL;
+
return smb2_create_link(work, work->tcon->share_conf,
- (struct smb2_file_link_info *)buf, fp->filp,
+ (struct smb2_file_link_info *)req->Buffer,
+ buf_len, fp->filp,
work->sess->conn->local_nls);
-
+ }
case FILE_DISPOSITION_INFORMATION:
+ {
if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
ksmbd_debug(SMB,
"User does not have write permission\n");
return -EACCES;
}
- return set_file_disposition_info(fp, buf);
+ if (buf_len < sizeof(struct smb2_file_disposition_info))
+ return -EINVAL;
+
+ return set_file_disposition_info(fp,
+ (struct smb2_file_disposition_info *)req->Buffer);
+ }
case FILE_FULL_EA_INFORMATION:
{
if (!(fp->daccess & FILE_WRITE_EA_LE)) {
@@ -5779,18 +5897,29 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EACCES;
}
- return smb2_set_ea((struct smb2_ea_info *)buf,
- &fp->filp->f_path);
- }
+ if (buf_len < sizeof(struct smb2_ea_info))
+ return -EINVAL;
+ return smb2_set_ea((struct smb2_ea_info *)req->Buffer,
+ buf_len, &fp->filp->f_path);
+ }
case FILE_POSITION_INFORMATION:
- return set_file_position_info(fp, buf);
+ {
+ if (buf_len < sizeof(struct smb2_file_pos_info))
+ return -EINVAL;
+ return set_file_position_info(fp, (struct smb2_file_pos_info *)req->Buffer);
+ }
case FILE_MODE_INFORMATION:
- return set_file_mode_info(fp, buf);
+ {
+ if (buf_len < sizeof(struct smb2_file_mode_info))
+ return -EINVAL;
+
+ return set_file_mode_info(fp, (struct smb2_file_mode_info *)req->Buffer);
+ }
}
- pr_err("Unimplemented Fileinfoclass :%d\n", info_class);
+ pr_err("Unimplemented Fileinfoclass :%d\n", req->FileInfoClass);
return -EOPNOTSUPP;
}
@@ -5851,8 +5980,7 @@ int smb2_set_info(struct ksmbd_work *work)
switch (req->InfoType) {
case SMB2_O_INFO_FILE:
ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n");
- rc = smb2_set_info_file(work, fp, req->FileInfoClass,
- req->Buffer, work->tcon->share_conf);
+ rc = smb2_set_info_file(work, fp, req, work->tcon->share_conf);
break;
case SMB2_O_INFO_SECURITY:
ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n");
@@ -5879,7 +6007,7 @@ int smb2_set_info(struct ksmbd_work *work)
return 0;
err_out:
- if (rc == -EACCES || rc == -EPERM)
+ if (rc == -EACCES || rc == -EPERM || rc == -EXDEV)
rsp->hdr.Status = STATUS_ACCESS_DENIED;
else if (rc == -EINVAL)
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
@@ -6141,8 +6269,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
(offsetof(struct smb2_write_req, Buffer) - 4)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
- (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
get_rfc1002_len(req));
@@ -6300,8 +6427,7 @@ int smb2_write(struct ksmbd_work *work)
(offsetof(struct smb2_write_req, Buffer) - 4)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
- (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
get_rfc1002_len(req));
@@ -6944,24 +7070,26 @@ out2:
return err;
}
-static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
+static int fsctl_copychunk(struct ksmbd_work *work,
+ struct copychunk_ioctl_req *ci_req,
+ unsigned int cnt_code,
+ unsigned int input_count,
+ unsigned long long volatile_id,
+ unsigned long long persistent_id,
struct smb2_ioctl_rsp *rsp)
{
- struct copychunk_ioctl_req *ci_req;
struct copychunk_ioctl_rsp *ci_rsp;
struct ksmbd_file *src_fp = NULL, *dst_fp = NULL;
struct srv_copychunk *chunks;
unsigned int i, chunk_count, chunk_count_written = 0;
unsigned int chunk_size_written = 0;
loff_t total_size_written = 0;
- int ret, cnt_code;
+ int ret = 0;
- cnt_code = le32_to_cpu(req->CntCode);
- ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0];
ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0];
- rsp->VolatileFileId = req->VolatileFileId;
- rsp->PersistentFileId = req->PersistentFileId;
+ rsp->VolatileFileId = cpu_to_le64(volatile_id);
+ rsp->PersistentFileId = cpu_to_le64(persistent_id);
ci_rsp->ChunksWritten =
cpu_to_le32(ksmbd_server_side_copy_max_chunk_count());
ci_rsp->ChunkBytesWritten =
@@ -6971,12 +7099,13 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
chunks = (struct srv_copychunk *)&ci_req->Chunks[0];
chunk_count = le32_to_cpu(ci_req->ChunkCount);
+ if (chunk_count == 0)
+ goto out;
total_size_written = 0;
/* verify the SRV_COPYCHUNK_COPY packet */
if (chunk_count > ksmbd_server_side_copy_max_chunk_count() ||
- le32_to_cpu(req->InputCount) <
- offsetof(struct copychunk_ioctl_req, Chunks) +
+ input_count < offsetof(struct copychunk_ioctl_req, Chunks) +
chunk_count * sizeof(struct srv_copychunk)) {
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
return -EINVAL;
@@ -6997,9 +7126,7 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
src_fp = ksmbd_lookup_foreign_fd(work,
le64_to_cpu(ci_req->ResumeKey[0]));
- dst_fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ dst_fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id);
ret = -EINVAL;
if (!src_fp ||
src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) {
@@ -7074,8 +7201,8 @@ static __be32 idev_ipv4_address(struct in_device *idev)
}
static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
- struct smb2_ioctl_req *req,
- struct smb2_ioctl_rsp *rsp)
+ struct smb2_ioctl_rsp *rsp,
+ unsigned int out_buf_len)
{
struct network_interface_info_ioctl_rsp *nii_rsp = NULL;
int nbytes = 0;
@@ -7087,6 +7214,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
rtnl_lock();
for_each_netdev(&init_net, netdev) {
+ if (out_buf_len <
+ nbytes + sizeof(struct network_interface_info_ioctl_rsp)) {
+ rtnl_unlock();
+ return -ENOSPC;
+ }
+
if (netdev->type == ARPHRD_LOOPBACK)
continue;
@@ -7166,11 +7299,6 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
if (nii_rsp)
nii_rsp->Next = 0;
- if (!nbytes) {
- rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL;
- return -EINVAL;
- }
-
rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
return nbytes;
@@ -7178,11 +7306,16 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn,
struct validate_negotiate_info_req *neg_req,
- struct validate_negotiate_info_rsp *neg_rsp)
+ struct validate_negotiate_info_rsp *neg_rsp,
+ unsigned int in_buf_len)
{
int ret = 0;
int dialect;
+ if (in_buf_len < sizeof(struct validate_negotiate_info_req) +
+ le16_to_cpu(neg_req->DialectCount) * sizeof(__le16))
+ return -EINVAL;
+
dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects,
neg_req->DialectCount);
if (dialect == BAD_PROT_ID || dialect != conn->dialect) {
@@ -7216,7 +7349,7 @@ err_out:
static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id,
struct file_allocated_range_buffer *qar_req,
struct file_allocated_range_buffer *qar_rsp,
- int in_count, int *out_count)
+ unsigned int in_count, unsigned int *out_count)
{
struct ksmbd_file *fp;
loff_t start, length;
@@ -7243,7 +7376,8 @@ static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id,
}
static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id,
- int out_buf_len, struct smb2_ioctl_req *req,
+ unsigned int out_buf_len,
+ struct smb2_ioctl_req *req,
struct smb2_ioctl_rsp *rsp)
{
struct ksmbd_rpc_command *rpc_resp;
@@ -7357,8 +7491,7 @@ int smb2_ioctl(struct ksmbd_work *work)
{
struct smb2_ioctl_req *req;
struct smb2_ioctl_rsp *rsp, *rsp_org;
- int cnt_code, nbytes = 0;
- int out_buf_len;
+ unsigned int cnt_code, nbytes = 0, out_buf_len, in_buf_len;
u64 id = KSMBD_NO_FID;
struct ksmbd_conn *conn = work->conn;
int ret = 0;
@@ -7386,8 +7519,14 @@ int smb2_ioctl(struct ksmbd_work *work)
}
cnt_code = le32_to_cpu(req->CntCode);
- out_buf_len = le32_to_cpu(req->MaxOutputResponse);
- out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len);
+ ret = smb2_calc_max_out_buf_len(work, 48,
+ le32_to_cpu(req->MaxOutputResponse));
+ if (ret < 0) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ goto out;
+ }
+ out_buf_len = (unsigned int)ret;
+ in_buf_len = le32_to_cpu(req->InputCount);
switch (cnt_code) {
case FSCTL_DFS_GET_REFERRALS:
@@ -7415,6 +7554,7 @@ int smb2_ioctl(struct ksmbd_work *work)
break;
}
case FSCTL_PIPE_TRANSCEIVE:
+ out_buf_len = min_t(u32, KSMBD_IPC_MAX_PAYLOAD, out_buf_len);
nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp);
break;
case FSCTL_VALIDATE_NEGOTIATE_INFO:
@@ -7423,9 +7563,16 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct validate_negotiate_info_req))
+ return -EINVAL;
+
+ if (out_buf_len < sizeof(struct validate_negotiate_info_rsp))
+ return -EINVAL;
+
ret = fsctl_validate_negotiate_info(conn,
(struct validate_negotiate_info_req *)&req->Buffer[0],
- (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]);
+ (struct validate_negotiate_info_rsp *)&rsp->Buffer[0],
+ in_buf_len);
if (ret < 0)
goto out;
@@ -7434,9 +7581,10 @@ int smb2_ioctl(struct ksmbd_work *work)
rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
break;
case FSCTL_QUERY_NETWORK_INTERFACE_INFO:
- nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp);
- if (nbytes < 0)
+ ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len);
+ if (ret < 0)
goto out;
+ nbytes = ret;
break;
case FSCTL_REQUEST_RESUME_KEY:
if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) {
@@ -7461,15 +7609,33 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct copychunk_ioctl_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) {
ret = -EINVAL;
goto out;
}
nbytes = sizeof(struct copychunk_ioctl_rsp);
- fsctl_copychunk(work, req, rsp);
+ rsp->VolatileFileId = req->VolatileFileId;
+ rsp->PersistentFileId = req->PersistentFileId;
+ fsctl_copychunk(work,
+ (struct copychunk_ioctl_req *)&req->Buffer[0],
+ le32_to_cpu(req->CntCode),
+ le32_to_cpu(req->InputCount),
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId),
+ rsp);
break;
case FSCTL_SET_SPARSE:
+ if (in_buf_len < sizeof(struct file_sparse)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = fsctl_set_sparse(work, id,
(struct file_sparse *)&req->Buffer[0]);
if (ret < 0)
@@ -7488,6 +7654,11 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct file_zero_data_information)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
zero_data =
(struct file_zero_data_information *)&req->Buffer[0];
@@ -7507,6 +7678,11 @@ int smb2_ioctl(struct ksmbd_work *work)
break;
}
case FSCTL_QUERY_ALLOCATED_RANGES:
+ if (in_buf_len < sizeof(struct file_allocated_range_buffer)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = fsctl_query_allocated_ranges(work, id,
(struct file_allocated_range_buffer *)&req->Buffer[0],
(struct file_allocated_range_buffer *)&rsp->Buffer[0],
@@ -7547,6 +7723,11 @@ int smb2_ioctl(struct ksmbd_work *work)
struct duplicate_extents_to_file *dup_ext;
loff_t src_off, dst_off, length, cloned;
+ if (in_buf_len < sizeof(struct duplicate_extents_to_file)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0];
fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle,
@@ -7617,6 +7798,8 @@ out:
rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND;
else if (ret == -EOPNOTSUPP)
rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ else if (ret == -ENOSPC)
+ rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL;
else if (ret < 0 || rsp->hdr.Status == 0)
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
smb2_set_err_rsp(work);
@@ -8206,7 +8389,8 @@ void smb3_preauth_hash_rsp(struct ksmbd_work *work)
WORK_BUFFERS(work, req, rsp);
- if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE)
+ if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE &&
+ conn->preauth_info)
ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp,
conn->preauth_info->Preauth_HashValue);
@@ -8310,31 +8494,29 @@ int smb3_decrypt_req(struct ksmbd_work *work)
struct smb2_hdr *hdr;
unsigned int pdu_length = get_rfc1002_len(buf);
struct kvec iov[2];
- unsigned int buf_data_size = pdu_length + 4 -
+ int buf_data_size = pdu_length + 4 -
sizeof(struct smb2_transform_hdr);
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
- unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
int rc = 0;
- sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId));
- if (!sess) {
- pr_err("invalid session id(%llx) in transform header\n",
- le64_to_cpu(tr_hdr->SessionId));
- return -ECONNABORTED;
- }
-
- if (pdu_length + 4 <
- sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) {
+ if (buf_data_size < sizeof(struct smb2_hdr)) {
pr_err("Transform message is too small (%u)\n",
pdu_length);
return -ECONNABORTED;
}
- if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) {
+ if (buf_data_size < le32_to_cpu(tr_hdr->OriginalMessageSize)) {
pr_err("Transform message is broken\n");
return -ECONNABORTED;
}
+ sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId));
+ if (!sess) {
+ pr_err("invalid session id(%llx) in transform header\n",
+ le64_to_cpu(tr_hdr->SessionId));
+ return -ECONNABORTED;
+ }
+
iov[0].iov_base = buf;
iov[0].iov_len = sizeof(struct smb2_transform_hdr);
iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr);
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index bcec845b03f3..ff5a2f01d34a 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -113,6 +113,8 @@
#define SMB21_DEFAULT_IOSIZE (1024 * 1024)
#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
#define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024)
+#define SMB3_MIN_IOSIZE (64 * 1024)
+#define SMB3_MAX_IOSIZE (8 * 1024 * 1024)
/*
* SMB2 Header Definition
@@ -1464,6 +1466,15 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */
char FileName[1];
} __packed; /* level 18 Query */
+struct smb2_file_basic_info { /* data block encoding of response to level 18 */
+ __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 Attributes;
+ __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */
+} __packed;
+
struct smb2_file_alt_name_info {
__le32 FileNameLength;
char FileName[0];
@@ -1628,7 +1639,6 @@ struct smb2_posix_info {
} __packed;
/* functions */
-int init_smb2_0_server(struct ksmbd_conn *conn);
void init_smb2_1_server(struct ksmbd_conn *conn);
void init_smb3_0_server(struct ksmbd_conn *conn);
void init_smb3_02_server(struct ksmbd_conn *conn);
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index 43d3123d8b62..707490ab1f4c 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -21,7 +21,6 @@ static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%";
#define MAGIC_CHAR '~'
#define PERIOD '.'
#define mangle(V) ((char)(basechars[(V) % MANGLE_BASE]))
-#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr))
struct smb_protocol {
int index;
@@ -89,7 +88,7 @@ unsigned int ksmbd_server_side_copy_max_total_size(void)
inline int ksmbd_min_protocol(void)
{
- return SMB2_PROT;
+ return SMB21_PROT;
}
inline int ksmbd_max_protocol(void)
@@ -129,16 +128,22 @@ int ksmbd_lookup_protocol_idx(char *str)
*
* check for valid smb signature and packet direction(request/response)
*
- * Return: 0 on success, otherwise 1
+ * Return: 0 on success, otherwise -EINVAL
*/
int ksmbd_verify_smb_message(struct ksmbd_work *work)
{
- struct smb2_hdr *smb2_hdr = work->request_buf;
+ struct smb2_hdr *smb2_hdr = work->request_buf + work->next_smb2_rcv_hdr_off;
+ struct smb_hdr *hdr;
if (smb2_hdr->ProtocolId == SMB2_PROTO_NUMBER)
return ksmbd_smb2_check_message(work);
- return 0;
+ hdr = work->request_buf;
+ if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER &&
+ hdr->Command == SMB_COM_NEGOTIATE)
+ return 0;
+
+ return -EINVAL;
}
/**
@@ -149,20 +154,7 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work)
*/
bool ksmbd_smb_request(struct ksmbd_conn *conn)
{
- int type = *(char *)conn->request_buf;
-
- switch (type) {
- case RFC1002_SESSION_MESSAGE:
- /* Regular SMB request */
- return true;
- case RFC1002_SESSION_KEEP_ALIVE:
- ksmbd_debug(SMB, "RFC 1002 session keep alive\n");
- break;
- default:
- ksmbd_debug(SMB, "RFC 1002 unknown request type 0x%x\n", type);
- }
-
- return false;
+ return conn->request_buf[0] == 0;
}
static bool supported_protocol(int idx)
@@ -176,10 +168,12 @@ static bool supported_protocol(int idx)
idx <= server_conf.max_protocol);
}
-static char *next_dialect(char *dialect, int *next_off)
+static char *next_dialect(char *dialect, int *next_off, int bcount)
{
dialect = dialect + *next_off;
- *next_off = strlen(dialect);
+ *next_off = strnlen(dialect, bcount);
+ if (dialect[*next_off] != '\0')
+ return NULL;
return dialect;
}
@@ -194,7 +188,9 @@ static int ksmbd_lookup_dialect_by_name(char *cli_dialects, __le16 byte_count)
dialect = cli_dialects;
bcount = le16_to_cpu(byte_count);
do {
- dialect = next_dialect(dialect, &next);
+ dialect = next_dialect(dialect, &next, bcount);
+ if (!dialect)
+ break;
ksmbd_debug(SMB, "client requested dialect %s\n",
dialect);
if (!strcmp(dialect, smb1_protos[i].name)) {
@@ -242,13 +238,22 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count)
static int ksmbd_negotiate_smb_dialect(void *buf)
{
- __le32 proto;
+ int smb_buf_length = get_rfc1002_len(buf);
+ __le32 proto = ((struct smb2_hdr *)buf)->ProtocolId;
- proto = ((struct smb2_hdr *)buf)->ProtocolId;
if (proto == SMB2_PROTO_NUMBER) {
struct smb2_negotiate_req *req;
+ int smb2_neg_size =
+ offsetof(struct smb2_negotiate_req, Dialects) - 4;
req = (struct smb2_negotiate_req *)buf;
+ if (smb2_neg_size > smb_buf_length)
+ goto err_out;
+
+ if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) >
+ smb_buf_length)
+ goto err_out;
+
return ksmbd_lookup_dialect_by_id(req->Dialects,
req->DialectCount);
}
@@ -258,14 +263,22 @@ static int ksmbd_negotiate_smb_dialect(void *buf)
struct smb_negotiate_req *req;
req = (struct smb_negotiate_req *)buf;
+ if (le16_to_cpu(req->ByteCount) < 2)
+ goto err_out;
+
+ if (offsetof(struct smb_negotiate_req, DialectsArray) - 4 +
+ le16_to_cpu(req->ByteCount) > smb_buf_length) {
+ goto err_out;
+ }
+
return ksmbd_lookup_dialect_by_name(req->DialectsArray,
req->ByteCount);
}
+err_out:
return BAD_PROT_ID;
}
-#define SMB_COM_NEGOTIATE 0x72
int ksmbd_init_smb_server(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
@@ -280,11 +293,6 @@ int ksmbd_init_smb_server(struct ksmbd_work *work)
return 0;
}
-bool ksmbd_pdu_size_has_room(unsigned int pdu)
-{
- return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4);
-}
-
int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
struct ksmbd_file *dir,
struct ksmbd_dir_info *d_info,
@@ -419,7 +427,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname,
static int __smb2_negotiate(struct ksmbd_conn *conn)
{
- return (conn->dialect >= SMB20_PROT_ID &&
+ return (conn->dialect >= SMB21_PROT_ID &&
conn->dialect <= SMB311_PROT_ID);
}
@@ -449,7 +457,7 @@ int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command)
}
}
- if (command == SMB2_NEGOTIATE_HE) {
+ if (command == SMB2_NEGOTIATE_HE && __smb2_negotiate(conn)) {
ret = smb2_handle_negotiate(work);
init_smb2_neg_rsp(work);
return ret;
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index 57c667c1be06..6e79e7577f6b 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -48,13 +48,7 @@
#define CIFS_DEFAULT_IOSIZE (64 * 1024)
#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */
-/* RFC 1002 session packet types */
-#define RFC1002_SESSION_MESSAGE 0x00
-#define RFC1002_SESSION_REQUEST 0x81
-#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82
-#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83
-#define RFC1002_RETARGET_SESSION_RESPONSE 0x84
-#define RFC1002_SESSION_KEEP_ALIVE 0x85
+#define MAX_STREAM_PROT_LEN 0x00FFFFFF
/* Responses when opening a file. */
#define F_SUPERSEDED 0
@@ -210,6 +204,7 @@
FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES)
#define SMB1_PROTO_NUMBER cpu_to_le32(0x424d53ff)
+#define SMB_COM_NEGOTIATE 0x72
#define SMB1_CLIENT_GUID_SIZE (16)
struct smb_hdr {
@@ -500,8 +495,6 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count);
int ksmbd_init_smb_server(struct ksmbd_work *work);
-bool ksmbd_pdu_size_has_room(unsigned int pdu);
-
struct ksmbd_kstat;
int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work,
int info_level,
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 0a95cdec8c80..bd792db32623 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -380,7 +380,7 @@ static void parse_dacl(struct user_namespace *user_ns,
{
int i, ret;
int num_aces = 0;
- int acl_size;
+ unsigned int acl_size;
char *acl_base;
struct smb_ace **ppace;
struct posix_acl_entry *cf_pace, *cf_pdace;
@@ -392,7 +392,7 @@ static void parse_dacl(struct user_namespace *user_ns,
return;
/* validate that we do not go past end of acl */
- if (end_of_acl <= (char *)pdacl ||
+ if (end_of_acl < (char *)pdacl + sizeof(struct smb_acl) ||
end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
pr_err("ACL too small to parse DACL\n");
return;
@@ -431,8 +431,22 @@ static void parse_dacl(struct user_namespace *user_ns,
* user/group/other have no permissions
*/
for (i = 0; i < num_aces; ++i) {
+ if (end_of_acl - acl_base < acl_size)
+ break;
+
ppace[i] = (struct smb_ace *)(acl_base + acl_size);
acl_base = (char *)ppace[i];
+ acl_size = offsetof(struct smb_ace, sid) +
+ offsetof(struct smb_sid, sub_auth);
+
+ if (end_of_acl - acl_base < acl_size ||
+ ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES ||
+ (end_of_acl - acl_base <
+ acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) ||
+ (le16_to_cpu(ppace[i]->size) <
+ acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth))
+ break;
+
acl_size = le16_to_cpu(ppace[i]->size);
ppace[i]->access_req =
smb_map_generic_desired_access(ppace[i]->access_req);
@@ -807,6 +821,9 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
if (!pntsd)
return -EIO;
+ if (acl_len < sizeof(struct smb_ntsd))
+ return -EINVAL;
+
owner_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
group_sid_ptr = (struct smb_sid *)((char *)pntsd +
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
index 44aea33a67fa..1acf1892a466 100644
--- a/fs/ksmbd/transport_ipc.c
+++ b/fs/ksmbd/transport_ipc.c
@@ -601,7 +601,7 @@ int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
return ret;
}
-int ksmbd_ipc_logout_request(const char *account)
+int ksmbd_ipc_logout_request(const char *account, int flags)
{
struct ksmbd_ipc_msg *msg;
struct ksmbd_logout_request *req;
@@ -616,6 +616,7 @@ int ksmbd_ipc_logout_request(const char *account)
msg->type = KSMBD_EVENT_LOGOUT_REQUEST;
req = (struct ksmbd_logout_request *)msg->payload;
+ req->account_flags = flags;
strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ);
ret = ipc_msg_send(msg);
diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h
index 9eacc895ffdb..5e5b90a0c187 100644
--- a/fs/ksmbd/transport_ipc.h
+++ b/fs/ksmbd/transport_ipc.h
@@ -25,7 +25,7 @@ ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess,
struct sockaddr *peer_addr);
int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
unsigned long long connect_id);
-int ksmbd_ipc_logout_request(const char *account);
+int ksmbd_ipc_logout_request(const char *account, int flags);
struct ksmbd_share_config_response *
ksmbd_ipc_share_config_request(const char *name);
struct ksmbd_spnego_authen_response *
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 52b2556e76b1..a2fd5a4d4cd5 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -20,7 +20,6 @@
#define SUBMOD_NAME "smb_direct"
#include <linux/kthread.h>
-#include <linux/rwlock.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/highmem.h>
@@ -550,6 +549,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
switch (recvmsg->type) {
case SMB_DIRECT_MSG_NEGOTIATE_REQ:
+ if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
t->negotiation_requested = true;
t->full_packet_received = true;
wake_up_interruptible(&t->wait_status);
@@ -557,10 +560,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
- int data_length = le32_to_cpu(data_transfer->data_length);
+ unsigned int data_length;
int avail_recvmsg_count, receive_credits;
+ if (wc->byte_len <
+ offsetof(struct smb_direct_data_transfer, padding)) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
+
+ data_length = le32_to_cpu(data_transfer->data_length);
if (data_length) {
+ if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
+ (u64)data_length) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
+
if (t->full_packet_received)
recvmsg->first_segment = true;
@@ -569,7 +585,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
else
t->full_packet_received = true;
- enqueue_reassembly(t, recvmsg, data_length);
+ enqueue_reassembly(t, recvmsg, (int)data_length);
wake_up_interruptible(&t->wait_reassembly_queue);
spin_lock(&t->receive_credit_lock);
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index dc15a5ecd2e0..c14320e03b69 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -215,7 +215,7 @@ out_error:
* ksmbd_kthread_fn() - listen to new SMB connections and callback server
* @p: arguments to forker thread
*
- * Return: Returns a task_struct or ERR_PTR
+ * Return: 0 on success, error number otherwise
*/
static int ksmbd_kthread_fn(void *p)
{
@@ -387,7 +387,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket)
/**
* create_socket - create socket for ksmbd/0
*
- * Return: Returns a task_struct or ERR_PTR
+ * Return: 0 on success, error number otherwise
*/
static int create_socket(struct interface *iface)
{
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index b047f2980d96..835b384b0895 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -19,6 +19,8 @@
#include <linux/sched/xacct.h>
#include <linux/crc32c.h>
+#include "../internal.h" /* for vfs_path_lookup */
+
#include "glob.h"
#include "oplock.h"
#include "connection.h"
@@ -44,7 +46,6 @@ static char *extract_last_component(char *path)
p++;
} else {
p = NULL;
- pr_err("Invalid path %s\n", path);
}
return p;
}
@@ -155,7 +156,7 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns,
/**
* ksmbd_vfs_create() - vfs helper for smb create file
* @work: work
- * @name: file name
+ * @name: file name that is relative to share
* @mode: file create mode
*
* Return: 0 on success, otherwise error
@@ -166,7 +167,8 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
struct dentry *dentry;
int err;
- dentry = kern_path_create(AT_FDCWD, name, &path, 0);
+ dentry = ksmbd_vfs_kern_path_create(work, name,
+ LOOKUP_NO_SYMLINKS, &path);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
if (err != -ENOENT)
@@ -191,7 +193,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
/**
* ksmbd_vfs_mkdir() - vfs helper for smb create directory
* @work: work
- * @name: directory name
+ * @name: directory name that is relative to share
* @mode: directory create mode
*
* Return: 0 on success, otherwise error
@@ -203,7 +205,9 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
struct dentry *dentry;
int err;
- dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY);
+ dentry = ksmbd_vfs_kern_path_create(work, name,
+ LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
+ &path);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
if (err != -EEXIST)
@@ -578,7 +582,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
/**
* ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink
- * @name: absolute directory or file name
+ * @name: directory or file name that is relative to share
*
* Return: 0 on success, otherwise error
*/
@@ -588,16 +592,11 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
struct path path;
struct dentry *parent;
int err;
- int flags = 0;
if (ksmbd_override_fsids(work))
return -ENOMEM;
- if (test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
- flags = LOOKUP_FOLLOW;
-
- err = kern_path(name, flags, &path);
+ err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false);
if (err) {
ksmbd_debug(VFS, "can't get %s, err %d\n", name, err);
ksmbd_revert_fsids(work);
@@ -642,7 +641,7 @@ out_err:
/**
* ksmbd_vfs_link() - vfs helper for creating smb hardlink
* @oldname: source file name
- * @newname: hardlink name
+ * @newname: hardlink name that is relative to share
*
* Return: 0 on success, otherwise error
*/
@@ -652,24 +651,20 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
struct path oldpath, newpath;
struct dentry *dentry;
int err;
- int flags = 0;
if (ksmbd_override_fsids(work))
return -ENOMEM;
- if (test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
- flags = LOOKUP_FOLLOW;
-
- err = kern_path(oldname, flags, &oldpath);
+ err = kern_path(oldname, LOOKUP_NO_SYMLINKS, &oldpath);
if (err) {
pr_err("cannot get linux path for %s, err = %d\n",
oldname, err);
goto out1;
}
- dentry = kern_path_create(AT_FDCWD, newname, &newpath,
- flags | LOOKUP_REVAL);
+ dentry = ksmbd_vfs_kern_path_create(work, newname,
+ LOOKUP_NO_SYMLINKS | LOOKUP_REVAL,
+ &newpath);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
pr_err("path create err for %s, err %d\n", newname, err);
@@ -788,21 +783,19 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
struct dentry *src_dent, *trap_dent, *src_child;
char *dst_name;
int err;
- int flags;
dst_name = extract_last_component(newname);
- if (!dst_name)
- return -EINVAL;
+ if (!dst_name) {
+ dst_name = newname;
+ newname = "";
+ }
src_dent_parent = dget_parent(fp->filp->f_path.dentry);
src_dent = fp->filp->f_path.dentry;
- flags = LOOKUP_DIRECTORY;
- if (test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
- flags |= LOOKUP_FOLLOW;
-
- err = kern_path(newname, flags, &dst_path);
+ err = ksmbd_vfs_kern_path(work, newname,
+ LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
+ &dst_path, false);
if (err) {
ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err);
goto out;
@@ -848,61 +841,43 @@ out:
/**
* ksmbd_vfs_truncate() - vfs helper for smb file truncate
* @work: work
- * @name: old filename
* @fid: file id of old file
* @size: truncate to given size
*
* Return: 0 on success, otherwise error
*/
-int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name,
+int ksmbd_vfs_truncate(struct ksmbd_work *work,
struct ksmbd_file *fp, loff_t size)
{
- struct path path;
int err = 0;
+ struct file *filp;
- if (name) {
- err = kern_path(name, 0, &path);
- if (err) {
- pr_err("cannot get linux path for %s, err %d\n",
- name, err);
- return err;
- }
- err = vfs_truncate(&path, size);
- if (err)
- pr_err("truncate failed for %s err %d\n",
- name, err);
- path_put(&path);
- } else {
- struct file *filp;
-
- filp = fp->filp;
-
- /* Do we need to break any of a levelII oplock? */
- smb_break_all_levII_oplock(work, fp, 1);
+ filp = fp->filp;
- if (!work->tcon->posix_extensions) {
- struct inode *inode = file_inode(filp);
+ /* Do we need to break any of a levelII oplock? */
+ smb_break_all_levII_oplock(work, fp, 1);
- if (size < inode->i_size) {
- err = check_lock_range(filp, size,
- inode->i_size - 1, WRITE);
- } else {
- err = check_lock_range(filp, inode->i_size,
- size - 1, WRITE);
- }
+ if (!work->tcon->posix_extensions) {
+ struct inode *inode = file_inode(filp);
- if (err) {
- pr_err("failed due to lock\n");
- return -EAGAIN;
- }
+ if (size < inode->i_size) {
+ err = check_lock_range(filp, size,
+ inode->i_size - 1, WRITE);
+ } else {
+ err = check_lock_range(filp, inode->i_size,
+ size - 1, WRITE);
}
- err = vfs_truncate(&filp->f_path, size);
- if (err)
- pr_err("truncate failed for filename : %s err %d\n",
- fp->filename, err);
+ if (err) {
+ pr_err("failed due to lock\n");
+ return -EAGAIN;
+ }
}
+ err = vfs_truncate(&filp->f_path, size);
+ if (err)
+ pr_err("truncate failed for filename : %s err %d\n",
+ fp->filename, err);
return err;
}
@@ -1048,7 +1023,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
struct file_allocated_range_buffer *ranges,
- int in_count, int *out_count)
+ unsigned int in_count, unsigned int *out_count)
{
struct file *f = fp->filp;
struct inode *inode = file_inode(fp->filp);
@@ -1220,22 +1195,25 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
/**
* ksmbd_vfs_kern_path() - lookup a file and get path info
- * @name: name of file for lookup
+ * @name: file path that is relative to share
* @flags: lookup flags
* @path: if lookup succeed, return path info
* @caseless: caseless filename lookup
*
* Return: 0 on success, otherwise error
*/
-int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path,
- bool caseless)
+int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
+ unsigned int flags, struct path *path, bool caseless)
{
+ struct ksmbd_share_config *share_conf = work->tcon->share_conf;
int err;
- if (name[0] != '/')
- return -EINVAL;
-
- err = kern_path(name, flags, path);
+ flags |= LOOKUP_BENEATH;
+ err = vfs_path_lookup(share_conf->vfs_path.dentry,
+ share_conf->vfs_path.mnt,
+ name,
+ flags,
+ path);
if (!err)
return 0;
@@ -1249,11 +1227,10 @@ int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path,
return -ENOMEM;
path_len = strlen(filepath);
- remain_len = path_len - 1;
+ remain_len = path_len;
- err = kern_path("/", flags, &parent);
- if (err)
- goto out;
+ parent = share_conf->vfs_path;
+ path_get(&parent);
while (d_can_lookup(parent.dentry)) {
char *filename = filepath + path_len - remain_len;
@@ -1266,21 +1243,21 @@ int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path,
err = ksmbd_vfs_lookup_in_dir(&parent, filename,
filename_len);
- if (err) {
- path_put(&parent);
+ path_put(&parent);
+ if (err)
goto out;
- }
- path_put(&parent);
next[0] = '\0';
- err = kern_path(filepath, flags, &parent);
+ err = vfs_path_lookup(share_conf->vfs_path.dentry,
+ share_conf->vfs_path.mnt,
+ filepath,
+ flags,
+ &parent);
if (err)
goto out;
-
- if (is_last) {
- path->mnt = parent.mnt;
- path->dentry = parent.dentry;
+ else if (is_last) {
+ *path = parent;
goto out;
}
@@ -1296,6 +1273,23 @@ out:
return err;
}
+struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
+ const char *name,
+ unsigned int flags,
+ struct path *path)
+{
+ char *abs_name;
+ struct dentry *dent;
+
+ abs_name = convert_to_unix_name(work->tcon->share_conf, name);
+ if (!abs_name)
+ return ERR_PTR(-ENOMEM);
+
+ dent = kern_path_create(AT_FDCWD, abs_name, path, flags);
+ kfree(abs_name);
+ return dent;
+}
+
int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns,
struct dentry *dentry)
{
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 85db50abdb24..b0d5b8feb4a3 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -126,7 +126,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work,
int ksmbd_vfs_getattr(struct path *path, struct kstat *stat);
int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
char *newname);
-int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name,
+int ksmbd_vfs_truncate(struct ksmbd_work *work,
struct ksmbd_file *fp, loff_t size);
struct srv_copychunk;
int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
@@ -152,8 +152,13 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns,
struct dentry *dentry, char *attr_name);
-int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path,
+int ksmbd_vfs_kern_path(struct ksmbd_work *work,
+ char *name, unsigned int flags, struct path *path,
bool caseless);
+struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
+ const char *name,
+ unsigned int flags,
+ struct path *path);
int ksmbd_vfs_empty_dir(struct ksmbd_file *fp);
void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option);
int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
@@ -161,7 +166,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
struct file_allocated_range_buffer;
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
struct file_allocated_range_buffer *ranges,
- int in_count, int *out_count);
+ unsigned int in_count, unsigned int *out_count);
int ksmbd_vfs_unlink(struct user_namespace *user_ns,
struct dentry *dir, struct dentry *dentry);
void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
diff --git a/fs/lockd/svcxdr.h b/fs/lockd/svcxdr.h
index c69a0bb76c94..4f1a451da5ba 100644
--- a/fs/lockd/svcxdr.h
+++ b/fs/lockd/svcxdr.h
@@ -134,18 +134,9 @@ svcxdr_decode_owner(struct xdr_stream *xdr, struct xdr_netobj *obj)
static inline bool
svcxdr_encode_owner(struct xdr_stream *xdr, const struct xdr_netobj *obj)
{
- unsigned int quadlen = XDR_QUADLEN(obj->len);
- __be32 *p;
-
- if (xdr_stream_encode_u32(xdr, obj->len) < 0)
- return false;
- p = xdr_reserve_space(xdr, obj->len);
- if (!p)
+ if (obj->len > XDR_MAX_NETOBJ)
return false;
- p[quadlen - 1] = 0; /* XDR pad */
- memcpy(p, obj->data, obj->len);
-
- return true;
+ return xdr_stream_encode_opaque(xdr, obj->data, obj->len) > 0;
}
#endif /* _LOCKD_SVCXDR_H_ */
diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..0fca9d680978 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2,117 +2,11 @@
/*
* linux/fs/locks.c
*
- * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
- * Doug Evans (dje@spiff.uucp), August 07, 1992
+ * We implement four types of file locks: BSD locks, posix locks, open
+ * file description locks, and leases. For details about BSD locks,
+ * see the flock(2) man page; for details about the other three, see
+ * fcntl(2).
*
- * Deadlock detection added.
- * FIXME: one thing isn't handled yet:
- * - mandatory locks (requires lots of changes elsewhere)
- * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
- *
- * Miscellaneous edits, and a total rewrite of posix_lock_file() code.
- * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
- * Converted file_lock_table to a linked list from an array, which eliminates
- * the limits on how many active file locks are open.
- * Chad Page (pageone@netcom.com), November 27, 1994
- *
- * Removed dependency on file descriptors. dup()'ed file descriptors now
- * get the same locks as the original file descriptors, and a close() on
- * any file descriptor removes ALL the locks on the file for the current
- * process. Since locks still depend on the process id, locks are inherited
- * after an exec() but not after a fork(). This agrees with POSIX, and both
- * BSD and SVR4 practice.
- * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
- *
- * Scrapped free list which is redundant now that we allocate locks
- * dynamically with kmalloc()/kfree().
- * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
- *
- * Implemented two lock personalities - FL_FLOCK and FL_POSIX.
- *
- * FL_POSIX locks are created with calls to fcntl() and lockf() through the
- * fcntl() system call. They have the semantics described above.
- *
- * FL_FLOCK locks are created with calls to flock(), through the flock()
- * system call, which is new. Old C libraries implement flock() via fcntl()
- * and will continue to use the old, broken implementation.
- *
- * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
- * with a file pointer (filp). As a result they can be shared by a parent
- * process and its children after a fork(). They are removed when the last
- * file descriptor referring to the file pointer is closed (unless explicitly
- * unlocked).
- *
- * FL_FLOCK locks never deadlock, an existing lock is always removed before
- * upgrading from shared to exclusive (or vice versa). When this happens
- * any processes blocked by the current lock are woken up and allowed to
- * run before the new lock is applied.
- * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
- *
- * Removed some race conditions in flock_lock_file(), marked other possible
- * races. Just grep for FIXME to see them.
- * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
- *
- * Addressed Dmitry's concerns. Deadlock checking no longer recursive.
- * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
- * once we've checked for blocking and deadlocking.
- * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
- *
- * Initial implementation of mandatory locks. SunOS turned out to be
- * a rotten model, so I implemented the "obvious" semantics.
- * See 'Documentation/filesystems/mandatory-locking.rst' for details.
- * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
- *
- * Don't allow mandatory locks on mmap()'ed files. Added simple functions to
- * check if a file has mandatory locks, used by mmap(), open() and creat() to
- * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
- * Manual, Section 2.
- * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
- *
- * Tidied up block list handling. Added '/proc/locks' interface.
- * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
- *
- * Fixed deadlock condition for pathological code that mixes calls to
- * flock() and fcntl().
- * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
- *
- * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
- * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
- * guarantee sensible behaviour in the case where file system modules might
- * be compiled with different options than the kernel itself.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
- * (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
- * locks. Changed process synchronisation to avoid dereferencing locks that
- * have already been freed.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
- *
- * Made the block list a circular list to minimise searching in the list.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
- *
- * Made mandatory locking a mount option. Default is not to allow mandatory
- * locking.
- * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
- *
- * Some adaptations for NFS support.
- * Olaf Kirch (okir@monad.swb.de), Dec 1996,
- *
- * Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
- * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
- *
- * Use slab allocator instead of kmalloc/kfree.
- * Use generic list implementation from <linux/list.h>.
- * Sped up posix_locks_deadlock by only considering blocked locks.
- * Matthew Wilcox <willy@debian.org>, March, 2000.
- *
- * Leases and LOCK_MAND
- * Matthew Wilcox <willy@debian.org>, June, 2000.
- * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
*
* Locking conflicts and dependencies:
* If multiple threads attempt to lock the same byte (or flock the same file)
@@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
}
static inline int flock_translate_cmd(int cmd) {
- if (cmd & LOCK_MAND)
- return cmd & (LOCK_MAND | LOCK_RW);
switch (cmd) {
case LOCK_SH:
return F_RDLCK;
@@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl,
*/
if (caller_fl->fl_file == sys_fl->fl_file)
return false;
- if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
- return false;
return locks_conflict(caller_fl, sys_fl);
}
@@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
* - %LOCK_SH -- a shared lock.
* - %LOCK_EX -- an exclusive lock.
* - %LOCK_UN -- remove an existing lock.
- * - %LOCK_MAND -- a 'mandatory' flock.
- * This exists to emulate Windows Share Modes.
+ * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
*
- * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
- * processes read and write access respectively.
+ * %LOCK_MAND support has been removed from the kernel.
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
@@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
cmd &= ~LOCK_NB;
unlock = (cmd == LOCK_UN);
- if (!unlock && !(cmd & LOCK_MAND) &&
- !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ goto out_putf;
+
+ /*
+ * LOCK_MAND locks were broken for a long time in that they never
+ * conflicted with one another and didn't prevent any sort of open,
+ * read or write activity.
+ *
+ * Just ignore these requests now, to preserve legacy behavior, but
+ * throw a warning to let people know that they don't actually work.
+ */
+ if (cmd & LOCK_MAND) {
+ pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ error = 0;
goto out_putf;
+ }
lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock)) {
@@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
struct inode *inode = NULL;
unsigned int fl_pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+ int type;
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
@@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
- if (fl->fl_type & LOCK_MAND) {
- seq_puts(f, "FLOCK MSNFS ");
- } else {
- seq_puts(f, "FLOCK ADVISORY ");
- }
+ seq_puts(f, "FLOCK ADVISORY ");
} else if (IS_LEASE(fl)) {
if (fl->fl_flags & FL_DELEG)
seq_puts(f, "DELEG ");
@@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
- if (fl->fl_type & LOCK_MAND) {
- seq_printf(f, "%s ",
- (fl->fl_type & LOCK_READ)
- ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ "
- : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
- } else {
- int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+ type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
- seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
- (type == F_RDLCK) ? "READ" : "UNLCK");
- }
+ seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+ (type == F_RDLCK) ? "READ" : "UNLCK");
if (inode) {
/* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
diff --git a/fs/namei.c b/fs/namei.c
index 1946d9667790..1f9d2187c765 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
int error = get_write_access(inode);
if (error)
return error;
- /*
- * Refuse to truncate files with mandatory locks held on them.
- */
+
error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 0b6cd3b8734c..994ec22d4040 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -150,7 +150,7 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
{
struct iov_iter iter;
- iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
iov_iter_zero(iov_iter_count(&iter), &iter);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index acb1d22907da..5e56da748b2a 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
@@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return PTR_ERR(bdev);
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2e894fec036b..7a5f287c5391 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
res = (long) dreq->count;
WARN_ON_ONCE(dreq->count < 0);
}
- dreq->iocb->ki_complete(dreq->iocb, res, 0);
+ dreq->iocb->ki_complete(dreq->iocb, res);
}
complete(&dreq->completion);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index aa353fd58240..24e7dccce355 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /*
- * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
- * any standard. In principle we might be able to support LOCK_MAND
- * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
- * NFS code is not set up for it.
- */
- if (fl->fl_type & LOCK_MAND)
- return -EINVAL;
-
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e1214bb6b7ee..459860aa8fd7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -127,7 +127,8 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
return NULL;
err = security_dentry_init_security(dentry, sattr->ia_mode,
- &dentry->d_name, (void **)&label->label, &label->len);
+ &dentry->d_name, NULL,
+ (void **)&label->label, &label->len);
if (err == 0)
return label;
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index edec45831585..0a9b72685f98 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -42,7 +42,6 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
/**
* locks_end_grace
- * @net: net namespace that this lock manager belongs to
* @lm: who this grace period is for
*
* Call this function to state that the given lock manager is ready to
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 6e9ea4ee0f73..3d1d17256a91 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
select EXPORTFS_BLOCK_OPS
- select SCSI_COMMON
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c99dee99a3c1..e5c0982a381d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -9,9 +9,6 @@
#include <linux/pr.h>
#include <linux/nfsd/debug.h>
-#include <scsi/scsi_proto.h>
-#include <scsi/scsi_common.h>
-#include <scsi/scsi_request.h>
#include "blocklayoutxdr.h"
#include "pnfs.h"
@@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = {
#endif /* CONFIG_NFSD_BLOCKLAYOUT */
#ifdef CONFIG_NFSD_SCSILAYOUT
-static int nfsd4_scsi_identify_device(struct block_device *bdev,
- struct pnfs_block_volume *b)
-{
- struct request_queue *q = bdev->bd_disk->queue;
- struct request *rq;
- struct scsi_request *req;
- /*
- * The allocation length (passed in bytes 3 and 4 of the INQUIRY
- * command descriptor block) specifies the number of bytes that have
- * been allocated for the data-in buffer.
- * 252 is the highest one-byte value that is a multiple of 4.
- * 65532 is the highest two-byte value that is a multiple of 4.
- */
- size_t bufflen = 252, maxlen = 65532, len, id_len;
- u8 *buf, *d, type, assoc;
- int retries = 1, error;
-
- if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
- return -EINVAL;
-
-again:
- buf = kzalloc(bufflen, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq)) {
- error = -ENOMEM;
- goto out_free_buf;
- }
- req = scsi_req(rq);
-
- error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
- if (error)
- goto out_put_request;
-
- req->cmd[0] = INQUIRY;
- req->cmd[1] = 1;
- req->cmd[2] = 0x83;
- req->cmd[3] = bufflen >> 8;
- req->cmd[4] = bufflen & 0xff;
- req->cmd_len = COMMAND_SIZE(INQUIRY);
-
- blk_execute_rq(NULL, rq, 1);
- if (req->result) {
- pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- req->result);
- error = -EIO;
- goto out_put_request;
- }
-
- len = (buf[2] << 8) + buf[3] + 4;
- if (len > bufflen) {
- if (len <= maxlen && retries--) {
- blk_put_request(rq);
- kfree(buf);
- bufflen = len;
- goto again;
- }
- pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
- len);
- goto out_put_request;
- }
-
- d = buf + 4;
- for (d = buf + 4; d < buf + len; d += id_len + 4) {
- id_len = d[3];
- type = d[1] & 0xf;
- assoc = (d[1] >> 4) & 0x3;
-
- /*
- * We only care about a EUI-64 and NAA designator types
- * with LU association.
- */
- if (assoc != 0x00)
- continue;
- if (type != 0x02 && type != 0x03)
- continue;
- if (id_len != 8 && id_len != 12 && id_len != 16)
- continue;
-
- b->scsi.code_set = PS_CODE_SET_BINARY;
- b->scsi.designator_type = type == 0x02 ?
- PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
- b->scsi.designator_len = id_len;
- memcpy(b->scsi.designator, d + 4, id_len);
-
- /*
- * If we found a 8 or 12 byte descriptor continue on to
- * see if a 16 byte one is available. If we find a
- * 16 byte descriptor we're done.
- */
- if (id_len == 16)
- break;
- }
-
-out_put_request:
- blk_put_request(rq);
-out_free_buf:
- kfree(buf);
- return error;
-}
-
#define NFSD_MDS_PR_KEY 0x0100000000000000ULL
/*
@@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
}
+static const u8 designator_types[] = {
+ PS_DESIGNATOR_EUI64,
+ PS_DESIGNATOR_NAA,
+};
+
+static int
+nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b)
+{
+ int ret, i;
+
+ for (i = 0; i < ARRAY_SIZE(designator_types); i++) {
+ u8 type = designator_types[i];
+
+ ret = disk->fops->get_unique_id(disk, b->scsi.designator, type);
+ if (ret > 0) {
+ b->scsi.code_set = PS_CODE_SET_BINARY;
+ b->scsi.designator_type = type;
+ b->scsi.designator_len = ret;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
static int
nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct nfs4_client *clp,
@@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
const struct pr_ops *ops;
- int error;
+ int ret;
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
sizeof(struct pnfs_block_volume), GFP_KERNEL);
@@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
b->type = PNFS_BLOCK_VOLUME_SCSI;
b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
- error = nfsd4_scsi_identify_device(sb->s_bdev, b);
- if (error)
- return error;
+ ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b);
+ if (ret < 0)
+ goto out_free_dev;
+ ret = -EINVAL;
ops = sb->s_bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: device %s does not support PRs.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
- if (error) {
+ ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+ if (ret) {
pr_err("pNFS: failed to register key for device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+ ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
- if (error) {
+ if (ret) {
pr_err("pNFS: failed to reserve device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
return 0;
+
+out_free_dev:
+ kfree(dev);
+ return ret;
}
static __be32
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 7629248fdd53..fdf89fcf1a0c 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -542,7 +542,7 @@ nfsd_file_close_inode_sync(struct inode *inode)
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
* Walk the whole hash bucket, looking for any files that correspond to "inode".
@@ -602,6 +602,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *name, u32 cookie)
{
+ if (WARN_ON_ONCE(!inode))
+ return 0;
+
trace_nfsd_file_fsnotify_handle_event(inode, mask);
/* Should be no marks on non-regular files */
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index a97873f2d22b..6d1b5bb051c5 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
#ifdef CONFIG_NFSD_SCSILAYOUT
if (sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks &&
- sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops &&
- blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue))
+ sb->s_bdev &&
+ sb->s_bdev->bd_disk->fops->pr_ops &&
+ sb->s_bdev->bd_disk->fops->get_unique_id)
exp->ex_layout_types |= 1 << LAYOUT_SCSI;
#endif
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 42356416f0a0..3f4027a5de88 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3570,7 +3570,7 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
}
static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst,
- struct nfsd4_session *session, u32 req)
+ struct nfsd4_session *session, u32 req, struct nfsd4_conn **conn)
{
struct nfs4_client *clp = session->se_client;
struct svc_xprt *xpt = rqst->rq_xprt;
@@ -3593,6 +3593,8 @@ static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst,
else
status = nfserr_inval;
spin_unlock(&clp->cl_lock);
+ if (status == nfs_ok && conn)
+ *conn = c;
return status;
}
@@ -3617,8 +3619,16 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
status = nfserr_wrong_cred;
if (!nfsd4_mach_creds_match(session->se_client, rqstp))
goto out;
- status = nfsd4_match_existing_connection(rqstp, session, bcts->dir);
- if (status == nfs_ok || status == nfserr_inval)
+ status = nfsd4_match_existing_connection(rqstp, session,
+ bcts->dir, &conn);
+ if (status == nfs_ok) {
+ if (bcts->dir == NFS4_CDFC4_FORE_OR_BOTH ||
+ bcts->dir == NFS4_CDFC4_BACK)
+ conn->cn_flags |= NFS4_CDFC4_BACK;
+ nfsd4_probe_callback(session->se_client);
+ goto out;
+ }
+ if (status == nfserr_inval)
goto out;
status = nfsd4_map_bcts_dir(&bcts->dir);
if (status)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7abeccb975b2..cf030ebe2827 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3544,15 +3544,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
goto fail;
cd->rd_maxcount -= entry_bytes;
/*
- * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
- * let's always let through the first entry, at least:
+ * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and
+ * notes that it could be zero. If it is zero, then the server
+ * should enforce only the rd_maxcount value.
*/
- if (!cd->rd_dircount)
- goto fail;
- name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
- if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
- goto fail;
- cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+ if (cd->rd_dircount) {
+ name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
+ if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+ goto fail;
+ cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+ if (!cd->rd_dircount)
+ cd->rd_maxcount = 0;
+ }
cd->cookie_offset = cookie_offset;
skip_entry:
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c2c3d9077dc5..070e5dd03e26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -793,7 +793,10 @@ out_close:
svc_xprt_put(xprt);
}
out_err:
- nfsd_destroy(net);
+ if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+ nn->nfsd_serv->sv_nrthreads--;
+ else
+ nfsd_destroy(net);
return err;
}
@@ -1545,7 +1548,7 @@ static int __init init_nfsd(void)
goto out_free_all;
return 0;
out_free_all:
- unregister_pernet_subsys(&nfsd_net_ops);
+ unregister_filesystem(&nfsd_fs_type);
out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index adf3bb0a8048..6ce8617b562d 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * alloc.c - NILFS dat/inode allocator
+ * NILFS dat/inode allocator
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 0303c3968cee..b667e869ac07 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
+ * Persistent object (dat entry/disk inode) allocator/deallocator
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 5900879d5693..798a2c1b38c6 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * bmap.c - NILFS block mapping.
+ * NILFS block mapping.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 2c63858e81c9..608168a5cb88 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * bmap.h - NILFS block mapping.
+ * NILFS block mapping.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4391fd3abd8f..66bdaa2cf496 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * btnode.c - NILFS B-tree node cache
+ * NILFS B-tree node cache
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 0f88dbc9bcb3..11663650add7 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * btnode.h - NILFS B-tree node cache
+ * NILFS B-tree node cache
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index ab9ec073330f..3594eabe1419 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * btree.c - NILFS B-tree.
+ * NILFS B-tree.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index d1421b646ce4..92868e1a48ca 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * btree.h - NILFS B-tree.
+ * NILFS B-tree.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index ce144776b4ef..9ebefb3acb0e 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * cpfile.c - NILFS checkpoint file.
+ * NILFS checkpoint file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 6336222df24a..edabb2dc5756 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * cpfile.h - NILFS checkpoint file.
+ * NILFS checkpoint file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8bccdf1158fc..dc51d3b7a7bf 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * dat.c - NILFS disk address translation.
+ * NILFS disk address translation.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index b17ee34580ae..468c82d26183 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * dat.h - NILFS disk address translation.
+ * NILFS disk address translation.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 81394e22d0a0..f8f4c2ff52f4 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * dir.c - NILFS directory entry operations
+ * NILFS directory entry operations
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index f353101955e3..a35f2795b242 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * direct.c - NILFS direct block pointer.
+ * NILFS direct block pointer.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index ec9a23c77994..b7ca896269af 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * direct.h - NILFS direct block pointer.
+ * NILFS direct block pointer.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7cf765258fda..a265d391ffe9 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * file.c - NILFS regular file handling primitives including fsync().
+ * NILFS regular file handling primitives including fsync().
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 448320496856..a8f5315f01e3 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * gcinode.c - dummy inodes to buffer blocks for garbage collection
+ * Dummy inodes to buffer blocks for garbage collection
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 02727ed3a7c6..a8a4bc8490b4 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * ifile.c - NILFS inode file
+ * NILFS inode file
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index a1e1e5711a05..35c5273f4821 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * ifile.h - NILFS inode file
+ * NILFS inode file
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2e8eb263cf0f..e3d807d5b83a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * inode.c - NILFS inode operations.
+ * NILFS inode operations.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 640ac8fe891e..fec194a666f4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * ioctl.c - NILFS ioctl operations.
+ * NILFS ioctl operations.
*
* Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
*
@@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
goto out;
ret = -ERANGE;
- if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+ if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev))
goto out;
segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 97769fe4d588..4b3d33cf0041 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * mdt.c - meta data file for NILFS
+ * Meta data file for NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index e77aea4bb921..8f86080a436d 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * mdt.h - NILFS meta data file prototype and definitions
+ * NILFS meta data file prototype and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 91eebeb0c48b..23899e0ae850 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * namei.c - NILFS pathname lookup operations.
+ * NILFS pathname lookup operations.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 60b21b6eeac0..a7b81755c350 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * nilfs.h - NILFS local header file.
+ * NILFS local header file.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 171fb5cd427f..bc3e2cd4117f 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * page.c - buffer/page management specific to NILFS
+ * Buffer/page management specific to NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 62b9bb469e92..569263b23c0c 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * page.h - buffer/page management specific to NILFS
+ * Buffer/page management specific to NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 2217f904a7cf..9e2ed76c0f25 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * recovery.c - NILFS recovery logic
+ * NILFS recovery logic
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 56872e93823d..43287b0d3e9b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * segbuf.c - NILFS segment buffer
+ * NILFS segment buffer
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 9bea1bd59041..e20091ededba 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * segbuf.h - NILFS Segment buffer prototypes and definitions
+ * NILFS Segment buffer prototypes and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 686c8ee7b29c..85a853334771 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * segment.c - NILFS segment constructor.
+ * NILFS segment constructor.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index f5cf5308f3fc..1060f72ebf5a 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * segment.h - NILFS Segment constructor prototypes and definitions
+ * NILFS Segment constructor prototypes and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 63722475e17e..e385cca2004a 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * sufile.c - NILFS segment usage file.
+ * NILFS segment usage file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index c4e2c7a7add1..8e8a1a5a0402 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * sufile.h - NILFS segment usage file.
+ * NILFS segment usage file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f6b2d280aab5..63e5fa74016c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * super.c - NILFS module and super block management.
+ * NILFS module and super block management.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
int ret;
ret = -ERANGE;
- devsize = i_size_read(sb->s_bdev->bd_inode);
+ devsize = bdev_nr_bytes(sb->s_bdev);
if (newsize > devsize)
goto out;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 62f8a7ac19c8..81f35c5b5a40 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * sysfs.c - sysfs support implementation.
+ * Sysfs support implementation.
*
* Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
* Copyright (C) 2014 HGST, Inc., a Western Digital Company.
@@ -95,7 +95,7 @@ static ssize_t
nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)atomic64_read(&root->inodes_count));
}
@@ -103,7 +103,7 @@ static ssize_t
nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)atomic64_read(&root->blocks_count));
}
@@ -116,7 +116,7 @@ static ssize_t
nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, snapshot_readme_str);
+ return sysfs_emit(buf, snapshot_readme_str);
}
NILFS_SNAPSHOT_RO_ATTR(inodes_count);
@@ -217,7 +217,7 @@ static ssize_t
nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str);
+ return sysfs_emit(buf, mounted_snapshots_readme_str);
}
NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README);
@@ -255,7 +255,7 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
ncheckpoints = cpstat.cs_ncps;
- return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints);
+ return sysfs_emit(buf, "%llu\n", ncheckpoints);
}
static ssize_t
@@ -278,7 +278,7 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
nsnapshots = cpstat.cs_nsss;
- return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots);
+ return sysfs_emit(buf, "%llu\n", nsnapshots);
}
static ssize_t
@@ -292,7 +292,7 @@ nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr,
last_cno = nilfs->ns_last_cno;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+ return sysfs_emit(buf, "%llu\n", last_cno);
}
static ssize_t
@@ -306,7 +306,7 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
cno = nilfs->ns_cno;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+ return sysfs_emit(buf, "%llu\n", cno);
}
static const char checkpoints_readme_str[] =
@@ -322,7 +322,7 @@ static ssize_t
nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, checkpoints_readme_str);
+ return sysfs_emit(buf, checkpoints_readme_str);
}
NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number);
@@ -353,7 +353,7 @@ nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments);
+ return sysfs_emit(buf, "%lu\n", nilfs->ns_nsegments);
}
static ssize_t
@@ -361,7 +361,7 @@ nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment);
+ return sysfs_emit(buf, "%lu\n", nilfs->ns_blocks_per_segment);
}
static ssize_t
@@ -375,7 +375,7 @@ nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr,
ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
- return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs);
+ return sysfs_emit(buf, "%lu\n", ncleansegs);
}
static ssize_t
@@ -395,7 +395,7 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
return err;
}
- return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs);
+ return sysfs_emit(buf, "%llu\n", sustat.ss_ndirtysegs);
}
static const char segments_readme_str[] =
@@ -411,7 +411,7 @@ nilfs_segments_README_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, segments_readme_str);
+ return sysfs_emit(buf, segments_readme_str);
}
NILFS_SEGMENTS_RO_ATTR(segments_number);
@@ -448,7 +448,7 @@ nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr,
last_pseg = nilfs->ns_last_pseg;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)last_pseg);
}
@@ -463,7 +463,7 @@ nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr,
last_seq = nilfs->ns_last_seq;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq);
+ return sysfs_emit(buf, "%llu\n", last_seq);
}
static ssize_t
@@ -477,7 +477,7 @@ nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr,
last_cno = nilfs->ns_last_cno;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+ return sysfs_emit(buf, "%llu\n", last_cno);
}
static ssize_t
@@ -491,7 +491,7 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
seg_seq = nilfs->ns_seg_seq;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
+ return sysfs_emit(buf, "%llu\n", seg_seq);
}
static ssize_t
@@ -505,7 +505,7 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
segnum = nilfs->ns_segnum;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
+ return sysfs_emit(buf, "%llu\n", segnum);
}
static ssize_t
@@ -519,7 +519,7 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
nextnum = nilfs->ns_nextnum;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
+ return sysfs_emit(buf, "%llu\n", nextnum);
}
static ssize_t
@@ -533,7 +533,7 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
pseg_offset = nilfs->ns_pseg_offset;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
+ return sysfs_emit(buf, "%lu\n", pseg_offset);
}
static ssize_t
@@ -547,7 +547,7 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
cno = nilfs->ns_cno;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+ return sysfs_emit(buf, "%llu\n", cno);
}
static ssize_t
@@ -575,7 +575,7 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
ctime = nilfs->ns_ctime;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", ctime);
+ return sysfs_emit(buf, "%llu\n", ctime);
}
static ssize_t
@@ -603,7 +603,7 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
nongc_ctime = nilfs->ns_nongc_ctime;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime);
+ return sysfs_emit(buf, "%llu\n", nongc_ctime);
}
static ssize_t
@@ -617,7 +617,7 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
+ return sysfs_emit(buf, "%u\n", ndirtyblks);
}
static const char segctor_readme_str[] =
@@ -654,7 +654,7 @@ static ssize_t
nilfs_segctor_README_show(struct nilfs_segctor_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, segctor_readme_str);
+ return sysfs_emit(buf, segctor_readme_str);
}
NILFS_SEGCTOR_RO_ATTR(last_pseg_block);
@@ -723,7 +723,7 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
sbwtime = nilfs->ns_sbwtime;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime);
+ return sysfs_emit(buf, "%llu\n", sbwtime);
}
static ssize_t
@@ -737,7 +737,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
sbwcount = nilfs->ns_sbwcount;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount);
+ return sysfs_emit(buf, "%u\n", sbwcount);
}
static ssize_t
@@ -751,7 +751,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
sb_update_freq = nilfs->ns_sb_update_freq;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq);
+ return sysfs_emit(buf, "%u\n", sb_update_freq);
}
static ssize_t
@@ -799,7 +799,7 @@ static ssize_t
nilfs_superblock_README_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, sb_readme_str);
+ return sysfs_emit(buf, sb_readme_str);
}
NILFS_SUPERBLOCK_RO_ATTR(sb_write_time);
@@ -834,7 +834,7 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
u32 major = le32_to_cpu(sbp[0]->s_rev_level);
u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
- return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor);
+ return sysfs_emit(buf, "%d.%d\n", major, minor);
}
static
@@ -842,7 +842,7 @@ ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize);
+ return sysfs_emit(buf, "%u\n", nilfs->ns_blocksize);
}
static
@@ -853,7 +853,7 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
struct nilfs_super_block **sbp = nilfs->ns_sbp;
u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
- return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size);
+ return sysfs_emit(buf, "%llu\n", dev_size);
}
static
@@ -864,7 +864,7 @@ ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr,
sector_t free_blocks = 0;
nilfs_count_free_blocks(nilfs, &free_blocks);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)free_blocks);
}
@@ -875,7 +875,7 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
- return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid);
+ return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid);
}
static
@@ -903,7 +903,7 @@ static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, dev_readme_str);
+ return sysfs_emit(buf, dev_readme_str);
}
NILFS_DEV_RO_ATTR(revision);
@@ -1047,7 +1047,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d.%d\n",
+ return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
}
@@ -1060,7 +1060,7 @@ static ssize_t nilfs_feature_README_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, features_readme_str);
+ return sysfs_emit(buf, features_readme_str);
}
NILFS_FEATURE_RO_ATTR(revision);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index d001eb862dae..78a87a016928 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * sysfs.h - sysfs support declarations.
+ * Sysfs support declarations.
*
* Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
* Copyright (C) 2014 HGST, Inc., a Western Digital Company.
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index c8bfc01da5d7..dd48a8f74d57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * the_nilfs.c - the_nilfs shared structure.
+ * the_nilfs shared structure.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct buffer_head **sbh = nilfs->ns_sbh;
- u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+ u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev));
int valid[2], swp = 0;
sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 987c8ab02aee..47c7dfbb7ea5 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * the_nilfs.h - the_nilfs shared structure.
+ * the_nilfs shared structure.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 057abd2cf887..b6091775aa6e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -111,6 +111,16 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
return fanotify_info_equal(info1, info2);
}
+static bool fanotify_error_event_equal(struct fanotify_error_event *fee1,
+ struct fanotify_error_event *fee2)
+{
+ /* Error events against the same file system are always merged. */
+ if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid))
+ return false;
+
+ return true;
+}
+
static bool fanotify_should_merge(struct fanotify_event *old,
struct fanotify_event *new)
{
@@ -141,6 +151,9 @@ static bool fanotify_should_merge(struct fanotify_event *old,
case FANOTIFY_EVENT_TYPE_FID_NAME:
return fanotify_name_event_equal(FANOTIFY_NE(old),
FANOTIFY_NE(new));
+ case FANOTIFY_EVENT_TYPE_FS_ERROR:
+ return fanotify_error_event_equal(FANOTIFY_EE(old),
+ FANOTIFY_EE(new));
default:
WARN_ON_ONCE(1);
}
@@ -176,6 +189,10 @@ static int fanotify_merge(struct fsnotify_group *group,
break;
if (fanotify_should_merge(old, new)) {
old->mask |= new->mask;
+
+ if (fanotify_is_error_event(old->mask))
+ FANOTIFY_EE(old)->err_count++;
+
return 1;
}
}
@@ -343,13 +360,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
static int fanotify_encode_fh_len(struct inode *inode)
{
int dwords = 0;
+ int fh_len;
if (!inode)
return 0;
exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+ fh_len = dwords << 2;
+
+ /*
+ * struct fanotify_error_event might be preallocated and is
+ * limited to MAX_HANDLE_SZ. This should never happen, but
+ * safeguard by forcing an invalid file handle.
+ */
+ if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ))
+ return 0;
- return dwords << 2;
+ return fh_len;
}
/*
@@ -370,8 +397,14 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
fh->type = FILEID_ROOT;
fh->len = 0;
fh->flags = 0;
+
+ /*
+ * Invalid FHs are used by FAN_FS_ERROR for errors not
+ * linked to any inode. The f_handle won't be reported
+ * back to userspace.
+ */
if (!inode)
- return 0;
+ goto out;
/*
* !gpf means preallocated variable size fh, but fh_len could
@@ -403,8 +436,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
fh->type = type;
fh->len = fh_len;
- /* Mix fh into event merge key */
- *hash ^= fanotify_hash_fh(fh);
+out:
+ /*
+ * Mix fh into event merge key. Hash might be NULL in case of
+ * unhashed FID events (i.e. FAN_FS_ERROR).
+ */
+ if (hash)
+ *hash ^= fanotify_hash_fh(fh);
return FANOTIFY_FH_HDR_LEN + fh_len;
@@ -452,7 +490,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data,
if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
return dir;
- if (S_ISDIR(inode->i_mode))
+ if (inode && S_ISDIR(inode->i_mode))
return inode;
return dir;
@@ -563,6 +601,44 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
return &fne->fae;
}
+static struct fanotify_event *fanotify_alloc_error_event(
+ struct fsnotify_group *group,
+ __kernel_fsid_t *fsid,
+ const void *data, int data_type,
+ unsigned int *hash)
+{
+ struct fs_error_report *report =
+ fsnotify_data_error_report(data, data_type);
+ struct inode *inode;
+ struct fanotify_error_event *fee;
+ int fh_len;
+
+ if (WARN_ON_ONCE(!report))
+ return NULL;
+
+ fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS);
+ if (!fee)
+ return NULL;
+
+ fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
+ fee->error = report->error;
+ fee->err_count = 1;
+ fee->fsid = *fsid;
+
+ inode = report->inode;
+ fh_len = fanotify_encode_fh_len(inode);
+
+ /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */
+ if (!fh_len && inode)
+ inode = NULL;
+
+ fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0);
+
+ *hash ^= fanotify_hash_fsid(fsid);
+
+ return &fee->fae;
+}
+
static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
u32 mask, const void *data,
int data_type, struct inode *dir,
@@ -630,6 +706,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
+ } else if (fanotify_is_error_event(mask)) {
+ event = fanotify_alloc_error_event(group, fsid, data,
+ data_type, &hash);
} else if (name_event && (file_name || child)) {
event = fanotify_alloc_name_event(id, fsid, file_name, child,
&hash, gfp);
@@ -702,6 +781,9 @@ static void fanotify_insert_event(struct fsnotify_group *group,
assert_spin_locked(&group->notification_lock);
+ if (!fanotify_is_hashed_event(event->mask))
+ return;
+
pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
group, event, bucket);
@@ -738,8 +820,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
+ BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20);
mask = fanotify_group_event_mask(group, iter_info, mask, data,
data_type, dir);
@@ -778,9 +861,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
}
fsn_event = &event->fse;
- ret = fsnotify_add_event(group, fsn_event, fanotify_merge,
- fanotify_is_hashed_event(mask) ?
- fanotify_insert_event : NULL);
+ ret = fsnotify_insert_event(group, fsn_event, fanotify_merge,
+ fanotify_insert_event);
if (ret) {
/* Permission events shouldn't be merged */
BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
@@ -805,6 +887,9 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
if (group->fanotify_data.ucounts)
dec_ucount(group->fanotify_data.ucounts,
UCOUNT_FANOTIFY_GROUPS);
+
+ if (mempool_initialized(&group->fanotify_data.error_events_pool))
+ mempool_exit(&group->fanotify_data.error_events_pool);
}
static void fanotify_free_path_event(struct fanotify_event *event)
@@ -833,7 +918,16 @@ static void fanotify_free_name_event(struct fanotify_event *event)
kfree(FANOTIFY_NE(event));
}
-static void fanotify_free_event(struct fsnotify_event *fsn_event)
+static void fanotify_free_error_event(struct fsnotify_group *group,
+ struct fanotify_event *event)
+{
+ struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+ mempool_free(fee, &group->fanotify_data.error_events_pool);
+}
+
+static void fanotify_free_event(struct fsnotify_group *group,
+ struct fsnotify_event *fsn_event)
{
struct fanotify_event *event;
@@ -855,6 +949,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
case FANOTIFY_EVENT_TYPE_OVERFLOW:
kfree(event);
break;
+ case FANOTIFY_EVENT_TYPE_FS_ERROR:
+ fanotify_free_error_event(group, event);
+ break;
default:
WARN_ON_ONCE(1);
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 4a5e555dc3d2..d25f500bf7e7 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -141,6 +141,7 @@ enum fanotify_event_type {
FANOTIFY_EVENT_TYPE_PATH,
FANOTIFY_EVENT_TYPE_PATH_PERM,
FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
+ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
__FANOTIFY_EVENT_TYPE_NUM
};
@@ -170,12 +171,18 @@ static inline void fanotify_init_event(struct fanotify_event *event,
event->pid = NULL;
}
+#define FANOTIFY_INLINE_FH(name, size) \
+struct { \
+ struct fanotify_fh (name); \
+ /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
+ unsigned char _inline_fh_buf[(size)]; \
+}
+
struct fanotify_fid_event {
struct fanotify_event fae;
__kernel_fsid_t fsid;
- struct fanotify_fh object_fh;
- /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */
- unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN];
+
+ FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN);
};
static inline struct fanotify_fid_event *
@@ -196,12 +203,30 @@ FANOTIFY_NE(struct fanotify_event *event)
return container_of(event, struct fanotify_name_event, fae);
}
+struct fanotify_error_event {
+ struct fanotify_event fae;
+ s32 error; /* Error reported by the Filesystem. */
+ u32 err_count; /* Suppressed errors count */
+
+ __kernel_fsid_t fsid; /* FSID this error refers to. */
+
+ FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ);
+};
+
+static inline struct fanotify_error_event *
+FANOTIFY_EE(struct fanotify_event *event)
+{
+ return container_of(event, struct fanotify_error_event, fae);
+}
+
static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_FID)
return &FANOTIFY_FE(event)->fsid;
else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
return &FANOTIFY_NE(event)->fsid;
+ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return &FANOTIFY_EE(event)->fsid;
else
return NULL;
}
@@ -213,6 +238,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh(
return &FANOTIFY_FE(event)->object_fh;
else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
return fanotify_info_file_fh(&FANOTIFY_NE(event)->info);
+ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return &FANOTIFY_EE(event)->object_fh;
else
return NULL;
}
@@ -244,6 +271,19 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event)
return info ? fanotify_info_dir_fh_len(info) : 0;
}
+static inline bool fanotify_event_has_object_fh(struct fanotify_event *event)
+{
+ /* For error events, even zeroed fh are reported. */
+ if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return true;
+ return fanotify_event_object_fh_len(event) > 0;
+}
+
+static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event)
+{
+ return fanotify_event_dir_fh_len(event) > 0;
+}
+
struct fanotify_path_event {
struct fanotify_event fae;
struct path path;
@@ -287,6 +327,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct fanotify_event, fse);
}
+static inline bool fanotify_is_error_event(u32 mask)
+{
+ return mask & FAN_FS_ERROR;
+}
+
static inline bool fanotify_event_has_path(struct fanotify_event *event)
{
return event->type == FANOTIFY_EVENT_TYPE_PATH ||
@@ -315,7 +360,8 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event)
*/
static inline bool fanotify_is_hashed_event(u32 mask)
{
- return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW);
+ return !(fanotify_is_perm_event(mask) ||
+ fsnotify_is_overflow_event(mask));
}
static inline unsigned int fanotify_event_hash_bucket(
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6facdf476255..559bc1e9926d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -30,6 +30,7 @@
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
#define FANOTIFY_DEFAULT_MAX_GROUPS 128
+#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32
/*
* Legacy fanotify marks limits (8192) is per group and we introduced a tunable
@@ -114,6 +115,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
sizeof(struct fanotify_event_info_pidfd)
+#define FANOTIFY_ERROR_INFO_LEN \
+ (sizeof(struct fanotify_event_info_error))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -126,17 +129,26 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
FANOTIFY_EVENT_ALIGN);
}
-static int fanotify_event_info_len(unsigned int info_mode,
- struct fanotify_event *event)
+static size_t fanotify_event_len(unsigned int info_mode,
+ struct fanotify_event *event)
{
- struct fanotify_info *info = fanotify_event_info(event);
- int dir_fh_len = fanotify_event_dir_fh_len(event);
- int fh_len = fanotify_event_object_fh_len(event);
- int info_len = 0;
+ size_t event_len = FAN_EVENT_METADATA_LEN;
+ struct fanotify_info *info;
+ int dir_fh_len;
+ int fh_len;
int dot_len = 0;
- if (dir_fh_len) {
- info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
+ if (!info_mode)
+ return event_len;
+
+ if (fanotify_is_error_event(event->mask))
+ event_len += FANOTIFY_ERROR_INFO_LEN;
+
+ info = fanotify_event_info(event);
+
+ if (fanotify_event_has_dir_fh(event)) {
+ dir_fh_len = fanotify_event_dir_fh_len(event);
+ event_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
} else if ((info_mode & FAN_REPORT_NAME) &&
(event->mask & FAN_ONDIR)) {
/*
@@ -147,12 +159,14 @@ static int fanotify_event_info_len(unsigned int info_mode,
}
if (info_mode & FAN_REPORT_PIDFD)
- info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+ event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
- if (fh_len)
- info_len += fanotify_fid_info_len(fh_len, dot_len);
+ if (fanotify_event_has_object_fh(event)) {
+ fh_len = fanotify_event_object_fh_len(event);
+ event_len += fanotify_fid_info_len(fh_len, dot_len);
+ }
- return info_len;
+ return event_len;
}
/*
@@ -181,7 +195,7 @@ static void fanotify_unhash_event(struct fsnotify_group *group,
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- size_t event_size = FAN_EVENT_METADATA_LEN;
+ size_t event_size;
struct fanotify_event *event = NULL;
struct fsnotify_event *fsn_event;
unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
@@ -194,8 +208,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
goto out;
event = FANOTIFY_E(fsn_event);
- if (info_mode)
- event_size += fanotify_event_info_len(info_mode, event);
+ event_size = fanotify_event_len(info_mode, event);
if (event_size > count) {
event = ERR_PTR(-EINVAL);
@@ -316,6 +329,28 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
+static size_t copy_error_info_to_user(struct fanotify_event *event,
+ char __user *buf, int count)
+{
+ struct fanotify_event_info_error info;
+ struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
+ info.hdr.pad = 0;
+ info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
+
+ if (WARN_ON(count < info.hdr.len))
+ return -EFAULT;
+
+ info.error = fee->error;
+ info.error_count = fee->err_count;
+
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ return info.hdr.len;
+}
+
static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
int info_type, const char *name,
size_t name_len,
@@ -331,9 +366,6 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
__func__, fh_len, name_len, info_len, count);
- if (!fh_len)
- return 0;
-
if (WARN_ON_ONCE(len < sizeof(info) || len > count))
return -EFAULT;
@@ -368,6 +400,11 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
handle.handle_type = fh->type;
handle.handle_bytes = fh_len;
+
+ /* Mangle handle_type for bad file_handle */
+ if (!fh_len)
+ handle.handle_type = FILEID_INVALID;
+
if (copy_to_user(buf, &handle, sizeof(handle)))
return -EFAULT;
@@ -444,7 +481,7 @@ static int copy_info_records_to_user(struct fanotify_event *event,
/*
* Event info records order is as follows: dir fid + name, child fid.
*/
- if (fanotify_event_dir_fh_len(event)) {
+ if (fanotify_event_has_dir_fh(event)) {
info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
FAN_EVENT_INFO_TYPE_DFID;
ret = copy_fid_info_to_user(fanotify_event_fsid(event),
@@ -460,7 +497,7 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
- if (fanotify_event_object_fh_len(event)) {
+ if (fanotify_event_has_object_fh(event)) {
const char *dot = NULL;
int dot_len = 0;
@@ -520,6 +557,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
+ if (fanotify_is_error_event(event->mask)) {
+ ret = copy_error_info_to_user(event, buf, count);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
return total_bytes;
}
@@ -537,8 +583,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- metadata.event_len = FAN_EVENT_METADATA_LEN +
- fanotify_event_info_len(info_mode, event);
+ metadata.event_len = fanotify_event_len(info_mode, event);
metadata.metadata_len = FAN_EVENT_METADATA_LEN;
metadata.vers = FANOTIFY_METADATA_VERSION;
metadata.reserved = 0;
@@ -1049,6 +1094,15 @@ out_dec_ucounts:
return ERR_PTR(ret);
}
+static int fanotify_group_init_error_pool(struct fsnotify_group *group)
+{
+ if (mempool_initialized(&group->fanotify_data.error_events_pool))
+ return 0;
+
+ return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
+ FANOTIFY_DEFAULT_FEE_POOL_SIZE,
+ sizeof(struct fanotify_error_event));
+}
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int type,
@@ -1057,6 +1111,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark;
__u32 added;
+ int ret = 0;
mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_mark(connp, group);
@@ -1067,13 +1122,26 @@ static int fanotify_add_mark(struct fsnotify_group *group,
return PTR_ERR(fsn_mark);
}
}
+
+ /*
+ * Error events are pre-allocated per group, only if strictly
+ * needed (i.e. FAN_FS_ERROR was requested).
+ */
+ if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+ ret = fanotify_group_init_error_pool(group);
+ if (ret)
+ goto out;
+ }
+
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
if (added & ~fsnotify_conn_mask(fsn_mark->connector))
fsnotify_recalc_mask(fsn_mark->connector);
+
+out:
mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
- return 0;
+ return ret;
}
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
@@ -1295,16 +1363,15 @@ out_destroy_group:
return fd;
}
-/* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
{
__kernel_fsid_t root_fsid;
int err;
/*
- * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+ * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
*/
- err = vfs_get_fsid(path->dentry, fsid);
+ err = vfs_get_fsid(dentry, fsid);
if (err)
return err;
@@ -1312,10 +1379,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
return -ENODEV;
/*
- * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+ * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
* which uses a different fsid than sb root.
*/
- err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+ err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
if (err)
return err;
@@ -1323,6 +1390,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
root_fsid.val[1] != fsid->val[1])
return -EXDEV;
+ return 0;
+}
+
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct dentry *dentry)
+{
/*
* We need to make sure that the file system supports at least
* encoding a file handle so user can use name_to_handle_at() to
@@ -1330,8 +1403,8 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
* objects. However, name_to_handle_at() requires that the
* filesystem also supports decoding file handles.
*/
- if (!path->dentry->d_sb->s_export_op ||
- !path->dentry->d_sb->s_export_op->fh_to_dentry)
+ if (!dentry->d_sb->s_export_op ||
+ !dentry->d_sb->s_export_op->fh_to_dentry)
return -EOPNOTSUPP;
return 0;
@@ -1447,15 +1520,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ if (mask & FAN_FS_ERROR &&
+ mark_type != FAN_MARK_FILESYSTEM)
+ goto fput_and_out;
+
/*
- * Events with data type inode do not carry enough information to report
- * event->fd, so we do not allow setting a mask for inode events unless
- * group supports reporting fid.
- * inode events are not supported on a mount mark, because they do not
- * carry enough information (i.e. path) to be filtered by mount point.
+ * Events that do not carry enough information to report
+ * event->fd require a group that supports reporting fid. Those
+ * events are not supported on a mount mark, because they do not
+ * carry enough information (i.e. path) to be filtered by mount
+ * point.
*/
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
- if (mask & FANOTIFY_INODE_EVENTS &&
+ if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
goto fput_and_out;
@@ -1482,7 +1559,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
if (fid_mode) {
- ret = fanotify_test_fid(&path, &__fsid);
+ ret = fanotify_test_fsid(path.dentry, &__fsid);
+ if (ret)
+ goto path_put_and_out;
+
+ ret = fanotify_test_fid(path.dentry);
if (ret)
goto path_put_and_out;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 963e6ce75b96..4034ca566f95 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -252,6 +252,9 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
if (WARN_ON_ONCE(!ops->handle_inode_event))
return 0;
+ if (WARN_ON_ONCE(!inode && !dir))
+ return 0;
+
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
return 0;
@@ -455,16 +458,16 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
* @file_name is relative to
* @file_name: optional file name associated with event
* @inode: optional inode associated with event -
- * either @dir or @inode must be non-NULL.
- * if both are non-NULL event may be reported to both.
+ * If @dir and @inode are both non-NULL, event may be
+ * reported to both.
* @cookie: inotify rename cookie
*/
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
const struct qstr *file_name, struct inode *inode, u32 cookie)
{
const struct path *path = fsnotify_data_path(data, data_type);
+ struct super_block *sb = fsnotify_data_sb(data, data_type);
struct fsnotify_iter_info iter_info = {};
- struct super_block *sb;
struct mount *mnt = NULL;
struct inode *parent = NULL;
int ret = 0;
@@ -483,7 +486,6 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
*/
parent = dir;
}
- sb = inode->i_sb;
/*
* Optimization: srcu_read_lock() has a memory barrier which can
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fb89c351295d..6a297efc4788 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
* that deliberately ignores overflow events.
*/
if (group->overflow_event)
- group->ops->free_event(group->overflow_event);
+ group->ops->free_event(group, group->overflow_event);
fsnotify_put_group(group);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d1a64daa0171..d92d7b0adc9a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -116,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
if (len)
strcpy(event->name, name->name);
- ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL);
+ ret = fsnotify_add_event(group, fsn_event, inotify_merge);
if (ret) {
/* Our event wasn't used in the end. Free it. */
fsnotify_destroy_event(group, fsn_event);
@@ -177,7 +177,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
dec_inotify_instances(group->inotify_data.ucounts);
}
-static void inotify_free_event(struct fsnotify_event *fsn_event)
+static void inotify_free_event(struct fsnotify_group *group,
+ struct fsnotify_event *fsn_event)
{
kfree(INOTIFY_E(fsn_event));
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 62051247f6d2..29fca3284bb5 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -94,10 +94,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
__u32 mask;
/*
- * Everything should accept their own ignored and should receive events
- * when the inode is unmounted. All directories care about children.
+ * Everything should receive events when the inode is unmounted.
+ * All directories care about children.
*/
- mask = (FS_IN_IGNORED | FS_UNMOUNT);
+ mask = (FS_UNMOUNT);
if (S_ISDIR(inode->i_mode))
mask |= FS_EVENT_ON_CHILD;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 32f45543b9c6..9022ae650cf8 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -64,7 +64,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
WARN_ON(!list_empty(&event->list));
spin_unlock(&group->notification_lock);
}
- group->ops->free_event(event);
+ group->ops->free_event(group, event);
}
/*
@@ -78,12 +78,12 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
* 2 if the event was not queued - either the queue of events has overflown
* or the group is shutting down.
*/
-int fsnotify_add_event(struct fsnotify_group *group,
- struct fsnotify_event *event,
- int (*merge)(struct fsnotify_group *,
- struct fsnotify_event *),
- void (*insert)(struct fsnotify_group *,
- struct fsnotify_event *))
+int fsnotify_insert_event(struct fsnotify_group *group,
+ struct fsnotify_event *event,
+ int (*merge)(struct fsnotify_group *,
+ struct fsnotify_event *),
+ void (*insert)(struct fsnotify_group *,
+ struct fsnotify_event *))
{
int ret = 0;
struct list_head *list = &group->notification_list;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ab4f3362466d..2ae25e48a41a 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -5,6 +5,7 @@
* Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*/
+#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
@@ -1829,7 +1830,7 @@ again:
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0d7e948cb29c..5ae8de09b271 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
ntfs_debug("Set device block size to %i bytes (block size bits %i).",
blocksize, sb->s_blocksize_bits);
/* Determine the size of the device in units of block_size bytes. */
- if (!i_size_read(sb->s_bdev->bd_inode)) {
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
+ if (!vol->nr_blocks) {
if (!silent)
ntfs_error(sb, "Unable to determine device size.");
goto err_out_now;
}
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
/* Read the boot sector and return unlocked buffer head to it. */
if (!(bh = read_ntfs_boot_sector(sb, silent))) {
if (!silent)
@@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
goto err_out_now;
}
BUG_ON(blocksize != sb->s_blocksize);
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
ntfs_debug("Changed device block size to %i bytes (block size "
"bits %i) to match volume sector size.",
blocksize, sb->s_blocksize_bits);
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 34c4cbf7e29b..e8c00dda42ad 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -6,13 +6,9 @@
* TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame?
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/hash.h>
-#include <linux/nls.h>
-#include <linux/ratelimit.h>
#include <linux/slab.h>
+#include <linux/kernel.h>
#include "debug.h"
#include "ntfs.h"
@@ -291,7 +287,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
if (!rsize) {
/* Empty resident -> Non empty nonresident. */
} else if (!is_data) {
- err = ntfs_sb_write_run(sbi, run, 0, data, rsize);
+ err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0);
if (err)
goto out2;
} else if (!page) {
@@ -451,11 +447,8 @@ again:
again_1:
align = sbi->cluster_size;
- if (is_ext) {
+ if (is_ext)
align <<= attr_b->nres.c_unit;
- if (is_attr_sparsed(attr_b))
- keep_prealloc = false;
- }
old_valid = le64_to_cpu(attr_b->nres.valid_size);
old_size = le64_to_cpu(attr_b->nres.data_size);
@@ -465,9 +458,6 @@ again_1:
new_alloc = (new_size + align - 1) & ~(u64)(align - 1);
new_alen = new_alloc >> cluster_bits;
- if (keep_prealloc && is_ext)
- keep_prealloc = false;
-
if (keep_prealloc && new_size < old_size) {
attr_b->nres.data_size = cpu_to_le64(new_size);
mi_b->dirty = true;
@@ -529,7 +519,7 @@ add_alloc_in_same_attr_seg:
} else if (pre_alloc == -1) {
pre_alloc = 0;
if (type == ATTR_DATA && !name_len &&
- sbi->options.prealloc) {
+ sbi->options->prealloc) {
CLST new_alen2 = bytes_to_cluster(
sbi, get_pre_allocated(new_size));
pre_alloc = new_alen2 - new_alen;
@@ -1966,7 +1956,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
return 0;
from = vbo;
- to = (vbo + bytes) < data_size ? (vbo + bytes) : data_size;
+ to = min_t(u64, vbo + bytes, data_size);
memset(Add2Ptr(resident_data(attr_b), from), 0, to - from);
return 0;
}
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index fa32399eb517..bad6d8a849a2 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -5,10 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
@@ -336,7 +333,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
if (attr && attr->non_res) {
err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
- al->size);
+ al->size, 0);
if (err)
return err;
al->dirty = false;
@@ -423,7 +420,7 @@ next:
return true;
}
-int al_update(struct ntfs_inode *ni)
+int al_update(struct ntfs_inode *ni, int sync)
{
int err;
struct ATTRIB *attr;
@@ -445,7 +442,7 @@ int al_update(struct ntfs_inode *ni)
memcpy(resident_data(attr), al->le, al->size);
} else {
err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
- al->size);
+ al->size, sync);
if (err)
goto out;
diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c
index ce304d40b5e1..50d838093790 100644
--- a/fs/ntfs3/bitfunc.c
+++ b/fs/ntfs3/bitfunc.c
@@ -5,13 +5,8 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/types.h>
-#include "debug.h"
-#include "ntfs.h"
#include "ntfs_fs.h"
#define BITS_IN_SIZE_T (sizeof(size_t) * 8)
@@ -124,8 +119,7 @@ bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits)
pos = nbits & 7;
if (pos) {
- u8 mask = fill_mask[pos];
-
+ mask = fill_mask[pos];
if ((*map & mask) != mask)
return false;
}
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 831501555009..aa184407520f 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -10,12 +10,10 @@
*
*/
-#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
-#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
@@ -435,7 +433,7 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len)
;
} else {
n3 = rb_next(&e->count.node);
- max_new_len = len > new_len ? len : new_len;
+ max_new_len = max(len, new_len);
if (!n3) {
wnd->extent_max = max_new_len;
} else {
@@ -731,7 +729,7 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
bh = wnd_map(wnd, iw);
if (IS_ERR(bh)) {
@@ -784,7 +782,7 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
bh = wnd_map(wnd, iw);
if (IS_ERR(bh)) {
@@ -834,7 +832,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits)
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
if (wbits != wnd->free_bits[iw]) {
bool ret;
@@ -926,7 +924,7 @@ use_wnd:
wbits = wnd->bits_last;
tail = wbits - wbit;
- op = tail < bits ? tail : bits;
+ op = min_t(u32, tail, bits);
if (wnd->free_bits[iw]) {
bool ret;
diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h
index 31120569a87b..53ef7489c75f 100644
--- a/fs/ntfs3/debug.h
+++ b/fs/ntfs3/debug.h
@@ -11,6 +11,9 @@
#ifndef _LINUX_NTFS3_DEBUG_H
#define _LINUX_NTFS3_DEBUG_H
+struct super_block;
+struct inode;
+
#ifndef Add2Ptr
#define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I)))
#define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B)))
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index 93f6d485564e..fb438d604040 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -7,10 +7,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
#include <linux/nls.h>
#include "debug.h"
@@ -18,30 +15,27 @@
#include "ntfs_fs.h"
/* Convert little endian UTF-16 to NLS string. */
-int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni,
+int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
u8 *buf, int buf_len)
{
- int ret, uni_len, warn;
- const __le16 *ip;
+ int ret, warn;
u8 *op;
- struct nls_table *nls = sbi->options.nls;
+ struct nls_table *nls = sbi->options->nls;
static_assert(sizeof(wchar_t) == sizeof(__le16));
if (!nls) {
/* UTF-16 -> UTF-8 */
- ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len,
- UTF16_LITTLE_ENDIAN, buf, buf_len);
+ ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf,
+ buf_len);
buf[ret] = '\0';
return ret;
}
- ip = uni->name;
op = buf;
- uni_len = uni->len;
warn = 0;
- while (uni_len--) {
+ while (len--) {
u16 ec;
int charlen;
char dump[5];
@@ -52,7 +46,7 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni,
break;
}
- ec = le16_to_cpu(*ip++);
+ ec = le16_to_cpu(*name++);
charlen = nls->uni2char(ec, op, buf_len);
if (charlen > 0) {
@@ -186,7 +180,7 @@ int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
{
int ret, slen;
const u8 *end;
- struct nls_table *nls = sbi->options.nls;
+ struct nls_table *nls = sbi->options->nls;
u16 *uname = uni->name;
static_assert(sizeof(wchar_t) == sizeof(u16));
@@ -301,14 +295,14 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
/* Skip meta files. Unless option to show metafiles is set. */
- if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino))
+ if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino))
return 0;
- if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
+ if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
return 0;
- name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len,
- name, PATH_MAX);
+ name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name,
+ PATH_MAX);
if (name_len <= 0) {
ntfs_warn(sbi->sb, "failed to convert name for inode %lx.",
ino);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 424450e77ad5..787b53b984ee 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -8,11 +8,11 @@
*/
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/falloc.h>
#include <linux/fiemap.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
@@ -588,8 +588,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
truncate_pagecache(inode, vbo_down);
if (!is_sparsed(ni) && !is_compressed(ni)) {
- /* Normal file. */
- err = ntfs_zero_range(inode, vbo, end);
+ /*
+ * Normal file, can't make hole.
+ * TODO: Try to find way to save info about hole.
+ */
+ err = -EOPNOTSUPP;
goto out;
}
@@ -737,7 +740,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
umode_t mode = inode->i_mode;
int err;
- if (sbi->options.no_acs_rules) {
+ if (sbi->options->noacsrules) {
/* "No access rules" - Force any changes of time etc. */
attr->ia_valid |= ATTR_FORCE;
/* and disable for editing some attributes. */
@@ -987,7 +990,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
frame_vbo = pos & ~(frame_size - 1);
index = frame_vbo >> PAGE_SHIFT;
- if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
err = -EFAULT;
goto out;
}
@@ -1185,7 +1188,7 @@ static int ntfs_file_release(struct inode *inode, struct file *file)
int err = 0;
/* If we are last writer on the inode, drop the block reservation. */
- if (sbi->options.prealloc && ((file->f_mode & FMODE_WRITE) &&
+ if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) &&
atomic_read(&inode->i_writecount) == 1)) {
ni_lock(ni);
down_write(&ni->file.run_lock);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 938b12d56ca6..6f47a9c17f89 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -5,11 +5,8 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include <linux/vmalloc.h>
#include "debug.h"
@@ -708,18 +705,35 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
continue;
mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi) {
+ /* Should never happened, 'cause already checked. */
+ goto bad;
+ }
attr = mi_find_attr(mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
+ if (!attr) {
+ /* Should never happened, 'cause already checked. */
+ goto bad;
+ }
asize = le32_to_cpu(attr->size);
/* Insert into primary record. */
attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le),
le->name_len, asize,
le16_to_cpu(attr->name_off));
- id = attr_ins->id;
+ if (!attr_ins) {
+ /*
+ * Internal error.
+ * Either no space in primary record (already checked).
+ * Either tried to insert another
+ * non indexed attribute (logic error).
+ */
+ goto bad;
+ }
/* Copy all except id. */
+ id = attr_ins->id;
memcpy(attr_ins, attr, asize);
attr_ins->id = id;
@@ -735,6 +749,10 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
ni->attr_list.dirty = false;
return 0;
+bad:
+ ntfs_inode_err(&ni->vfs_inode, "Internal error");
+ make_bad_inode(&ni->vfs_inode);
+ return -EINVAL;
}
/*
@@ -956,6 +974,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
continue;
}
+ /*
+ * Do not try to insert this attribute
+ * if there is no room in record.
+ */
+ if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size)
+ continue;
+
/* Try to insert attribute into this subrecord. */
attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
name_off, svcn, ins_le);
@@ -1451,7 +1476,7 @@ int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
attr->res.flags = RESIDENT_FLAG_INDEXED;
/* is_attr_indexed(attr)) == true */
- le16_add_cpu(&ni->mi.mrec->hard_links, +1);
+ le16_add_cpu(&ni->mi.mrec->hard_links, 1);
ni->mi.dirty = true;
}
attr->res.res = 0;
@@ -1606,7 +1631,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
*le = NULL;
- if (FILE_NAME_POSIX == name_type)
+ if (name_type == FILE_NAME_POSIX)
return NULL;
/* Enumerate all names. */
@@ -1706,18 +1731,16 @@ out:
/*
* ni_parse_reparse
*
- * Buffer is at least 24 bytes.
+ * buffer - memory for reparse buffer header
*/
enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
- void *buffer)
+ struct REPARSE_DATA_BUFFER *buffer)
{
const struct REPARSE_DATA_BUFFER *rp = NULL;
u8 bits;
u16 len;
typeof(rp->CompressReparseBuffer) *cmpr;
- static_assert(sizeof(struct REPARSE_DATA_BUFFER) <= 24);
-
/* Try to estimate reparse point. */
if (!attr->non_res) {
rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
@@ -1803,6 +1826,9 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
return REPARSE_NONE;
}
+ if (buffer != rp)
+ memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER));
+
/* Looks like normal symlink. */
return REPARSE_LINK;
}
@@ -2906,9 +2932,8 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size);
mi_get_ref(&ni->mi, &de->ref);
- if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) {
+ if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1))
return false;
- }
}
return true;
@@ -3077,7 +3102,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
const struct EA_INFO *info;
info = resident_data_ex(attr, sizeof(struct EA_INFO));
- dup->ea_size = info->size_pack;
+ /* If ATTR_EA_INFO exists 'info' can't be NULL. */
+ if (info)
+ dup->ea_size = info->size_pack;
}
}
@@ -3205,7 +3232,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
goto out;
}
- err = al_update(ni);
+ err = al_update(ni, sync);
if (err)
goto out;
}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index b5853aed0e25..06492f088d60 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -6,12 +6,8 @@
*/
#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/hash.h>
-#include <linux/nls.h>
#include <linux/random.h>
-#include <linux/ratelimit.h>
#include <linux/slab.h>
#include "debug.h"
@@ -2219,7 +2215,7 @@ file_is_valid:
err = ntfs_sb_write_run(log->ni->mi.sbi,
&log->ni->file.run, off, page,
- log->page_size);
+ log->page_size, 0);
if (err)
goto out;
@@ -3710,7 +3706,7 @@ move_data:
if (a_dirty) {
attr = oa->attr;
- err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes);
+ err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0);
if (err)
goto out;
}
@@ -5152,10 +5148,10 @@ end_reply:
ntfs_fix_pre_write(&rh->rhdr, log->page_size);
- err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size);
+ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0);
if (!err)
err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size,
- rh, log->page_size);
+ rh, log->page_size, 0);
kfree(rh);
if (err)
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 91e3743e1442..4de9acb16968 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -8,7 +8,7 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
#include "debug.h"
#include "ntfs.h"
@@ -358,7 +358,7 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
enum ALLOCATE_OPT opt)
{
int err;
- CLST alen = 0;
+ CLST alen;
struct super_block *sb = sbi->sb;
size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
@@ -370,27 +370,28 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
if (!zlen) {
err = ntfs_refresh_zone(sbi);
if (err)
- goto out;
+ goto up_write;
+
zlen = wnd_zone_len(wnd);
}
if (!zlen) {
ntfs_err(sbi->sb, "no free space to extend mft");
- goto out;
+ err = -ENOSPC;
+ goto up_write;
}
lcn = wnd_zone_bit(wnd);
- alen = zlen > len ? len : zlen;
+ alen = min_t(CLST, len, zlen);
wnd_zone_set(wnd, lcn + alen, zlen - alen);
err = wnd_set_used(wnd, lcn, alen);
- if (err) {
- up_write(&wnd->rw_lock);
- return err;
- }
+ if (err)
+ goto up_write;
+
alcn = lcn;
- goto out;
+ goto space_found;
}
/*
* 'Cause cluster 0 is always used this value means that we should use
@@ -404,49 +405,45 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn);
if (alen)
- goto out;
+ goto space_found;
/* Try to use clusters from MftZone. */
zlen = wnd_zone_len(wnd);
zeroes = wnd_zeroes(wnd);
/* Check too big request */
- if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE)
- goto out;
+ if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) {
+ err = -ENOSPC;
+ goto up_write;
+ }
/* How many clusters to cat from zone. */
zlcn = wnd_zone_bit(wnd);
zlen2 = zlen >> 1;
- ztrim = len > zlen ? zlen : (len > zlen2 ? len : zlen2);
- new_zlen = zlen - ztrim;
-
- if (new_zlen < NTFS_MIN_MFT_ZONE) {
- new_zlen = NTFS_MIN_MFT_ZONE;
- if (new_zlen > zlen)
- new_zlen = zlen;
- }
+ ztrim = clamp_val(len, zlen2, zlen);
+ new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE);
wnd_zone_set(wnd, zlcn, new_zlen);
/* Allocate continues clusters. */
alen = wnd_find(wnd, len, 0,
BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn);
-
-out:
- if (alen) {
- err = 0;
- *new_len = alen;
- *new_lcn = alcn;
-
- ntfs_unmap_meta(sb, alcn, alen);
-
- /* Set hint for next requests. */
- if (!(opt & ALLOCATE_MFT))
- sbi->used.next_free_lcn = alcn + alen;
- } else {
+ if (!alen) {
err = -ENOSPC;
+ goto up_write;
}
+space_found:
+ err = 0;
+ *new_len = alen;
+ *new_lcn = alcn;
+
+ ntfs_unmap_meta(sb, alcn, alen);
+
+ /* Set hint for next requests. */
+ if (!(opt & ALLOCATE_MFT))
+ sbi->used.next_free_lcn = alcn + alen;
+up_write:
up_write(&wnd->rw_lock);
return err;
}
@@ -1080,7 +1077,7 @@ int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
}
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
- u64 vbo, const void *buf, size_t bytes)
+ u64 vbo, const void *buf, size_t bytes, int sync)
{
struct super_block *sb = sbi->sb;
u8 cluster_bits = sbi->cluster_bits;
@@ -1099,8 +1096,8 @@ int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
len = ((u64)clen << cluster_bits) - off;
for (;;) {
- u32 op = len < bytes ? len : bytes;
- int err = ntfs_sb_write(sb, lbo, op, buf, 0);
+ u32 op = min_t(u64, len, bytes);
+ int err = ntfs_sb_write(sb, lbo, op, buf, sync);
if (err)
return err;
@@ -1300,7 +1297,7 @@ int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
nb->off = off = lbo & (blocksize - 1);
for (;;) {
- u32 len32 = len < bytes ? len : bytes;
+ u32 len32 = min_t(u64, len, bytes);
sector_t block = lbo >> sb->s_blocksize_bits;
do {
@@ -2175,7 +2172,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi,
/* Write main SDS bucket. */
err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off,
- d_security, aligned_sec_size);
+ d_security, aligned_sec_size, 0);
if (err)
goto out;
@@ -2193,7 +2190,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi,
/* Write copy SDS bucket. */
err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security,
- aligned_sec_size);
+ aligned_sec_size, 0);
if (err)
goto out;
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 0daca9adc54c..6f81e3a49abf 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -8,7 +8,7 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
#include "debug.h"
#include "ntfs.h"
@@ -671,138 +671,74 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
const struct INDEX_HDR *hdr, const void *key,
size_t key_len, const void *ctx, int *diff)
{
- struct NTFS_DE *e;
+ struct NTFS_DE *e, *found = NULL;
NTFS_CMP_FUNC cmp = indx->cmp;
+ int min_idx = 0, mid_idx, max_idx = 0;
+ int diff2;
+ int table_size = 8;
u32 e_size, e_key_len;
u32 end = le32_to_cpu(hdr->used);
u32 off = le32_to_cpu(hdr->de_off);
+ u16 offs[128];
-#ifdef NTFS3_INDEX_BINARY_SEARCH
- int max_idx = 0, fnd, min_idx;
- int nslots = 64;
- u16 *offs;
-
- if (end > 0x10000)
- goto next;
-
- offs = kmalloc(sizeof(u16) * nslots, GFP_NOFS);
- if (!offs)
- goto next;
+fill_table:
+ if (off + sizeof(struct NTFS_DE) > end)
+ return NULL;
- /* Use binary search algorithm. */
-next1:
- if (off + sizeof(struct NTFS_DE) > end) {
- e = NULL;
- goto out1;
- }
e = Add2Ptr(hdr, off);
e_size = le16_to_cpu(e->size);
- if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) {
- e = NULL;
- goto out1;
- }
-
- if (max_idx >= nslots) {
- u16 *ptr;
- int new_slots = ALIGN(2 * nslots, 8);
-
- ptr = kmalloc(sizeof(u16) * new_slots, GFP_NOFS);
- if (ptr)
- memcpy(ptr, offs, sizeof(u16) * max_idx);
- kfree(offs);
- offs = ptr;
- nslots = new_slots;
- if (!ptr)
- goto next;
- }
-
- /* Store entry table. */
- offs[max_idx] = off;
+ if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
+ return NULL;
if (!de_is_last(e)) {
+ offs[max_idx] = off;
off += e_size;
- max_idx += 1;
- goto next1;
- }
- /*
- * Table of pointers is created.
- * Use binary search to find entry that is <= to the search value.
- */
- fnd = -1;
- min_idx = 0;
+ max_idx++;
+ if (max_idx < table_size)
+ goto fill_table;
- while (min_idx <= max_idx) {
- int mid_idx = min_idx + ((max_idx - min_idx) >> 1);
- int diff2;
-
- e = Add2Ptr(hdr, offs[mid_idx]);
+ max_idx--;
+ }
- e_key_len = le16_to_cpu(e->key_size);
+binary_search:
+ e_key_len = le16_to_cpu(e->key_size);
- diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
+ diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
+ if (diff2 > 0) {
+ if (found) {
+ min_idx = mid_idx + 1;
+ } else {
+ if (de_is_last(e))
+ return NULL;
- if (!diff2) {
- *diff = 0;
- goto out1;
+ max_idx = 0;
+ table_size = min(table_size * 2,
+ (int)ARRAY_SIZE(offs));
+ goto fill_table;
}
-
- if (diff2 < 0) {
+ } else if (diff2 < 0) {
+ if (found)
max_idx = mid_idx - 1;
- fnd = mid_idx;
- if (!fnd)
- break;
- } else {
- min_idx = mid_idx + 1;
- }
- }
+ else
+ max_idx--;
- if (fnd == -1) {
- e = NULL;
- goto out1;
+ found = e;
+ } else {
+ *diff = 0;
+ return e;
}
- *diff = -1;
- e = Add2Ptr(hdr, offs[fnd]);
-
-out1:
- kfree(offs);
-
- return e;
-#endif
-
-next:
- /*
- * Entries index are sorted.
- * Enumerate all entries until we find entry
- * that is <= to the search value.
- */
- if (off + sizeof(struct NTFS_DE) > end)
- return NULL;
-
- e = Add2Ptr(hdr, off);
- e_size = le16_to_cpu(e->size);
-
- if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
- return NULL;
-
- off += e_size;
-
- e_key_len = le16_to_cpu(e->key_size);
-
- *diff = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
- if (!*diff)
- return e;
+ if (min_idx > max_idx) {
+ *diff = -1;
+ return found;
+ }
- if (*diff <= 0)
- return e;
+ mid_idx = (min_idx + max_idx) >> 1;
+ e = Add2Ptr(hdr, offs[mid_idx]);
- if (de_is_last(e)) {
- *diff = 1;
- return e;
- }
- goto next;
+ goto binary_search;
}
/*
@@ -1136,9 +1072,7 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
if (!e)
return -EINVAL;
- if (fnd)
- fnd->root_de = e;
-
+ fnd->root_de = e;
err = 0;
for (;;) {
@@ -1401,7 +1335,7 @@ ok:
static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
CLST *vbn)
{
- int err = -ENOMEM;
+ int err;
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTRIB *bitmap;
struct ATTRIB *alloc;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index db2a5a4c38e4..a87ab3ad3cd3 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -5,10 +5,8 @@
*
*/
-#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/nls.h>
@@ -49,8 +47,8 @@ static struct inode *ntfs_read_mft(struct inode *inode,
inode->i_op = NULL;
/* Setup 'uid' and 'gid' */
- inode->i_uid = sbi->options.fs_uid;
- inode->i_gid = sbi->options.fs_gid;
+ inode->i_uid = sbi->options->fs_uid;
+ inode->i_gid = sbi->options->fs_gid;
err = mi_init(&ni->mi, sbi, ino);
if (err)
@@ -224,12 +222,9 @@ next_attr:
if (!attr->non_res) {
ni->i_valid = inode->i_size = rsize;
inode_set_bytes(inode, rsize);
- t32 = asize;
- } else {
- t32 = le16_to_cpu(attr->nres.run_off);
}
- mode = S_IFREG | (0777 & sbi->options.fs_fmask_inv);
+ mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv);
if (!attr->non_res) {
ni->ni_flags |= NI_FLAG_RESIDENT;
@@ -272,7 +267,7 @@ next_attr:
goto out;
mode = sb->s_root
- ? (S_IFDIR | (0777 & sbi->options.fs_dmask_inv))
+ ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv))
: (S_IFDIR | 0777);
goto next_attr;
@@ -315,17 +310,14 @@ next_attr:
rp_fa = ni_parse_reparse(ni, attr, &rp);
switch (rp_fa) {
case REPARSE_LINK:
- if (!attr->non_res) {
- inode->i_size = rsize;
- inode_set_bytes(inode, rsize);
- t32 = asize;
- } else {
- inode->i_size =
- le64_to_cpu(attr->nres.data_size);
- t32 = le16_to_cpu(attr->nres.run_off);
- }
+ /*
+ * Normal symlink.
+ * Assume one unicode symbol == one utf8.
+ */
+ inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer
+ .PrintNameLength) /
+ sizeof(u16);
- /* Looks like normal symlink. */
ni->i_valid = inode->i_size;
/* Clear directory bit. */
@@ -422,7 +414,7 @@ end_enum:
ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
inode->i_op = &ntfs_link_inode_operations;
inode->i_fop = NULL;
- inode_nohighmem(inode); // ??
+ inode_nohighmem(inode);
} else if (S_ISREG(mode)) {
ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
inode->i_op = &ntfs_file_inode_operations;
@@ -443,7 +435,7 @@ end_enum:
goto out;
}
- if ((sbi->options.sys_immutable &&
+ if ((sbi->options->sys_immutable &&
(std5->fa & FILE_ATTRIBUTE_SYSTEM)) &&
!S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) {
inode->i_flags |= S_IMMUTABLE;
@@ -1054,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
if (!ret && i2)
ret = writeback_inode(i2);
if (!ret)
- ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
@@ -1200,9 +1192,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
struct REPARSE_DATA_BUFFER *rp = NULL;
bool rp_inserted = false;
+ ni_lock_dir(dir_ni);
+
dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL);
- if (!dir_root)
- return ERR_PTR(-EINVAL);
+ if (!dir_root) {
+ err = -EINVAL;
+ goto out1;
+ }
if (S_ISDIR(mode)) {
/* Use parent's directory attributes. */
@@ -1244,7 +1240,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
* }
*/
} else if (S_ISREG(mode)) {
- if (sbi->options.sparse) {
+ if (sbi->options->sparse) {
/* Sparsed regular file, cause option 'sparse'. */
fa = FILE_ATTRIBUTE_SPARSE_FILE |
FILE_ATTRIBUTE_ARCHIVE;
@@ -1486,7 +1482,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
asize = ALIGN(SIZEOF_RESIDENT + nsize, 8);
t16 = PtrOffset(rec, attr);
- /* 0x78 - the size of EA + EAINFO to store WSL */
+ /*
+ * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes.
+ * It is good idea to keep extened attributes resident.
+ */
if (asize + t16 + 0x78 + 8 > sbi->record_size) {
CLST alen;
CLST clst = bytes_to_cluster(sbi, nsize);
@@ -1521,14 +1520,14 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
}
asize = SIZEOF_NONRESIDENT + ALIGN(err, 8);
- inode->i_size = nsize;
} else {
attr->res.data_off = SIZEOF_RESIDENT_LE;
attr->res.data_size = cpu_to_le32(nsize);
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize);
- inode->i_size = nsize;
nsize = 0;
}
+ /* Size of symlink equals the length of input string. */
+ inode->i_size = size;
attr->size = cpu_to_le32(asize);
@@ -1551,6 +1550,9 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
if (err)
goto out6;
+ /* Unlock parent directory before ntfs_init_acl. */
+ ni_unlock(dir_ni);
+
inode->i_generation = le16_to_cpu(rec->seq);
dir->i_mtime = dir->i_ctime = inode->i_atime;
@@ -1562,6 +1564,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
inode->i_op = &ntfs_link_inode_operations;
inode->i_fop = NULL;
inode->i_mapping->a_ops = &ntfs_aops;
+ inode->i_size = size;
+ inode_nohighmem(inode);
} else if (S_ISREG(mode)) {
inode->i_op = &ntfs_file_inode_operations;
inode->i_fop = &ntfs_file_operations;
@@ -1577,7 +1581,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) {
err = ntfs_init_acl(mnt_userns, inode, dir);
if (err)
- goto out6;
+ goto out7;
} else
#endif
{
@@ -1586,7 +1590,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
/* Write non resident data. */
if (nsize) {
- err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize);
+ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0);
if (err)
goto out7;
}
@@ -1607,8 +1611,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
out7:
/* Undo 'indx_insert_entry'. */
+ ni_lock_dir(dir_ni);
indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
le16_to_cpu(new_de->key_size), sbi);
+ /* ni_unlock(dir_ni); will be called later. */
out6:
if (rp_inserted)
ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
@@ -1632,8 +1638,10 @@ out2:
kfree(rp);
out1:
- if (err)
+ if (err) {
+ ni_unlock(dir_ni);
return ERR_PTR(err);
+ }
unlock_new_inode(inode);
@@ -1754,15 +1762,15 @@ void ntfs_evict_inode(struct inode *inode)
static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
int buflen)
{
- int i, err = 0;
+ int i, err = -EINVAL;
struct ntfs_inode *ni = ntfs_i(inode);
struct super_block *sb = inode->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- u64 i_size = inode->i_size;
- u16 nlen = 0;
+ u64 size;
+ u16 ulen = 0;
void *to_free = NULL;
struct REPARSE_DATA_BUFFER *rp;
- struct le_str *uni;
+ const __le16 *uname;
struct ATTRIB *attr;
/* Reparse data present. Try to parse it. */
@@ -1771,68 +1779,64 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
*buffer = 0;
- /* Read into temporal buffer. */
- if (i_size > sbi->reparse.max_size || i_size <= sizeof(u32)) {
- err = -EINVAL;
- goto out;
- }
-
attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL);
- if (!attr) {
- err = -EINVAL;
+ if (!attr)
goto out;
- }
if (!attr->non_res) {
- rp = resident_data_ex(attr, i_size);
- if (!rp) {
- err = -EINVAL;
+ rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
+ if (!rp)
goto out;
- }
+ size = le32_to_cpu(attr->res.data_size);
} else {
- rp = kmalloc(i_size, GFP_NOFS);
+ size = le64_to_cpu(attr->nres.data_size);
+ rp = NULL;
+ }
+
+ if (size > sbi->reparse.max_size || size <= sizeof(u32))
+ goto out;
+
+ if (!rp) {
+ rp = kmalloc(size, GFP_NOFS);
if (!rp) {
err = -ENOMEM;
goto out;
}
to_free = rp;
- err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, i_size, NULL);
+ /* Read into temporal buffer. */
+ err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL);
if (err)
goto out;
}
- err = -EINVAL;
-
/* Microsoft Tag. */
switch (rp->ReparseTag) {
case IO_REPARSE_TAG_MOUNT_POINT:
/* Mount points and junctions. */
/* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */
- if (i_size <= offsetof(struct REPARSE_DATA_BUFFER,
- MountPointReparseBuffer.PathBuffer))
+ if (size <= offsetof(struct REPARSE_DATA_BUFFER,
+ MountPointReparseBuffer.PathBuffer))
goto out;
- uni = Add2Ptr(rp,
- offsetof(struct REPARSE_DATA_BUFFER,
- MountPointReparseBuffer.PathBuffer) +
- le16_to_cpu(rp->MountPointReparseBuffer
- .PrintNameOffset) -
- 2);
- nlen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength);
+ uname = Add2Ptr(rp,
+ offsetof(struct REPARSE_DATA_BUFFER,
+ MountPointReparseBuffer.PathBuffer) +
+ le16_to_cpu(rp->MountPointReparseBuffer
+ .PrintNameOffset));
+ ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength);
break;
case IO_REPARSE_TAG_SYMLINK:
/* FolderSymbolicLink */
/* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */
- if (i_size <= offsetof(struct REPARSE_DATA_BUFFER,
- SymbolicLinkReparseBuffer.PathBuffer))
+ if (size <= offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer.PathBuffer))
goto out;
- uni = Add2Ptr(rp,
- offsetof(struct REPARSE_DATA_BUFFER,
- SymbolicLinkReparseBuffer.PathBuffer) +
- le16_to_cpu(rp->SymbolicLinkReparseBuffer
- .PrintNameOffset) -
- 2);
- nlen = le16_to_cpu(
+ uname = Add2Ptr(
+ rp, offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer.PathBuffer) +
+ le16_to_cpu(rp->SymbolicLinkReparseBuffer
+ .PrintNameOffset));
+ ulen = le16_to_cpu(
rp->SymbolicLinkReparseBuffer.PrintNameLength);
break;
@@ -1864,29 +1868,28 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
goto out;
}
if (!IsReparseTagNameSurrogate(rp->ReparseTag) ||
- i_size <= sizeof(struct REPARSE_POINT)) {
+ size <= sizeof(struct REPARSE_POINT)) {
goto out;
}
/* Users tag. */
- uni = Add2Ptr(rp, sizeof(struct REPARSE_POINT) - 2);
- nlen = le16_to_cpu(rp->ReparseDataLength) -
+ uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT));
+ ulen = le16_to_cpu(rp->ReparseDataLength) -
sizeof(struct REPARSE_POINT);
}
/* Convert nlen from bytes to UNICODE chars. */
- nlen >>= 1;
+ ulen >>= 1;
/* Check that name is available. */
- if (!nlen || &uni->name[nlen] > (__le16 *)Add2Ptr(rp, i_size))
+ if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size))
goto out;
/* If name is already zero terminated then truncate it now. */
- if (!uni->name[nlen - 1])
- nlen -= 1;
- uni->len = nlen;
+ if (!uname[ulen - 1])
+ ulen -= 1;
- err = ntfs_utf16_to_nls(sbi, uni, buffer, buflen);
+ err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen);
if (err < 0)
goto out;
diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h
index 2d70ae42f1b5..dd7ced000d0e 100644
--- a/fs/ntfs3/lib/decompress_common.h
+++ b/fs/ntfs3/lib/decompress_common.h
@@ -5,6 +5,9 @@
* Copyright (C) 2015 Eric Biggers
*/
+#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H
+#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H
+
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/types.h>
@@ -336,3 +339,5 @@ static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend
return dst;
}
+
+#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */
diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h
index f508fbad2e71..90309a5ae59c 100644
--- a/fs/ntfs3/lib/lib.h
+++ b/fs/ntfs3/lib/lib.h
@@ -7,6 +7,10 @@
* - linux kernel code style
*/
+#ifndef _LINUX_NTFS3_LIB_LIB_H
+#define _LINUX_NTFS3_LIB_LIB_H
+
+#include <linux/types.h>
/* globals from xpress_decompress.c */
struct xpress_decompressor *xpress_allocate_decompressor(void);
@@ -24,3 +28,5 @@ int lzx_decompress(struct lzx_decompressor *__restrict d,
const void *__restrict compressed_data,
size_t compressed_size, void *__restrict uncompressed_data,
size_t uncompressed_size);
+
+#endif /* _LINUX_NTFS3_LIB_LIB_H */
diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c
index f1f691a67cc4..28f654561f27 100644
--- a/fs/ntfs3/lznt.c
+++ b/fs/ntfs3/lznt.c
@@ -5,13 +5,13 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
#include "debug.h"
-#include "ntfs.h"
#include "ntfs_fs.h"
// clang-format off
@@ -292,7 +292,7 @@ next:
/*
* get_lznt_ctx
* @level: 0 - Standard compression.
- * !0 - Best compression, requires a lot of cpu.
+ * !0 - Best compression, requires a lot of cpu.
*/
struct lznt *get_lznt_ctx(int level)
{
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index e58415d07132..bc741213ad84 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -5,11 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
-#include <linux/namei.h>
#include <linux/nls.h>
#include "debug.h"
@@ -99,16 +95,11 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
- struct ntfs_inode *ni = ntfs_i(dir);
struct inode *inode;
- ni_lock_dir(ni);
-
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode,
0, NULL, 0, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
@@ -120,16 +111,11 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir,
static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct ntfs_inode *ni = ntfs_i(dir);
struct inode *inode;
- ni_lock_dir(ni);
-
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev,
NULL, 0, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
@@ -200,15 +186,10 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
{
u32 size = strlen(symname);
struct inode *inode;
- struct ntfs_inode *ni = ntfs_i(dir);
-
- ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777,
0, symname, size, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
@@ -219,15 +200,10 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
- struct ntfs_inode *ni = ntfs_i(dir);
-
- ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode,
0, NULL, 0, NULL);
- ni_unlock(ni);
-
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 6bb3e595263b..9cc396b117bf 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -10,19 +10,27 @@
#ifndef _LINUX_NTFS3_NTFS_H
#define _LINUX_NTFS3_NTFS_H
-/* TODO: Check 4K MFT record and 512 bytes cluster. */
+#include <linux/blkdev.h>
+#include <linux/build_bug.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "debug.h"
-/* Activate this define to use binary search in indexes. */
-#define NTFS3_INDEX_BINARY_SEARCH
+/* TODO: Check 4K MFT record and 512 bytes cluster. */
/* Check each run for marked clusters. */
#define NTFS3_CHECK_FREE_CLST
#define NTFS_NAME_LEN 255
-/* ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. */
-#define NTFS_LINK_MAX 0x400
-//#define NTFS_LINK_MAX 0xffff
+/*
+ * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff.
+ * xfstest generic/041 creates 3003 hardlinks.
+ */
+#define NTFS_LINK_MAX 4000
/*
* Activate to use 64 bit clusters instead of 32 bits in ntfs.sys.
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index dc71c59fd445..8aaec7e0804e 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -9,6 +9,37 @@
#ifndef _LINUX_NTFS3_NTFS_FS_H
#define _LINUX_NTFS3_NTFS_FS_H
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/cleancache.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-flags.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/time64.h>
+#include <linux/types.h>
+#include <linux/uidgid.h>
+#include <asm/div64.h>
+#include <asm/page.h>
+
+#include "debug.h"
+#include "ntfs.h"
+
+struct dentry;
+struct fiemap_extent_info;
+struct user_namespace;
+struct page;
+struct writeback_control;
+enum utf16_endian;
+
+
#define MINUS_ONE_T ((size_t)(-1))
/* Biggest MFT / smallest cluster */
#define MAXIMUM_BYTES_PER_MFT 4096
@@ -52,6 +83,7 @@
// clang-format on
struct ntfs_mount_options {
+ char *nls_name;
struct nls_table *nls;
kuid_t fs_uid;
@@ -59,19 +91,16 @@ struct ntfs_mount_options {
u16 fs_fmask_inv;
u16 fs_dmask_inv;
- unsigned uid : 1, /* uid was set. */
- gid : 1, /* gid was set. */
- fmask : 1, /* fmask was set. */
- dmask : 1, /* dmask was set. */
- sys_immutable : 1, /* Immutable system files. */
- discard : 1, /* Issue discard requests on deletions. */
- sparse : 1, /* Create sparse files. */
- showmeta : 1, /* Show meta files. */
- nohidden : 1, /* Do not show hidden files. */
- force : 1, /* Rw mount dirty volume. */
- no_acs_rules : 1, /*Exclude acs rules. */
- prealloc : 1 /* Preallocate space when file is growing. */
- ;
+ unsigned fmask : 1; /* fmask was set. */
+ unsigned dmask : 1; /*dmask was set. */
+ unsigned sys_immutable : 1; /* Immutable system files. */
+ unsigned discard : 1; /* Issue discard requests on deletions. */
+ unsigned sparse : 1; /* Create sparse files. */
+ unsigned showmeta : 1; /* Show meta files. */
+ unsigned nohidden : 1; /* Do not show hidden files. */
+ unsigned force : 1; /* RW mount dirty volume. */
+ unsigned noacsrules : 1; /* Exclude acs rules. */
+ unsigned prealloc : 1; /* Preallocate space when file is growing. */
};
/* Special value to unpack and deallocate. */
@@ -182,10 +211,8 @@ struct ntfs_sb_info {
u32 blocks_per_cluster; // cluster_size / sb->s_blocksize
u32 record_size;
- u32 sector_size;
u32 index_size;
- u8 sector_bits;
u8 cluster_bits;
u8 record_bits;
@@ -279,7 +306,7 @@ struct ntfs_sb_info {
#endif
} compress;
- struct ntfs_mount_options options;
+ struct ntfs_mount_options *options;
struct ratelimit_state msg_ratelimit;
};
@@ -436,7 +463,7 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le);
bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
const __le16 *name, size_t name_len,
const struct MFT_REF *ref);
-int al_update(struct ntfs_inode *ni);
+int al_update(struct ntfs_inode *ni, int sync);
static inline size_t al_aligned(size_t size)
{
return (size + 1023) & ~(size_t)1023;
@@ -448,7 +475,7 @@ bool are_bits_set(const ulong *map, size_t bit, size_t nbits);
size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits);
/* Globals from dir.c */
-int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni,
+int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
u8 *buf, int buf_len);
int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
struct cpu_str *uni, u32 max_ulen,
@@ -520,7 +547,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
struct ATTR_LIST_ENTRY **entry);
int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa);
enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
- void *buffer);
+ struct REPARSE_DATA_BUFFER *buffer);
int ni_write_inode(struct inode *inode, int sync, const char *hint);
#define _ni_write_inode(i, w) ni_write_inode(i, w, __func__)
int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
@@ -577,7 +604,7 @@ int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer);
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
const void *buffer, int wait);
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
- u64 vbo, const void *buf, size_t bytes);
+ u64 vbo, const void *buf, size_t bytes, int sync);
struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi,
const struct runs_tree *run, u64 vbo);
int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run,
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 103705c86772..861e35791506 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -5,10 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index 26ed2b64345e..a8fec651f973 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -7,10 +7,8 @@
*/
#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/log2.h>
-#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 55bbc9200a10..29813200c7af 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -23,16 +23,15 @@
*
*/
-#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/fs.h>
-#include <linux/iversion.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/nls.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/statfs.h>
@@ -205,9 +204,11 @@ void *ntfs_put_shared(void *ptr)
return ret;
}
-static inline void clear_mount_options(struct ntfs_mount_options *options)
+static inline void put_mount_options(struct ntfs_mount_options *options)
{
+ kfree(options->nls_name);
unload_nls(options->nls);
+ kfree(options);
}
enum Opt {
@@ -223,218 +224,175 @@ enum Opt {
Opt_nohidden,
Opt_showmeta,
Opt_acl,
- Opt_noatime,
- Opt_nls,
+ Opt_iocharset,
Opt_prealloc,
- Opt_no_acs_rules,
+ Opt_noacsrules,
Opt_err,
};
-static const match_table_t ntfs_tokens = {
- { Opt_uid, "uid=%u" },
- { Opt_gid, "gid=%u" },
- { Opt_umask, "umask=%o" },
- { Opt_dmask, "dmask=%o" },
- { Opt_fmask, "fmask=%o" },
- { Opt_immutable, "sys_immutable" },
- { Opt_discard, "discard" },
- { Opt_force, "force" },
- { Opt_sparse, "sparse" },
- { Opt_nohidden, "nohidden" },
- { Opt_acl, "acl" },
- { Opt_noatime, "noatime" },
- { Opt_showmeta, "showmeta" },
- { Opt_nls, "nls=%s" },
- { Opt_prealloc, "prealloc" },
- { Opt_no_acs_rules, "no_acs_rules" },
- { Opt_err, NULL },
+static const struct fs_parameter_spec ntfs_fs_parameters[] = {
+ fsparam_u32("uid", Opt_uid),
+ fsparam_u32("gid", Opt_gid),
+ fsparam_u32oct("umask", Opt_umask),
+ fsparam_u32oct("dmask", Opt_dmask),
+ fsparam_u32oct("fmask", Opt_fmask),
+ fsparam_flag_no("sys_immutable", Opt_immutable),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_flag_no("force", Opt_force),
+ fsparam_flag_no("sparse", Opt_sparse),
+ fsparam_flag_no("hidden", Opt_nohidden),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("showmeta", Opt_showmeta),
+ fsparam_flag_no("prealloc", Opt_prealloc),
+ fsparam_flag_no("acsrules", Opt_noacsrules),
+ fsparam_string("iocharset", Opt_iocharset),
+ {}
};
-static noinline int ntfs_parse_options(struct super_block *sb, char *options,
- int silent,
- struct ntfs_mount_options *opts)
+/*
+ * Load nls table or if @nls is utf8 then return NULL.
+ */
+static struct nls_table *ntfs_load_nls(char *nls)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- char nls_name[30];
- struct nls_table *nls;
+ struct nls_table *ret;
- opts->fs_uid = current_uid();
- opts->fs_gid = current_gid();
- opts->fs_fmask_inv = opts->fs_dmask_inv = ~current_umask();
- nls_name[0] = 0;
+ if (!nls)
+ nls = CONFIG_NLS_DEFAULT;
- if (!options)
- goto out;
+ if (strcmp(nls, "utf8") == 0)
+ return NULL;
- while ((p = strsep(&options, ","))) {
- int token;
+ if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0)
+ return load_nls_default();
- if (!*p)
- continue;
+ ret = load_nls(nls);
+ if (ret)
+ return ret;
- token = match_token(p, ntfs_tokens, args);
- switch (token) {
- case Opt_immutable:
- opts->sys_immutable = 1;
- break;
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(opts->fs_uid))
- return -EINVAL;
- opts->uid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(opts->fs_gid))
- return -EINVAL;
- opts->gid = 1;
- break;
- case Opt_umask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask_inv = opts->fs_dmask_inv = ~option;
- opts->fmask = opts->dmask = 1;
- break;
- case Opt_dmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_dmask_inv = ~option;
- opts->dmask = 1;
- break;
- case Opt_fmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask_inv = ~option;
- opts->fmask = 1;
- break;
- case Opt_discard:
- opts->discard = 1;
- break;
- case Opt_force:
- opts->force = 1;
- break;
- case Opt_sparse:
- opts->sparse = 1;
- break;
- case Opt_nohidden:
- opts->nohidden = 1;
- break;
- case Opt_acl:
+ return ERR_PTR(-EINVAL);
+}
+
+static int ntfs_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct ntfs_mount_options *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, ntfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(opts->fs_uid))
+ return invalf(fc, "ntfs3: Invalid value for uid.");
+ break;
+ case Opt_gid:
+ opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(opts->fs_gid))
+ return invalf(fc, "ntfs3: Invalid value for gid.");
+ break;
+ case Opt_umask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for umask.");
+ opts->fs_fmask_inv = ~result.uint_32;
+ opts->fs_dmask_inv = ~result.uint_32;
+ opts->fmask = 1;
+ opts->dmask = 1;
+ break;
+ case Opt_dmask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for dmask.");
+ opts->fs_dmask_inv = ~result.uint_32;
+ opts->dmask = 1;
+ break;
+ case Opt_fmask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for fmask.");
+ opts->fs_fmask_inv = ~result.uint_32;
+ opts->fmask = 1;
+ break;
+ case Opt_immutable:
+ opts->sys_immutable = result.negated ? 0 : 1;
+ break;
+ case Opt_discard:
+ opts->discard = result.negated ? 0 : 1;
+ break;
+ case Opt_force:
+ opts->force = result.negated ? 0 : 1;
+ break;
+ case Opt_sparse:
+ opts->sparse = result.negated ? 0 : 1;
+ break;
+ case Opt_nohidden:
+ opts->nohidden = result.negated ? 1 : 0;
+ break;
+ case Opt_acl:
+ if (!result.negated)
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- sb->s_flags |= SB_POSIXACL;
- break;
+ fc->sb_flags |= SB_POSIXACL;
#else
- ntfs_err(sb, "support for ACL not compiled in!");
- return -EINVAL;
+ return invalf(fc, "ntfs3: Support for ACL not compiled in!");
#endif
- case Opt_noatime:
- sb->s_flags |= SB_NOATIME;
- break;
- case Opt_showmeta:
- opts->showmeta = 1;
- break;
- case Opt_nls:
- match_strlcpy(nls_name, &args[0], sizeof(nls_name));
- break;
- case Opt_prealloc:
- opts->prealloc = 1;
- break;
- case Opt_no_acs_rules:
- opts->no_acs_rules = 1;
- break;
- default:
- if (!silent)
- ntfs_err(
- sb,
- "Unrecognized mount option \"%s\" or missing value",
- p);
- //return -EINVAL;
- }
- }
-
-out:
- if (!strcmp(nls_name[0] ? nls_name : CONFIG_NLS_DEFAULT, "utf8")) {
- /*
- * For UTF-8 use utf16s_to_utf8s()/utf8s_to_utf16s()
- * instead of NLS.
- */
- nls = NULL;
- } else if (nls_name[0]) {
- nls = load_nls(nls_name);
- if (!nls) {
- ntfs_err(sb, "failed to load \"%s\"", nls_name);
- return -EINVAL;
- }
- } else {
- nls = load_nls_default();
- if (!nls) {
- ntfs_err(sb, "failed to load default nls");
- return -EINVAL;
- }
+ else
+ fc->sb_flags &= ~SB_POSIXACL;
+ break;
+ case Opt_showmeta:
+ opts->showmeta = result.negated ? 0 : 1;
+ break;
+ case Opt_iocharset:
+ kfree(opts->nls_name);
+ opts->nls_name = param->string;
+ param->string = NULL;
+ break;
+ case Opt_prealloc:
+ opts->prealloc = result.negated ? 0 : 1;
+ break;
+ case Opt_noacsrules:
+ opts->noacsrules = result.negated ? 1 : 0;
+ break;
+ default:
+ /* Should not be here unless we forget add case. */
+ return -EINVAL;
}
- opts->nls = nls;
-
return 0;
}
-static int ntfs_remount(struct super_block *sb, int *flags, char *data)
+static int ntfs_fs_reconfigure(struct fs_context *fc)
{
- int err, ro_rw;
+ struct super_block *sb = fc->root->d_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- struct ntfs_mount_options old_opts;
- char *orig_data = kstrdup(data, GFP_KERNEL);
-
- if (data && !orig_data)
- return -ENOMEM;
+ struct ntfs_mount_options *new_opts = fc->fs_private;
+ int ro_rw;
- /* Store original options. */
- memcpy(&old_opts, &sbi->options, sizeof(old_opts));
- clear_mount_options(&sbi->options);
- memset(&sbi->options, 0, sizeof(sbi->options));
-
- err = ntfs_parse_options(sb, data, 0, &sbi->options);
- if (err)
- goto restore_opts;
-
- ro_rw = sb_rdonly(sb) && !(*flags & SB_RDONLY);
+ ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
- ntfs_warn(
- sb,
- "Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
- err = -EINVAL;
- goto restore_opts;
+ errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
+ return -EINVAL;
}
+ new_opts->nls = ntfs_load_nls(new_opts->nls_name);
+ if (IS_ERR(new_opts->nls)) {
+ new_opts->nls = NULL;
+ errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name);
+ return -EINVAL;
+ }
+ if (new_opts->nls != sbi->options->nls)
+ return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!");
+
sync_filesystem(sb);
if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
- !sbi->options.force) {
- ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!");
- err = -EINVAL;
- goto restore_opts;
+ !new_opts->force) {
+ errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!");
+ return -EINVAL;
}
- clear_mount_options(&old_opts);
-
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME) |
- SB_NODIRATIME | SB_NOATIME;
- ntfs_info(sb, "re-mounted. Opts: %s", orig_data);
- err = 0;
- goto out;
+ memcpy(sbi->options, new_opts, sizeof(*new_opts));
-restore_opts:
- clear_mount_options(&sbi->options);
- memcpy(&sbi->options, &old_opts, sizeof(old_opts));
-
-out:
- kfree(orig_data);
- return err;
+ return 0;
}
static struct kmem_cache *ntfs_inode_cachep;
@@ -513,8 +471,6 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi)
xpress_free_decompressor(sbi->compress.xpress);
lzx_free_decompressor(sbi->compress.lzx);
#endif
- clear_mount_options(&sbi->options);
-
kfree(sbi);
}
@@ -525,7 +481,9 @@ static void ntfs_put_super(struct super_block *sb)
/* Mark rw ntfs as clear, if possible. */
ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
+ put_mount_options(sbi->options);
put_ntfs(sbi);
+ sb->s_fs_info = NULL;
sync_blockdev(sb->s_bdev);
}
@@ -552,23 +510,21 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
{
struct super_block *sb = root->d_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- struct ntfs_mount_options *opts = &sbi->options;
+ struct ntfs_mount_options *opts = sbi->options;
struct user_namespace *user_ns = seq_user_ns(m);
- if (opts->uid)
- seq_printf(m, ",uid=%u",
- from_kuid_munged(user_ns, opts->fs_uid));
- if (opts->gid)
- seq_printf(m, ",gid=%u",
- from_kgid_munged(user_ns, opts->fs_gid));
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(user_ns, opts->fs_uid));
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(user_ns, opts->fs_gid));
if (opts->fmask)
seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv);
if (opts->dmask)
seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv);
if (opts->nls)
- seq_printf(m, ",nls=%s", opts->nls->charset);
+ seq_printf(m, ",iocharset=%s", opts->nls->charset);
else
- seq_puts(m, ",nls=utf8");
+ seq_puts(m, ",iocharset=utf8");
if (opts->sys_immutable)
seq_puts(m, ",sys_immutable");
if (opts->discard)
@@ -581,14 +537,12 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nohidden");
if (opts->force)
seq_puts(m, ",force");
- if (opts->no_acs_rules)
- seq_puts(m, ",no_acs_rules");
+ if (opts->noacsrules)
+ seq_puts(m, ",noacsrules");
if (opts->prealloc)
seq_puts(m, ",prealloc");
if (sb->s_flags & SB_POSIXACL)
seq_puts(m, ",acl");
- if (sb->s_flags & SB_NOATIME)
- seq_puts(m, ",noatime");
return 0;
}
@@ -643,7 +597,6 @@ static const struct super_operations ntfs_sops = {
.statfs = ntfs_statfs,
.show_options = ntfs_show_options,
.sync_fs = ntfs_sync_fs,
- .remount_fs = ntfs_remount,
.write_inode = ntfs3_write_inode,
};
@@ -729,7 +682,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
struct ntfs_sb_info *sbi = sb->s_fs_info;
int err;
u32 mb, gb, boot_sector_size, sct_per_clst, record_size;
- u64 sectors, clusters, fs_size, mlcn, mlcn2;
+ u64 sectors, clusters, mlcn, mlcn2;
struct NTFS_BOOT *boot;
struct buffer_head *bh;
struct MFT_REC *rec;
@@ -787,20 +740,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
goto out;
}
- sbi->sector_size = boot_sector_size;
- sbi->sector_bits = blksize_bits(boot_sector_size);
- fs_size = (sectors + 1) << sbi->sector_bits;
+ sbi->volume.size = sectors * boot_sector_size;
- gb = format_size_gb(fs_size, &mb);
+ gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb);
/*
* - Volume formatted and mounted with the same sector size.
* - Volume formatted 4K and mounted as 512.
* - Volume formatted 512 and mounted as 4K.
*/
- if (sbi->sector_size != sector_size) {
- ntfs_warn(sb,
- "Different NTFS' sector size and media sector size");
+ if (boot_sector_size != sector_size) {
+ ntfs_warn(
+ sb,
+ "Different NTFS' sector size (%u) and media sector size (%u)",
+ boot_sector_size, sector_size);
dev_size += sector_size - 1;
}
@@ -810,8 +763,19 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sbi->mft.lbo = mlcn << sbi->cluster_bits;
sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits;
- if (sbi->cluster_size < sbi->sector_size)
+ /* Compare boot's cluster and sector. */
+ if (sbi->cluster_size < boot_sector_size)
+ goto out;
+
+ /* Compare boot's cluster and media sector. */
+ if (sbi->cluster_size < sector_size) {
+ /* No way to use ntfs_get_block in this case. */
+ ntfs_err(
+ sb,
+ "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)",
+ sbi->cluster_size, sector_size);
goto out;
+ }
sbi->cluster_mask = sbi->cluster_size - 1;
sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
@@ -836,10 +800,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
: (u32)boot->index_size << sbi->cluster_bits;
sbi->volume.ser_num = le64_to_cpu(boot->serial_num);
- sbi->volume.size = sectors << sbi->sector_bits;
/* Warning if RAW volume. */
- if (dev_size < fs_size) {
+ if (dev_size < sbi->volume.size + boot_sector_size) {
u32 mb0, gb0;
gb0 = format_size_gb(dev_size, &mb0);
@@ -883,8 +846,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
rec->total = cpu_to_le32(sbi->record_size);
((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END;
- if (sbi->cluster_size < PAGE_SIZE)
- sb_set_blocksize(sb, sbi->cluster_size);
+ sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE));
sbi->block_mask = sb->s_blocksize - 1;
sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits;
@@ -897,9 +859,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
if (clusters >= (1ull << (64 - sbi->cluster_bits)))
sbi->maxbytes = -1;
sbi->maxbytes_sparse = -1;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
#else
/* Maximum size for sparse file. */
sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1;
+ sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
#endif
err = 0;
@@ -913,14 +877,13 @@ out:
/*
* ntfs_fill_super - Try to mount.
*/
-static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
+static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
int err;
- struct ntfs_sb_info *sbi;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
struct block_device *bdev = sb->s_bdev;
- struct inode *bd_inode = bdev->bd_inode;
- struct request_queue *rq = bdev_get_queue(bdev);
- struct inode *inode = NULL;
+ struct request_queue *rq;
+ struct inode *inode;
struct ntfs_inode *ni;
size_t i, tt;
CLST vcn, lcn, len;
@@ -928,18 +891,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
const struct VOLUME_INFO *info;
u32 idx, done, bytes;
struct ATTR_DEF_ENTRY *t;
- u16 *upcase = NULL;
u16 *shared;
- bool is_ro;
struct MFT_REF ref;
ref.high = 0;
- sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS);
- if (!sbi)
- return -ENOMEM;
-
- sb->s_fs_info = sbi;
sbi->sb = sb;
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = 0x7366746e; // "ntfs"
@@ -948,41 +904,27 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
sb->s_xattr = ntfs_xattr_handlers;
- ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
- err = ntfs_parse_options(sb, data, silent, &sbi->options);
- if (err)
+ sbi->options->nls = ntfs_load_nls(sbi->options->nls_name);
+ if (IS_ERR(sbi->options->nls)) {
+ sbi->options->nls = NULL;
+ errorf(fc, "Cannot load nls %s", sbi->options->nls_name);
+ err = -EINVAL;
goto out;
+ }
- if (!rq || !blk_queue_discard(rq) || !rq->limits.discard_granularity) {
- ;
- } else {
+ rq = bdev_get_queue(bdev);
+ if (blk_queue_discard(rq) && rq->limits.discard_granularity) {
sbi->discard_granularity = rq->limits.discard_granularity;
sbi->discard_granularity_mask_inv =
~(u64)(sbi->discard_granularity - 1);
}
- sb_set_blocksize(sb, PAGE_SIZE);
-
/* Parse boot. */
err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
- bd_inode->i_size);
+ bdev_nr_bytes(bdev));
if (err)
goto out;
-#ifdef CONFIG_NTFS3_64BIT_CLUSTER
- sb->s_maxbytes = MAX_LFS_FILESIZE;
-#else
- sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
-#endif
-
- mutex_init(&sbi->compress.mtx_lznt);
-#ifdef CONFIG_NTFS3_LZX_XPRESS
- mutex_init(&sbi->compress.mtx_xpress);
- mutex_init(&sbi->compress.mtx_lzx);
-#endif
-
/*
* Load $Volume. This should be done before $LogFile
* 'cause 'sbi->volume.ni' is used 'ntfs_set_state'.
@@ -991,9 +933,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_VOL);
inode = ntfs_iget5(sb, &ref, &NAME_VOLUME);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $Volume.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1015,36 +956,33 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
} else {
/* Should we break mounting here? */
//err = -EINVAL;
- //goto out;
+ //goto put_inode_out;
}
attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL);
if (!attr || is_attr_ext(attr)) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO);
if (!info) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
sbi->volume.major_ver = info->major_ver;
sbi->volume.minor_ver = info->minor_ver;
sbi->volume.flags = info->flags;
-
sbi->volume.ni = ni;
- inode = NULL;
/* Load $MFTMirr to estimate recs_mirr. */
ref.low = cpu_to_le32(MFT_REC_MIRR);
ref.seq = cpu_to_le16(MFT_REC_MIRR);
inode = ntfs_iget5(sb, &ref, &NAME_MIRROR);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $MFTMirr.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1058,9 +996,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_LOG);
inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load \x24LogFile.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1068,22 +1005,19 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
err = ntfs_loadlog_and_replay(ni, sbi);
if (err)
- goto out;
+ goto put_inode_out;
iput(inode);
- inode = NULL;
-
- is_ro = sb_rdonly(sbi->sb);
if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) {
- if (!is_ro) {
+ if (!sb_rdonly(sb)) {
ntfs_warn(sb,
"failed to replay log file. Can't mount rw!");
err = -EINVAL;
goto out;
}
} else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) {
- if (!is_ro && !sbi->options.force) {
+ if (!sb_rdonly(sb) && !sbi->options->force) {
ntfs_warn(
sb,
"volume is dirty and \"force\" flag is not set!");
@@ -1098,9 +1032,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
inode = ntfs_iget5(sb, &ref, &NAME_MFT);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $MFT.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1112,11 +1045,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
err = wnd_init(&sbi->mft.bitmap, sb, tt);
if (err)
- goto out;
+ goto put_inode_out;
err = ni_load_all_mi(ni);
if (err)
- goto out;
+ goto put_inode_out;
sbi->mft.ni = ni;
@@ -1125,9 +1058,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $BadClus.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
@@ -1150,18 +1082,15 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_BITMAP);
inode = ntfs_iget5(sb, &ref, &NAME_BITMAP);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $Bitmap.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
- ni = ntfs_i(inode);
-
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
if (inode->i_size >> 32) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
#endif
@@ -1169,14 +1098,14 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
tt = sbi->used.bitmap.nbits;
if (inode->i_size < bitmap_size(tt)) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
/* Not necessary. */
sbi->used.bitmap.set_tail = true;
- err = wnd_init(&sbi->used.bitmap, sbi->sb, tt);
+ err = wnd_init(&sbi->used.bitmap, sb, tt);
if (err)
- goto out;
+ goto put_inode_out;
iput(inode);
@@ -1188,23 +1117,22 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
/* Load $AttrDef. */
ref.low = cpu_to_le32(MFT_REC_ATTR);
ref.seq = cpu_to_le16(MFT_REC_ATTR);
- inode = ntfs_iget5(sbi->sb, &ref, &NAME_ATTRDEF);
+ inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load $AttrDef -> %d", err);
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
bytes = inode->i_size;
sbi->def_table = t = kmalloc(bytes, GFP_NOFS);
if (!t) {
err = -ENOMEM;
- goto out;
+ goto put_inode_out;
}
for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) {
@@ -1213,7 +1141,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
if (IS_ERR(page)) {
err = PTR_ERR(page);
- goto out;
+ goto put_inode_out;
}
memcpy(Add2Ptr(t, done), page_address(page),
min(PAGE_SIZE, tail));
@@ -1221,7 +1149,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
if (!idx && ATTR_STD != t->type) {
err = -EINVAL;
- goto out;
+ goto put_inode_out;
}
}
@@ -1254,33 +1182,24 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ref.seq = cpu_to_le16(MFT_REC_UPCASE);
inode = ntfs_iget5(sb, &ref, &NAME_UPCASE);
if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $UpCase.");
err = PTR_ERR(inode);
- ntfs_err(sb, "Failed to load \x24LogFile.");
- inode = NULL;
goto out;
}
- ni = ntfs_i(inode);
-
if (inode->i_size != 0x10000 * sizeof(short)) {
err = -EINVAL;
- goto out;
- }
-
- sbi->upcase = upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
- if (!upcase) {
- err = -ENOMEM;
- goto out;
+ goto put_inode_out;
}
for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) {
const __le16 *src;
- u16 *dst = Add2Ptr(upcase, idx << PAGE_SHIFT);
+ u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT);
struct page *page = ntfs_map_page(inode->i_mapping, idx);
if (IS_ERR(page)) {
err = PTR_ERR(page);
- goto out;
+ goto put_inode_out;
}
src = page_address(page);
@@ -1294,14 +1213,13 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
ntfs_unmap_page(page);
}
- shared = ntfs_set_shared(upcase, 0x10000 * sizeof(short));
- if (shared && upcase != shared) {
+ shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short));
+ if (shared && sbi->upcase != shared) {
+ kvfree(sbi->upcase);
sbi->upcase = shared;
- kvfree(upcase);
}
iput(inode);
- inode = NULL;
if (is_ntfs3(sbi)) {
/* Load $Secure. */
@@ -1331,34 +1249,31 @@ load_root:
ref.seq = cpu_to_le16(MFT_REC_ROOT);
inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
ntfs_err(sb, "Failed to load root.");
- inode = NULL;
+ err = PTR_ERR(inode);
goto out;
}
- ni = ntfs_i(inode);
-
sb->s_root = d_make_root(inode);
-
if (!sb->s_root) {
- err = -EINVAL;
- goto out;
+ err = -ENOMEM;
+ goto put_inode_out;
}
+ fc->fs_private = NULL;
+
return 0;
-out:
+put_inode_out:
iput(inode);
-
- if (sb->s_root) {
- d_drop(sb->s_root);
- sb->s_root = NULL;
- }
-
+out:
+ /*
+ * Free resources here.
+ * ntfs_fs_free will be called with fc->s_fs_info = NULL
+ */
put_ntfs(sbi);
-
sb->s_fs_info = NULL;
+
return err;
}
@@ -1403,7 +1318,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
if (sbi->flags & NTFS_FLAGS_NODISCARD)
return -EOPNOTSUPP;
- if (!sbi->options.discard)
+ if (!sbi->options->discard)
return -EOPNOTSUPP;
lbo = (u64)lcn << sbi->cluster_bits;
@@ -1428,19 +1343,99 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
return err;
}
-static struct dentry *ntfs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int ntfs_fs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, ntfs_fill_super);
+}
+
+/*
+ * ntfs_fs_free - Free fs_context.
+ *
+ * Note that this will be called after fill_super and reconfigure
+ * even when they pass. So they have to take pointers if they pass.
+ */
+static void ntfs_fs_free(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+ struct ntfs_mount_options *opts = fc->fs_private;
+ struct ntfs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi)
+ put_ntfs(sbi);
+
+ if (opts)
+ put_mount_options(opts);
+}
+
+static const struct fs_context_operations ntfs_context_ops = {
+ .parse_param = ntfs_fs_parse_param,
+ .get_tree = ntfs_fs_get_tree,
+ .reconfigure = ntfs_fs_reconfigure,
+ .free = ntfs_fs_free,
+};
+
+/*
+ * ntfs_init_fs_context - Initialize spi and opts
+ *
+ * This will called when mount/remount. We will first initiliaze
+ * options so that if remount we can use just that.
+ */
+static int ntfs_init_fs_context(struct fs_context *fc)
+{
+ struct ntfs_mount_options *opts;
+ struct ntfs_sb_info *sbi;
+
+ opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Default options. */
+ opts->fs_uid = current_uid();
+ opts->fs_gid = current_gid();
+ opts->fs_fmask_inv = ~current_umask();
+ opts->fs_dmask_inv = ~current_umask();
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ goto ok;
+
+ sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS);
+ if (!sbi)
+ goto free_opts;
+
+ sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
+ if (!sbi->upcase)
+ goto free_sbi;
+
+ ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ mutex_init(&sbi->compress.mtx_lznt);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ mutex_init(&sbi->compress.mtx_xpress);
+ mutex_init(&sbi->compress.mtx_lzx);
+#endif
+
+ sbi->options = opts;
+ fc->s_fs_info = sbi;
+ok:
+ fc->fs_private = opts;
+ fc->ops = &ntfs_context_ops;
+
+ return 0;
+free_sbi:
+ kfree(sbi);
+free_opts:
+ kfree(opts);
+ return -ENOMEM;
}
// clang-format off
static struct file_system_type ntfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "ntfs3",
- .mount = ntfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .owner = THIS_MODULE,
+ .name = "ntfs3",
+ .init_fs_context = ntfs_init_fs_context,
+ .parameters = ntfs_fs_parameters,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
// clang-format on
diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c
index bbeba778237e..b5e8256fd710 100644
--- a/fs/ntfs3/upcase.c
+++ b/fs/ntfs3/upcase.c
@@ -5,13 +5,9 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/module.h>
-#include <linux/nls.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
-#include "debug.h"
-#include "ntfs.h"
#include "ntfs_fs.h"
static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr)
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 7282d85c4ece..afd0ddad826f 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -5,10 +5,7 @@
*
*/
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/fs.h>
-#include <linux/nls.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
@@ -78,6 +75,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
size_t add_bytes, const struct EA_INFO **info)
{
int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTR_LIST_ENTRY *le = NULL;
struct ATTRIB *attr_info, *attr_ea;
void *ea_p;
@@ -102,10 +100,10 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
/* Check Ea limit. */
size = le32_to_cpu((*info)->size);
- if (size > ni->mi.sbi->ea_max_size)
+ if (size > sbi->ea_max_size)
return -EFBIG;
- if (attr_size(attr_ea) > ni->mi.sbi->ea_max_size)
+ if (attr_size(attr_ea) > sbi->ea_max_size)
return -EFBIG;
/* Allocate memory for packed Ea. */
@@ -113,15 +111,16 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
if (!ea_p)
return -ENOMEM;
- if (attr_ea->non_res) {
+ if (!size) {
+ ;
+ } else if (attr_ea->non_res) {
struct runs_tree run;
run_init(&run);
err = attr_load_runs(attr_ea, ni, &run, NULL);
if (!err)
- err = ntfs_read_run_nb(ni->mi.sbi, &run, 0, ea_p, size,
- NULL);
+ err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL);
run_close(&run);
if (err)
@@ -260,7 +259,7 @@ out:
static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_t name_len, const void *value,
- size_t val_size, int flags, int locked)
+ size_t val_size, int flags)
{
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -279,8 +278,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
u64 new_sz;
void *p;
- if (!locked)
- ni_lock(ni);
+ ni_lock(ni);
run_init(&ea_run);
@@ -370,21 +368,22 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
new_ea->name[name_len] = 0;
memcpy(new_ea->name + name_len + 1, value, val_size);
new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea);
-
- /* Should fit into 16 bits. */
- if (new_pack > 0xffff) {
- err = -EFBIG; // -EINVAL?
- goto out;
- }
ea_info.size_pack = cpu_to_le16(new_pack);
-
/* New size of ATTR_EA. */
size += add;
- if (size > sbi->ea_max_size) {
+ ea_info.size = cpu_to_le32(size);
+
+ /*
+ * 1. Check ea_info.size_pack for overflow.
+ * 2. New attibute size must fit value from $AttrDef
+ */
+ if (new_pack > 0xffff || size > sbi->ea_max_size) {
+ ntfs_inode_warn(
+ inode,
+ "The size of extended attributes must not exceed 64KiB");
err = -EFBIG; // -EINVAL?
goto out;
}
- ea_info.size = cpu_to_le32(size);
update_ea:
@@ -444,7 +443,7 @@ update_ea:
/* Delete xattr, ATTR_EA */
ni_remove_attr_le(ni, attr, mi, le);
} else if (attr->non_res) {
- err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size);
+ err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0);
if (err)
goto out;
} else {
@@ -468,8 +467,7 @@ update_ea:
mark_inode_dirty(&ni->vfs_inode);
out:
- if (!locked)
- ni_unlock(ni);
+ ni_unlock(ni);
run_close(&ea_run);
kfree(ea_all);
@@ -478,12 +476,6 @@ out:
}
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-static inline void ntfs_posix_acl_release(struct posix_acl *acl)
-{
- if (acl && refcount_dec_and_test(&acl->a_refcount))
- kfree(acl);
-}
-
static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
struct inode *inode, int type,
int locked)
@@ -521,12 +513,15 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
/* Translate extended attribute to acl. */
if (err >= 0) {
acl = posix_acl_from_xattr(mnt_userns, buf, err);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
+ } else if (err == -ENODATA) {
+ acl = NULL;
} else {
- acl = err == -ENODATA ? NULL : ERR_PTR(err);
+ acl = ERR_PTR(err);
}
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
__putname(buf);
return acl;
@@ -546,12 +541,13 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
struct inode *inode, struct posix_acl *acl,
- int type, int locked)
+ int type)
{
const char *name;
size_t size, name_len;
void *value = NULL;
int err = 0;
+ int flags;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -561,22 +557,15 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
if (acl) {
umode_t mode = inode->i_mode;
- err = posix_acl_equiv_mode(acl, &mode);
- if (err < 0)
- return err;
+ err = posix_acl_update_mode(mnt_userns, inode, &mode,
+ &acl);
+ if (err)
+ goto out;
if (inode->i_mode != mode) {
inode->i_mode = mode;
mark_inode_dirty(inode);
}
-
- if (!err) {
- /*
- * ACL can be exactly represented in the
- * traditional file mode permission bits.
- */
- acl = NULL;
- }
}
name = XATTR_NAME_POSIX_ACL_ACCESS;
name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1;
@@ -594,20 +583,24 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
}
if (!acl) {
+ /* Remove xattr if it can be presented via mode. */
size = 0;
value = NULL;
+ flags = XATTR_REPLACE;
} else {
size = posix_acl_xattr_size(acl->a_count);
value = kmalloc(size, GFP_NOFS);
if (!value)
return -ENOMEM;
-
err = posix_acl_to_xattr(mnt_userns, acl, value, size);
if (err < 0)
goto out;
+ flags = 0;
}
- err = ntfs_set_ea(inode, name, name_len, value, size, 0, locked);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags);
+ if (err == -ENODATA && !size)
+ err = 0; /* Removing non existed xattr. */
if (!err)
set_cached_acl(inode, type, acl);
@@ -623,68 +616,7 @@ out:
int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type)
{
- return ntfs_set_acl_ex(mnt_userns, inode, acl, type, 0);
-}
-
-static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns,
- struct inode *inode, int type, void *buffer,
- size_t size)
-{
- struct posix_acl *acl;
- int err;
-
- if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
- ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
- return -EOPNOTSUPP;
- }
-
- acl = ntfs_get_acl(inode, type, false);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (!acl)
- return -ENODATA;
-
- err = posix_acl_to_xattr(mnt_userns, acl, buffer, size);
- ntfs_posix_acl_release(acl);
-
- return err;
-}
-
-static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, int type, const void *value,
- size_t size)
-{
- struct posix_acl *acl;
- int err;
-
- if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
- ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
- return -EOPNOTSUPP;
- }
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EPERM;
-
- if (!value) {
- acl = NULL;
- } else {
- acl = posix_acl_from_xattr(mnt_userns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (acl) {
- err = posix_acl_valid(mnt_userns, acl);
- if (err)
- goto release_and_out;
- }
- }
-
- err = ntfs_set_acl(mnt_userns, inode, acl, type);
-
-release_and_out:
- ntfs_posix_acl_release(acl);
- return err;
+ return ntfs_set_acl_ex(mnt_userns, inode, acl, type);
}
/*
@@ -698,54 +630,27 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *default_acl, *acl;
int err;
- /*
- * TODO: Refactoring lock.
- * ni_lock(dir) ... -> posix_acl_create(dir,...) -> ntfs_get_acl -> ni_lock(dir)
- */
- inode->i_default_acl = NULL;
-
- default_acl = ntfs_get_acl_ex(mnt_userns, dir, ACL_TYPE_DEFAULT, 1);
-
- if (!default_acl || default_acl == ERR_PTR(-EOPNOTSUPP)) {
- inode->i_mode &= ~current_umask();
- err = 0;
- goto out;
- }
-
- if (IS_ERR(default_acl)) {
- err = PTR_ERR(default_acl);
- goto out;
- }
-
- acl = default_acl;
- err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (err < 0)
- goto out1;
- if (!err) {
- posix_acl_release(acl);
- acl = NULL;
- }
+ err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (err)
+ return err;
- if (!S_ISDIR(inode->i_mode)) {
+ if (default_acl) {
+ err = ntfs_set_acl_ex(mnt_userns, inode, default_acl,
+ ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
- default_acl = NULL;
+ } else {
+ inode->i_default_acl = NULL;
}
- if (default_acl)
- err = ntfs_set_acl_ex(mnt_userns, inode, default_acl,
- ACL_TYPE_DEFAULT, 1);
-
if (!acl)
inode->i_acl = NULL;
- else if (!err)
- err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS,
- 1);
-
- posix_acl_release(acl);
-out1:
- posix_acl_release(default_acl);
+ else {
+ if (!err)
+ err = ntfs_set_acl_ex(mnt_userns, inode, acl,
+ ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
-out:
return err;
}
#endif
@@ -772,7 +677,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
int mask)
{
- if (ntfs_sb(inode->i_sb)->options.no_acs_rules) {
+ if (ntfs_sb(inode->i_sb)->options->noacsrules) {
/* "No access rules" mode - Allow all changes. */
return 0;
}
@@ -880,23 +785,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
goto out;
}
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
- sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
- (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
- sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
- /* TODO: init_user_ns? */
- err = ntfs_xattr_get_acl(
- &init_user_ns, inode,
- name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
- ? ACL_TYPE_ACCESS
- : ACL_TYPE_DEFAULT,
- buffer, size);
- goto out;
- }
-#endif
/* Deal with NTFS extended attribute. */
err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL);
@@ -1009,24 +897,8 @@ set_new_fa:
goto out;
}
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
- sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
- (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
- !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
- sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
- err = ntfs_xattr_set_acl(
- mnt_userns, inode,
- name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
- ? ACL_TYPE_ACCESS
- : ACL_TYPE_DEFAULT,
- value, size);
- goto out;
- }
-#endif
/* Deal with NTFS extended attribute. */
- err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags);
out:
return err;
@@ -1042,28 +914,29 @@ int ntfs_save_wsl_perm(struct inode *inode)
int err;
__le32 value;
+ /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */
value = cpu_to_le32(i_uid_read(inode));
err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
value = cpu_to_le32(i_gid_read(inode));
err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
value = cpu_to_le32(inode->i_mode);
err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
value = cpu_to_le32(inode->i_rdev);
err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value,
- sizeof(value), 0, 0);
+ sizeof(value), 0);
if (err)
goto out;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f1cc8258d34a..bb247bc349e4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
data_alloc_bh, start_blk,
num_clusters);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
}
/*
- * Zero the area past i_size but still within an allocated
- * cluster. This avoids exposing nonzero data on subsequent file
- * extends.
+ * Zero partial cluster for a hole punch or truncate. This avoids exposing
+ * nonzero data on subsequent file extends.
*
* We need to call this before i_size is updated on the inode because
* otherwise block_write_full_page() will skip writeout of pages past
- * i_size. The new_i_size parameter is passed for this reason.
+ * i_size.
*/
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end)
@@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
return 0;
+ /*
+ * Avoid zeroing pages fully beyond current i_size. It is pointless as
+ * underlying blocks of those pages should be already zeroed out and
+ * page writeback will skip them anyway.
+ */
+ range_end = min_t(u64, range_end, i_size_read(inode));
+ if (range_start >= range_end)
+ return 0;
+
pages = kcalloc(ocfs2_pages_per_cluster(sb),
sizeof(struct page *), GFP_NOFS);
if (pages == NULL) {
@@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
goto out;
}
- if (range_start == range_end)
- goto out;
-
ret = ocfs2_extent_map_get_blocks(inode,
range_start >> sb->s_blocksize_bits,
&phys, NULL, &ext_flags);
@@ -7045,7 +7052,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
- int ret, i, has_data, num_pages = 0;
+ int ret, has_data, num_pages = 0;
int need_free = 0;
u32 bit_off, num;
handle_t *handle;
@@ -7054,26 +7061,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_alloc_context *data_ac = NULL;
- struct page **pages = NULL;
- loff_t end = osb->s_clustersize;
+ struct page *page = NULL;
struct ocfs2_extent_tree et;
int did_quota = 0;
has_data = i_size_read(inode) ? 1 : 0;
if (has_data) {
- pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
- sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- ret = -ENOMEM;
- mlog_errno(ret);
- return ret;
- }
-
ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (ret) {
mlog_errno(ret);
- goto free_pages;
+ goto out;
}
}
@@ -7093,7 +7091,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- unsigned int page_end;
+ unsigned int page_end = min_t(unsigned, PAGE_SIZE,
+ osb->s_clustersize);
u64 phys;
ret = dquot_alloc_space_nodirty(inode,
@@ -7117,15 +7116,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
*/
block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- /*
- * Non sparse file systems zero on extend, so no need
- * to do that now.
- */
- if (!ocfs2_sparse_alloc(osb) &&
- PAGE_SIZE < osb->s_clustersize)
- end = PAGE_SIZE;
-
- ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+ ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page,
+ &num_pages);
if (ret) {
mlog_errno(ret);
need_free = 1;
@@ -7136,20 +7128,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* This should populate the 1st page for us and mark
* it up to date.
*/
- ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+ ret = ocfs2_read_inline_data(inode, page, di_bh);
if (ret) {
mlog_errno(ret);
need_free = 1;
goto out_unlock;
}
- page_end = PAGE_SIZE;
- if (PAGE_SIZE > osb->s_clustersize)
- page_end = osb->s_clustersize;
-
- for (i = 0; i < num_pages; i++)
- ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
- pages[i], i > 0, &phys);
+ ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0,
+ &phys);
}
spin_lock(&oi->ip_lock);
@@ -7180,8 +7167,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
out_unlock:
- if (pages)
- ocfs2_unlock_and_free_pages(pages, num_pages);
+ if (page)
+ ocfs2_unlock_and_free_pages(&page, num_pages);
out_commit:
if (ret < 0 && did_quota)
@@ -7205,8 +7192,6 @@ out_commit:
out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
-free_pages:
- kfree(pages);
return ret;
}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0e7aad1b11cc..5cd5f7511dac 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
continue;
}
retry:
- ret = -EINVAL;
mlog(0, "attempting to send begin reco msg to %d\n",
nodenum);
ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 359524b7341f..801e60bab955 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3951,7 +3951,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
oi = OCFS2_I(inode);
oi->ip_dir_lock_gen++;
mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
- goto out;
+ goto out_forget;
}
if (!S_ISREG(inode->i_mode))
@@ -3982,6 +3982,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
+out_forget:
forget_all_cached_acls(inode);
out:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 54d7843c0211..fc5f780fa235 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode,
* greater than page size, so we have to truncate them
* anyway.
*/
- unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(inode->i_mapping, new_i_size);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ unmap_mapping_range(inode->i_mapping,
+ new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
i_size_read(inode), 1);
if (status)
@@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
+
status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index bc8f32fab964..6c2411c2afcf 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
- journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
sysfile_type);
@@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
* part of the transaction - the inode could have been reclaimed and
* now it is reread from disk.
*/
- if (journal) {
+ if (osb->journal) {
transaction_t *transaction;
tid_t tid;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 329986f12db3..dbf9b9e97d74 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
write_unlock(&journal->j_state_lock);
}
-int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
{
int status = -1;
struct inode *inode = NULL; /* the journal inode */
journal_t *j_journal = NULL;
+ struct ocfs2_journal *journal = NULL;
struct ocfs2_dinode *di = NULL;
struct buffer_head *bh = NULL;
- struct ocfs2_super *osb;
int inode_lock = 0;
- BUG_ON(!journal);
+ /* initialize our journal structure */
+ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
+ if (!journal) {
+ mlog(ML_ERROR, "unable to alloc journal\n");
+ status = -ENOMEM;
+ goto done;
+ }
+ osb->journal = journal;
+ journal->j_osb = osb;
- osb = journal->j_osb;
+ atomic_set(&journal->j_num_trans, 0);
+ init_rwsem(&journal->j_trans_barrier);
+ init_waitqueue_head(&journal->j_checkpointed);
+ spin_lock_init(&journal->j_lock);
+ journal->j_trans_id = 1UL;
+ INIT_LIST_HEAD(&journal->j_la_cleanups);
+ INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
+ journal->j_state = OCFS2_JOURNAL_FREE;
/* already have the inode for our journal */
inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
@@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
journal->j_state = OCFS2_JOURNAL_FREE;
-// up_write(&journal->j_trans_barrier);
done:
iput(inode);
+ kfree(journal);
+ osb->journal = NULL;
}
static void ocfs2_clear_journal_error(struct super_block *sb,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d158acb8b38a..8dcb2f2cadbc 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
-int ocfs2_journal_init(struct ocfs2_journal *journal,
- int *dirty);
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8521942f5af2..481017e1dac5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct journal_head *jh;
- int ret;
+ int ret = 1;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
@@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
if (!buffer_jbd(bg_bh))
return 1;
- jh = bh2jh(bg_bh);
- spin_lock(&jh->b_state_lock);
- bg = (struct ocfs2_group_desc *) jh->b_committed_data;
- if (bg)
- ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
- else
- ret = 1;
- spin_unlock(&jh->b_state_lock);
+ jbd_lock_bh_journal_head(bg_bh);
+ if (buffer_jbd(bg_bh)) {
+ jh = bh2jh(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+ if (bg)
+ ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ else
+ ret = 1;
+ spin_unlock(&jh->b_state_lock);
+ }
+ jbd_unlock_bh_journal_head(bg_bh);
return ret;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c86bd4e60e20..1286b88b6fa1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
- ocfs2_journal_shutdown(osb);
-
ocfs2_sync_blockdev(sb);
ocfs2_purge_refcount_trees(osb);
@@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_release_system_inodes(osb);
+ ocfs2_journal_shutdown(osb);
+
/*
* If we're dismounting due to mount error, mount.ocfs2 will clean
* up heartbeat. If we're a local mount, there is no heartbeat.
@@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
- struct ocfs2_journal *journal;
struct ocfs2_super *osb;
u64 total_blocks;
@@ -2167,11 +2166,17 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
if (ocfs2_clusterinfo_valid(osb)) {
+ /*
+ * ci_stack and ci_cluster in ocfs2_cluster_info may not be null
+ * terminated, so make sure no overflow happens here by using
+ * memcpy. Destination strings will always be null terminated
+ * because osb is allocated using kzalloc.
+ */
osb->osb_stackflags =
OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
- strlcpy(osb->osb_cluster_stack,
+ memcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN + 1);
+ OCFS2_STACK_LABEL_LEN);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2180,9 +2185,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
- strlcpy(osb->osb_cluster_name,
+ memcpy(osb->osb_cluster_name,
OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
- OCFS2_CLUSTER_NAME_LEN + 1);
+ OCFS2_CLUSTER_NAME_LEN);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
@@ -2191,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
get_random_bytes(&osb->s_next_generation, sizeof(u32));
- /* FIXME
- * This should be done in ocfs2_journal_init(), but unknown
- * ordering issues will cause the filesystem to crash.
- * If anyone wants to figure out what part of the code
- * refers to osb->journal before ocfs2_journal_init() is run,
- * be my guest.
- */
- /* initialize our journal structure */
-
- journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
- if (!journal) {
- mlog(ML_ERROR, "unable to alloc journal\n");
- status = -ENOMEM;
- goto bail;
- }
- osb->journal = journal;
- journal->j_osb = osb;
-
- atomic_set(&journal->j_num_trans, 0);
- init_rwsem(&journal->j_trans_barrier);
- init_waitqueue_head(&journal->j_checkpointed);
- spin_lock_init(&journal->j_lock);
- journal->j_trans_id = (unsigned long) 1;
- INIT_LIST_HEAD(&journal->j_la_cleanups);
- INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
- journal->j_state = OCFS2_JOURNAL_FREE;
-
INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
init_llist_head(&osb->dquot_drop_list);
@@ -2398,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves. */
/* Init our journal object. */
- status = ocfs2_journal_init(osb->journal, &dirty);
+ status = ocfs2_journal_init(osb, &dirty);
if (status < 0) {
mlog(ML_ERROR, "Could not initialize journal!\n");
goto finally;
@@ -2507,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->osb_orphan_wipes);
kfree(osb->slot_recovery_generations);
- /* FIXME
- * This belongs in journal shutdown, but because we have to
- * allocate osb->journal at the start of ocfs2_initialize_osb(),
- * we free it here.
- */
- kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
kfree(osb->vol_label);
diff --git a/fs/open.c b/fs/open.c
index daa324606a41..f732fb94600c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -856,8 +856,20 @@ static int do_dentry_open(struct file *f,
* of THPs into the page cache will fail.
*/
smp_mb();
- if (filemap_nr_thps(inode->i_mapping))
- truncate_pagecache(inode, 0);
+ if (filemap_nr_thps(inode->i_mapping)) {
+ struct address_space *mapping = inode->i_mapping;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ /*
+ * unmap_mapping_range just need to be called once
+ * here, because the private pages is not need to be
+ * unmapped mapping (e.g. data segment of dynamic
+ * shared libraries here).
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+ truncate_inode_pages(mapping, 0);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
return 0;
@@ -1248,6 +1260,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
if (err)
return err;
+ audit_openat2_how(&tmp);
+
/* O_LARGEFILE is only allowed for non-O_PATH. */
if (!(tmp.flags & O_PATH) && force_o_largefile())
tmp.flags |= O_LARGEFILE;
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index fe484cf93e5c..8bbe9486e3a6 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -26,8 +26,10 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
- if (!new_op)
+ if (!new_op) {
+ ret = -ENOMEM;
goto out_put_parent;
+ }
new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
new_op->upcall.req.lookup.parent_refn = parent->refn;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index c1bb4c4b5d67..e5e3e500ed46 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -10,7 +10,7 @@
* Linux VFS inode operations.
*/
-#include <linux/bvec.h>
+#include <linux/blkdev.h>
#include <linux/fileattr.h>
#include "protocol.h"
#include "orangefs-kernel.h"
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 2f2e430461b2..d90d8addbfc2 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -11,6 +11,7 @@
#include <linux/parser.h>
#include <linux/hashtable.h>
+#include <linux/seq_file.h>
/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
static struct kmem_cache *orangefs_inode_cache;
@@ -475,7 +476,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
const char *devname,
void *data)
{
- int ret = -EINVAL;
+ int ret;
struct super_block *sb = ERR_PTR(-EINVAL);
struct orangefs_kernel_op_s *new_op;
struct dentry *d = ERR_PTR(-EINVAL);
@@ -526,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
if (!ORANGEFS_SB(sb)) {
d = ERR_PTR(-ENOMEM);
- goto free_op;
+ goto free_sb_and_op;
}
ret = orangefs_fill_sb(sb,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 4e7d5bfa2949..b193d08a3dc3 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -140,12 +140,14 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
int err;
err = ovl_real_fileattr_get(old, &oldfa);
- if (err)
- return err;
-
- err = ovl_real_fileattr_get(new, &newfa);
- if (err)
+ if (err) {
+ /* Ntfs-3g returns -EINVAL for "no fileattr support" */
+ if (err == -ENOTTY || err == -EINVAL)
+ return 0;
+ pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
+ old, err);
return err;
+ }
/*
* We cannot set immutable and append-only flags on upper inode,
@@ -159,6 +161,17 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
return err;
}
+ /* Don't bother copying flags if none are set */
+ if (!(oldfa.flags & OVL_COPY_FS_FLAGS_MASK))
+ return 0;
+
+ err = ovl_real_fileattr_get(new, &newfa);
+ if (err) {
+ pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n",
+ new, err);
+ return err;
+ }
+
BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL);
newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK;
newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 1fefb2b8960e..f18490813170 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -137,8 +137,7 @@ kill_whiteout:
goto out;
}
-static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry,
- umode_t mode)
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
{
int err;
struct dentry *d, *dentry = *newdentry;
@@ -1219,9 +1218,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
goto out_dput;
}
} else {
- if (!d_is_negative(newdentry) &&
- (!new_opaque || !ovl_is_whiteout(newdentry)))
- goto out_dput;
+ if (!d_is_negative(newdentry)) {
+ if (!new_opaque || !ovl_is_whiteout(newdentry))
+ goto out_dput;
+ } else {
+ if (flags & RENAME_EXCHANGE)
+ goto out_dput;
+ }
}
if (olddentry == trap)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index d081faa55e83..fa125feed0ff 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -17,6 +17,7 @@
struct ovl_aio_req {
struct kiocb iocb;
+ refcount_t ref;
struct kiocb *orig_iocb;
struct fd fd;
};
@@ -252,6 +253,14 @@ static rwf_t ovl_iocb_to_rwf(int ifl)
return flags;
}
+static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
+{
+ if (refcount_dec_and_test(&aio_req->ref)) {
+ fdput(aio_req->fd);
+ kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ }
+}
+
static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
{
struct kiocb *iocb = &aio_req->iocb;
@@ -268,18 +277,17 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
}
orig_iocb->ki_pos = iocb->ki_pos;
- fdput(aio_req->fd);
- kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ ovl_aio_put(aio_req);
}
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
{
struct ovl_aio_req *aio_req = container_of(iocb,
struct ovl_aio_req, iocb);
struct kiocb *orig_iocb = aio_req->orig_iocb;
ovl_aio_cleanup_handler(aio_req);
- orig_iocb->ki_complete(orig_iocb, res, res2);
+ orig_iocb->ki_complete(orig_iocb, res);
}
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -296,6 +304,12 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
return ret;
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ (!real.file->f_mapping->a_ops ||
+ !real.file->f_mapping->a_ops->direct_IO))
+ goto out_fdput;
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
@@ -313,14 +327,16 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
aio_req->orig_iocb = iocb;
kiocb_clone(&aio_req->iocb, iocb, real.file);
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
ovl_aio_cleanup_handler(aio_req);
}
out:
revert_creds(old_cred);
ovl_file_accessed(file);
-
+out_fdput:
fdput(real);
return ret;
@@ -349,6 +365,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
goto out_unlock;
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ (!real.file->f_mapping->a_ops ||
+ !real.file->f_mapping->a_ops->direct_IO))
+ goto out_fdput;
+
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
@@ -378,12 +400,15 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
kiocb_clone(&aio_req->iocb, iocb, real.file);
aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
ovl_aio_cleanup_handler(aio_req);
}
out:
revert_creds(old_cred);
+out_fdput:
fdput(real);
out_unlock:
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 832b17589733..1f36158c7dbe 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -610,7 +610,10 @@ int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
if (err)
return err;
- return vfs_fileattr_get(realpath->dentry, fa);
+ err = vfs_fileattr_get(realpath->dentry, fa);
+ if (err == -ENOIOCTLCMD)
+ err = -ENOTTY;
+ return err;
}
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3894f3347955..2cd5741c873b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -570,6 +570,7 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 178daa5e82c9..265181c110ae 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -787,10 +787,14 @@ retry:
goto retry;
}
- work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode));
- err = PTR_ERR(work);
- if (IS_ERR(work))
- goto out_err;
+ err = ovl_mkdir_real(dir, &work, attr.ia_mode);
+ if (err)
+ goto out_dput;
+
+ /* Weird filesystem returning with hashed negative (kernfs)? */
+ err = -EINVAL;
+ if (d_really_is_negative(work))
+ goto out_dput;
/*
* Try to remove POSIX ACL xattrs from workdir. We are good if:
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index f5c25f580dd9..9323a854a60a 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type)
* to just call ->get_acl to fetch the ACL ourself. (This is going to
* be an unlikely race.)
*/
- if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
- /* fall through */ ;
+ cmpxchg(p, ACL_NOT_CACHED, sentinel);
/*
* Normally, the ACL returned by ->get_acl will be cached.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49be8c8ef555..ff869a66b34e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
cpumask_pr_args(&task->cpus_mask));
}
-static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
{
- seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
seq_putc(m, '\n');
}
@@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
if (mm) {
task_mem(m, mm);
- task_core_dumping(m, mm);
+ task_core_dumping(m, task);
task_thp_status(m, mm);
mmput(mm);
}
@@ -541,7 +541,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
if (permitted && (!whole || num_threads < 2))
- wchan = get_wchan(task);
+ wchan = !task_is_running(task);
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
@@ -606,10 +606,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
*
* This works with older implementations of procps as well.
*/
- if (wchan)
- seq_puts(m, " 1");
- else
- seq_puts(m, " 0");
+ seq_put_decimal_ull(m, " ", wchan);
seq_put_decimal_ull(m, " ", 0);
seq_put_decimal_ull(m, " ", 0);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 533d5836eb9a..13eda8de2998 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,6 +67,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
@@ -386,17 +387,19 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
unsigned long wchan;
+ char symname[KSYM_NAME_LEN];
- if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- wchan = get_wchan(task);
- else
- wchan = 0;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto print0;
- if (wchan)
- seq_printf(m, "%ps", (void *) wchan);
- else
- seq_putc(m, '0');
+ wchan = get_wchan(task);
+ if (wchan && !lookup_symbol_name(wchan, symname)) {
+ seq_puts(m, symname);
+ return 0;
+ }
+print0:
+ seq_putc(m, '0');
return 0;
}
#endif /* CONFIG_KALLSYMS */
@@ -1979,19 +1982,21 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
+ int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- inode = d_inode(dentry);
- task = get_proc_task(inode);
+ rcu_read_lock();
+ inode = d_inode_rcu(dentry);
+ if (!inode)
+ goto out;
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
pid_update_inode(task, inode);
- put_task_struct(task);
- return 1;
+ ret = 1;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static inline bool proc_inode_is_dead(struct inode *inode)
@@ -3799,7 +3804,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
task = next_tid(task), ctx->pos++) {
char name[10 + 1];
unsigned int len;
+
tid = task_pid_nr_ns(task, ns);
+ if (!tid)
+ continue; /* The task has just exited. */
len = snprintf(name, sizeof(name), "%u", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6561a06ef905..4fb8729a68d4 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -24,7 +24,7 @@
#ifdef arch_idle_time
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
@@ -46,7 +46,7 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
#else
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cf25be3e0321..ad667dbc96f5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,7 +397,6 @@ struct mem_size_stats {
u64 pss_shmem;
u64 pss_locked;
u64 swap_pss;
- bool check_shmem_swap;
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
@@ -478,9 +477,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
- mss->swap += shmem_partial_swap_usage(
- walk->vma->vm_file->f_mapping, addr, end);
+ mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+ linear_page_index(vma, addr),
+ linear_page_index(vma, end));
return 0;
}
@@ -488,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
#define smaps_pte_hole NULL
#endif /* CONFIG_SHMEM */
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+ if (walk->ops->pte_hole) {
+ /* depth is not used */
+ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+ }
+#endif
+}
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -516,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_pfn_swap_entry(swpent))
page = pfn_swap_entry_to_page(swpent);
- } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
- && pte_none(*pte))) {
- page = xa_load(&vma->vm_file->f_mapping->i_pages,
- linear_page_index(vma, addr));
- if (xa_is_value(page))
- mss->swap += PAGE_SIZE;
+ } else {
+ smaps_pte_hole_lookup(addr, walk);
return;
}
@@ -735,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
return;
#ifdef CONFIG_SHMEM
- /* In case of smaps_rollup, reset the value from previous vma */
- mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -754,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
!(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
- mss->check_shmem_swap = true;
ops = &smaps_shmem_walk_ops;
}
}
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 5a1b228964fb..deb99bc9b7e6 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -12,18 +12,22 @@ static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec64 uptime;
struct timespec64 idle;
- u64 nsec;
+ u64 idle_nsec;
u32 rem;
int i;
- nsec = 0;
- for_each_possible_cpu(i)
- nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcs;
+
+ kcpustat_cpu_fetch(&kcs, i);
+ idle_nsec += get_idle_time(&kcs, i);
+ }
ktime_get_boottime_ts64(&uptime);
timens_add_boottime(&uptime);
- idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9a15334da208..30a3b66f475a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -26,7 +26,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-/*
- * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
- * The called function has to take care of module refcounting.
- */
-static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+static DECLARE_RWSEM(vmcore_cb_rwsem);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether we had a surprise unregistration of a callback. */
+static bool vmcore_cb_unstable;
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
-int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+void register_vmcore_cb(struct vmcore_cb *cb)
{
- if (oldmem_pfn_is_ram)
- return -EBUSY;
- oldmem_pfn_is_ram = fn;
- return 0;
+ down_write(&vmcore_cb_rwsem);
+ INIT_LIST_HEAD(&cb->next);
+ list_add_tail(&cb->next, &vmcore_cb_list);
+ /*
+ * Registering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., manual driver loading).
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback registration\n");
+ up_write(&vmcore_cb_rwsem);
}
-EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(register_vmcore_cb);
-void unregister_oldmem_pfn_is_ram(void)
+void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- oldmem_pfn_is_ram = NULL;
- wmb();
+ down_write(&vmcore_cb_rwsem);
+ list_del(&cb->next);
+ /*
+ * Unregistering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., forced driver removal), but we cannot stop
+ * unregistering.
+ */
+ if (vmcore_opened) {
+ pr_warn_once("Unexpected vmcore callback unregistration\n");
+ vmcore_cb_unstable = true;
+ }
+ up_write(&vmcore_cb_rwsem);
}
-EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
-static int pfn_is_ram(unsigned long pfn)
+static bool pfn_is_ram(unsigned long pfn)
{
- int (*fn)(unsigned long pfn);
- /* pfn is ram unless fn() checks pagetype */
- int ret = 1;
+ struct vmcore_cb *cb;
+ bool ret = true;
- /*
- * Ask hypervisor if the pfn is really ram.
- * A ballooned page contains no data and reading from such a page
- * will cause high load in the hypervisor.
- */
- fn = oldmem_pfn_is_ram;
- if (fn)
- ret = fn(pfn);
+ lockdep_assert_held_read(&vmcore_cb_rwsem);
+ if (unlikely(vmcore_cb_unstable))
+ return false;
+
+ list_for_each_entry(cb, &vmcore_cb_list, next) {
+ if (unlikely(!cb->pfn_is_ram))
+ continue;
+ ret = cb->pfn_is_ram(cb, pfn);
+ if (!ret)
+ break;
+ }
return ret;
}
+static int open_vmcore(struct inode *inode, struct file *file)
+{
+ down_read(&vmcore_cb_rwsem);
+ vmcore_opened = true;
+ up_read(&vmcore_cb_rwsem);
+
+ return 0;
+}
+
/* Reads a page from the oldmem device from given offset. */
ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf,
@@ -117,6 +146,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
+ down_read(&vmcore_cb_rwsem);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -124,7 +154,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */
- if (pfn_is_ram(pfn) == 0)
+ if (!pfn_is_ram(pfn))
memset(buf, 0, nr_bytes);
else {
if (encrypted)
@@ -136,8 +166,10 @@ ssize_t read_from_oldmem(char *buf, size_t count,
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
- if (tmp < 0)
+ if (tmp < 0) {
+ up_read(&vmcore_cb_rwsem);
return tmp;
+ }
}
*ppos += nr_bytes;
count -= nr_bytes;
@@ -147,6 +179,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = 0;
} while (count);
+ up_read(&vmcore_cb_rwsem);
return read;
}
@@ -177,7 +210,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active());
+ return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -378,7 +411,7 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start,
- userbuf, mem_encrypt_active());
+ userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
buflen -= tsz;
@@ -537,14 +570,19 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
+ int ret;
+
/*
* Check if oldmem_pfn_is_ram was registered to avoid
* looping over all pages without a reason.
*/
- if (oldmem_pfn_is_ram)
- return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+ down_read(&vmcore_cb_rwsem);
+ if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable)
+ ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
- return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ up_read(&vmcore_cb_rwsem);
+ return ret;
}
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
@@ -668,6 +706,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
#endif
static const struct proc_ops vmcore_proc_ops = {
+ .proc_open = open_vmcore,
.proc_read = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index 04ce58c939a0..5d1fbaffd66a 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
static int __register_pstore_blk(struct pstore_device_info *dev,
const char *devpath)
{
- struct inode *inode;
int ret = -ENODEV;
lockdep_assert_held(&pstore_blk_lock);
@@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev,
goto err;
}
- inode = file_inode(psblk_file);
- if (!S_ISBLK(inode->i_mode)) {
+ if (!S_ISBLK(file_inode(psblk_file)->i_mode)) {
pr_err("'%s' is not block device!\n", devpath);
goto err_fput;
}
- inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
- dev->zone.total_size = i_size_read(inode);
+ dev->zone.total_size =
+ bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host));
ret = __register_pstore_device(dev);
if (ret)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index a6ee23aadd28..66645a5a35f3 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -15,13 +15,48 @@
#include <linux/buffer_head.h>
#include "qnx4.h"
+/*
+ * A qnx4 directory entry is an inode entry or link info
+ * depending on the status field in the last byte. The
+ * first byte is where the name start either way, and a
+ * zero means it's empty.
+ *
+ * Also, due to a bug in gcc, we don't want to use the
+ * real (differently sized) name arrays in the inode and
+ * link entries, but always the 'de_name[]' one in the
+ * fake struct entry.
+ *
+ * See
+ *
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6
+ *
+ * for details, but basically gcc will take the size of the
+ * 'name' array from one of the used union entries randomly.
+ *
+ * This use of 'de_name[]' (48 bytes) avoids the false positive
+ * warnings that would happen if gcc decides to use 'inode.di_name'
+ * (16 bytes) even when the pointer and size were to come from
+ * 'link.dl_name' (48 bytes).
+ *
+ * In all cases the actual name pointer itself is the same, it's
+ * only the gcc internal 'what is the size of this field' logic
+ * that can get confused.
+ */
+union qnx4_directory_entry {
+ struct {
+ const char de_name[48];
+ u8 de_pad[15];
+ u8 de_status;
+ };
+ struct qnx4_inode_entry inode;
+ struct qnx4_link_info link;
+};
+
static int qnx4_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
unsigned int offset;
struct buffer_head *bh;
- struct qnx4_inode_entry *de;
- struct qnx4_link_info *le;
unsigned long blknum;
int ix, ino;
int size;
@@ -38,27 +73,27 @@ static int qnx4_readdir(struct file *file, struct dir_context *ctx)
}
ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
+ union qnx4_directory_entry *de;
+
offset = ix * QNX4_DIR_ENTRY_SIZE;
- de = (struct qnx4_inode_entry *) (bh->b_data + offset);
- if (!de->di_fname[0])
+ de = (union qnx4_directory_entry *) (bh->b_data + offset);
+
+ if (!de->de_name[0])
continue;
- if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+ if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
continue;
- if (!(de->di_status & QNX4_FILE_LINK))
- size = QNX4_SHORT_NAME_MAX;
- else
- size = QNX4_NAME_MAX;
- size = strnlen(de->di_fname, size);
- QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
- if (!(de->di_status & QNX4_FILE_LINK))
+ if (!(de->de_status & QNX4_FILE_LINK)) {
+ size = sizeof(de->inode.di_fname);
ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
- else {
- le = (struct qnx4_link_info*)de;
- ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
+ } else {
+ size = sizeof(de->link.dl_fname);
+ ino = ( le32_to_cpu(de->link.dl_inode_blk) - 1 ) *
QNX4_INODES_PER_BLOCK +
- le->dl_inode_ndx;
+ de->link.dl_inode_ndx;
}
- if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
+ size = strnlen(de->de_name, size);
+ QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, name));
+ if (!dir_emit(ctx, de->de_name, size, ino, DT_UNKNOWN)) {
brelse(bh);
return 0;
}
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2bcc9a6f1bfc..052f143e2e0e 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,6 +10,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <asm/current.h>
+#include <linux/blkdev.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/security.h>
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index d3e995e1046f..5f2405994280 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -414,6 +414,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
quota_error(dquot->dq_sb, "Quota structure has offset to "
"other block (%u) than it should (%u)", blk,
(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+ ret = -EIO;
goto out_buf;
}
ret = read_blk(info, blk, buf);
@@ -479,6 +480,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ newblk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
newblk = 0;
@@ -578,6 +586,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
if (!blk) /* No reference? */
goto out_buf;
+ if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ blk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth < info->dqi_qtree_depth - 1)
ret = find_tree_dqentry(info, dquot, blk, depth+1);
else
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 65e7e56005b8..bc66d0173e33 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/seq_file.h>
#include "internal.h"
struct ramfs_mount_opts {
@@ -203,17 +204,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
int opt;
opt = fs_parse(fc, ramfs_fs_parameters, param, &result);
- if (opt < 0) {
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
/*
* We might like to report bad mount options here;
* but traditionally ramfs has ignored all mount options,
* and as it is used as a !CONFIG_SHMEM simple substitute
* for tmpfs, better continue to ignore other mount options.
*/
- if (opt == -ENOPARAM)
- opt = 0;
- return opt;
+ return 0;
}
+ if (opt < 0)
+ return opt;
switch (opt) {
case Opt_mode:
diff --git a/fs/read_write.c b/fs/read_write.c
index af057c57bdc6..0074afa7ecb3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (unlikely((ssize_t) count < 0))
return -EINVAL;
- /*
- * ranged mandatory locking does not apply to streams - it makes sense
- * only for files where position has a meaning.
- */
if (ppos) {
loff_t pos = *ppos;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 58481f8d63d5..82e09901462e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s,
if (!strcmp(arg, "auto")) {
/* From JFS code, to auto-get the size. */
- *blocks =
- i_size_read(s->s_bdev->bd_inode) >> s->
- s_blocksize_bits;
+ *blocks = sb_bdev_nr_blocks(s);
} else {
*blocks = simple_strtoul(arg, &p, 0);
if (*p != '\0') {
@@ -1437,7 +1435,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
unsigned long safe_mask = 0;
unsigned int commit_max_age = (unsigned int)-1;
struct reiserfs_journal *journal = SB_JOURNAL(s);
- char *new_opts;
int err;
char *qf_names[REISERFS_MAXQUOTAS];
unsigned int qfmt = 0;
@@ -1445,10 +1442,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
int i;
#endif
- new_opts = kstrdup(arg, GFP_KERNEL);
- if (arg && !new_opts)
- return -ENOMEM;
-
sync_filesystem(s);
reiserfs_write_lock(s);
@@ -1599,7 +1592,6 @@ out_ok_unlocked:
out_err_unlock:
reiserfs_write_unlock(s);
out_err:
- kfree(new_opts);
return err;
}
@@ -1986,9 +1978,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
* smaller than the filesystem. If the check fails then abort and
* scream, because bad stuff will happen otherwise.
*/
- if (s->s_bdev && s->s_bdev->bd_inode
- && i_size_read(s->s_bdev->bd_inode) <
- sb_block_count(rs) * sb_blocksize(rs)) {
+ if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
SWARN(silent, s, "", "Filesystem cannot be "
"mounted because it is bigger than the device");
SWARN(silent, s, "", "You may need to run fsck "
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4a2cda04d3e2..f8e1f4ee87ff 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -383,22 +383,6 @@ void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
}
EXPORT_SYMBOL(seq_escape_mem);
-/**
- * seq_escape - print string into buffer, escaping some characters
- * @m: target buffer
- * @s: string
- * @esc: set of characters that need escaping
- *
- * Puts string into buffer, replacing each occurrence of character from
- * @esc with usual octal escape.
- * Use seq_has_overflowed() to check for errors.
- */
-void seq_escape(struct seq_file *m, const char *s, const char *esc)
-{
- seq_escape_str(m, s, ESCAPE_OCTAL, esc);
-}
-EXPORT_SYMBOL(seq_escape);
-
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
new file mode 100644
index 000000000000..7ccadcbe684b
--- /dev/null
+++ b/fs/smbfs_common/smb2pdu.h
@@ -0,0 +1,989 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+#ifndef _COMMON_SMB2PDU_H
+#define _COMMON_SMB2PDU_H
+
+/*
+ * Note that, due to trying to use names similar to the protocol specifications,
+ * there are many mixed case field names in the structures below. Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * able to match against the protocol specfication.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (ie no useful data other than the SMB error code itself) and are marked such.
+ * Knowing this helps avoid response buffer allocations and copy in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE 0x0000
+#define SMB2_SESSION_SETUP_HE 0x0001
+#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE 0x0003
+#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE 0x0005
+#define SMB2_CLOSE_HE 0x0006
+#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
+#define SMB2_READ_HE 0x0008
+#define SMB2_WRITE_HE 0x0009
+#define SMB2_LOCK_HE 0x000A
+#define SMB2_IOCTL_HE 0x000B
+#define SMB2_CANCEL_HE 0x000C
+#define SMB2_ECHO_HE 0x000D
+#define SMB2_QUERY_DIRECTORY_HE 0x000E
+#define SMB2_CHANGE_NOTIFY_HE 0x000F
+#define SMB2_QUERY_INFO_HE 0x0010
+#define SMB2_SET_INFO_HE 0x0011
+#define SMB2_OPLOCK_BREAK_HE 0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF)
+
+#define NUMBER_OF_SMB2_COMMANDS 0x0013
+
+/*
+ * SMB2 Header Definition
+ *
+ * "MBZ" : Must be Zero
+ * "BB" : BugBug, Something to check/review/analyze later
+ * "PDU" : "Protocol Data Unit" (ie a network "frame")
+ *
+ */
+
+#define __SMB2_HEADER_STRUCTURE_SIZE 64
+#define SMB2_HEADER_STRUCTURE_SIZE \
+ cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE)
+
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
+#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
+
+/*
+ * SMB2 flag definitions
+ */
+#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */
+#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */
+
+/* See MS-SMB2 section 2.2.1 */
+struct smb2_hdr {
+ __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
+ __le16 StructureSize; /* 64 */
+ __le16 CreditCharge; /* MBZ */
+ __le32 Status; /* Error from server */
+ __le16 Command;
+ __le16 CreditRequest; /* CreditResponse */
+ __le32 Flags;
+ __le32 NextCommand;
+ __le64 MessageId;
+ union {
+ struct {
+ __le32 ProcessId;
+ __le32 TreeId;
+ } __packed SyncId;
+ __le64 AsyncId;
+ } __packed Id;
+ __le64 SessionId;
+ __u8 Signature[16];
+} __packed;
+
+struct smb2_pdu {
+ struct smb2_hdr hdr;
+ __le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+#define SMB3_AES_CCM_NONCE 11
+#define SMB3_AES_GCM_NONCE 12
+
+/* Transform flags (for 3.0 dialect this flag indicates CCM */
+#define TRANSFORM_FLAG_ENCRYPTED 0x0001
+struct smb2_transform_hdr {
+ __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
+ __u8 Signature[16];
+ __u8 Nonce[16];
+ __le32 OriginalMessageSize;
+ __u16 Reserved1;
+ __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
+ __le64 SessionId;
+} __packed;
+
+
+/* See MS-SMB2 2.2.42 */
+struct smb2_compression_transform_hdr_unchained {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+ __le32 OriginalCompressedSegmentSize;
+ __le16 CompressionAlgorithm;
+ __le16 Flags;
+ __le16 Length; /* if chained it is length, else offset */
+} __packed;
+
+/* See MS-SMB2 2.2.42.1 */
+#define SMB2_COMPRESSION_FLAG_NONE 0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
+
+struct compression_payload_header {
+ __le16 CompressionAlgorithm;
+ __le16 Flags;
+ __le32 Length; /* length of compressed playload including field below if present */
+ /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2 */
+struct smb2_compression_transform_hdr_chained {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+ __le32 OriginalCompressedSegmentSize;
+ /* struct compression_payload_header[] */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2.2 */
+struct compression_pattern_payload_v1 {
+ __le16 Pattern;
+ __le16 Reserved1;
+ __le16 Reserved2;
+ __le32 Repetitions;
+} __packed;
+
+/* See MS-SMB2 section 2.2.9.2 */
+/* Context Types */
+#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
+#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
+
+struct tree_connect_contexts {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ __u8 Data[];
+} __packed;
+
+/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
+struct smb3_blob_data {
+ __le16 BlobSize;
+ __u8 BlobData[];
+} __packed;
+
+/* Valid values for Attr */
+#define SE_GROUP_MANDATORY 0x00000001
+#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002
+#define SE_GROUP_ENABLED 0x00000004
+#define SE_GROUP_OWNER 0x00000008
+#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010
+#define SE_GROUP_INTEGRITY 0x00000020
+#define SE_GROUP_INTEGRITY_ENABLED 0x00000040
+#define SE_GROUP_RESOURCE 0x20000000
+#define SE_GROUP_LOGON_ID 0xC0000000
+
+/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
+
+struct sid_array_data {
+ __le16 SidAttrCount;
+ /* SidAttrList - array of sid_attr_data structs */
+} __packed;
+
+struct luid_attr_data {
+
+} __packed;
+
+/*
+ * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
+ * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA
+ */
+
+struct privilege_array_data {
+ __le16 PrivilegeCount;
+ /* array of privilege_data structs */
+} __packed;
+
+struct remoted_identity_tcon_context {
+ __le16 TicketType; /* must be 0x0001 */
+ __le16 TicketSize; /* total size of this struct */
+ __le16 User; /* offset to SID_ATTR_DATA struct with user info */
+ __le16 UserName; /* offset to null terminated Unicode username string */
+ __le16 Domain; /* offset to null terminated Unicode domain name */
+ __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
+ __le16 RestrictedGroups; /* similar to above */
+ __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
+ __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
+ __le16 Owner; /* offset to BLOB_DATA struct */
+ __le16 DefaultDacl; /* offset to BLOB_DATA struct */
+ __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
+ __le16 UserClaims; /* offset to BLOB_DATA struct */
+ __le16 DeviceClaims; /* offset to BLOB_DATA struct */
+ __u8 TicketInfo[]; /* variable length buf - remoted identity data */
+} __packed;
+
+struct smb2_tree_connect_req_extension {
+ __le32 TreeConnectContextOffset;
+ __le16 TreeConnectContextCount;
+ __u8 Reserved[10];
+ __u8 PathName[]; /* variable sized array */
+ /* followed by array of TreeConnectContexts */
+} __packed;
+
+/* Flags/Reserved for SMB3.1.1 */
+#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
+#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
+#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
+
+struct smb2_tree_connect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 Flags; /* Flags in SMB3.1.1 */
+ __le16 PathOffset;
+ __le16 PathLength;
+ __u8 Buffer[1]; /* variable length */
+} __packed;
+
+/* Possible ShareType values */
+#define SMB2_SHARE_TYPE_DISK 0x01
+#define SMB2_SHARE_TYPE_PIPE 0x02
+#define SMB2_SHARE_TYPE_PRINT 0x03
+
+/*
+ * Possible ShareFlags - exactly one and only one of the first 4 caching flags
+ * must be set (any of the remaining, SHI1005, flags may be set individually
+ * or in combination.
+ */
+#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
+#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
+#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
+#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
+#define SHI1005_FLAGS_DFS 0x00000001
+#define SHI1005_FLAGS_DFS_ROOT 0x00000002
+#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
+#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
+#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
+#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
+#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
+#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
+#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
+#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
+#define SHI1005_FLAGS_ALL 0x0014FF33
+
+/* Possible share capabilities */
+#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
+#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
+
+struct smb2_tree_connect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 16 */
+ __u8 ShareType; /* see below */
+ __u8 Reserved;
+ __le32 ShareFlags; /* see below */
+ __le32 Capabilities; /* see below */
+ __le32 MaximalAccess;
+} __packed;
+
+struct smb2_tree_disconnect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_tree_disconnect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_NEGOTIATE_PROTOCOL See MS-SMB2 section 2.2.3
+ */
+/* SecurityMode flags */
+#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
+#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001)
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002)
+#define SMB2_SEC_MODE_FLAGS_ALL 0x0003
+
+/* Capabilities flags */
+#define SMB2_GLOBAL_CAP_DFS 0x00000001
+#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+/* Internal types */
+#define SMB2_NT_FIND 0x00100000
+#define SMB2_LARGE_FILES 0x00200000
+
+#define SMB2_CLIENT_GUID_SIZE 16
+#define SMB2_CREATE_GUID_SIZE 16
+
+/* Dialects */
+#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
+#define SMB20_PROT_ID 0x0202
+#define SMB21_PROT_ID 0x0210
+#define SMB2X_PROT_ID 0x02FF
+#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
+#define SMB311_PROT_ID 0x0311
+#define BAD_PROT_ID 0xFFFF
+
+#define SMB311_SALT_SIZE 32
+/* Hash Algorithm Types */
+#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
+#define SMB2_PREAUTH_HASH_SIZE 64
+
+/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
+#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
+#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
+#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6)
+#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7)
+#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
+
+struct smb2_neg_context {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
+} __packed;
+
+/*
+ * SaltLength that the server send can be zero, so the only three required
+ * fields (all __le16) end up six bytes total, so the minimum context data len
+ * in the response is six bytes which accounts for
+ *
+ * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
+ */
+#define MIN_PREAUTH_CTXT_DATA_LEN 6
+
+struct smb2_preauth_neg_context {
+ __le16 ContextType; /* 1 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 HashAlgorithmCount; /* 1 */
+ __le16 SaltLength;
+ __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+ __u8 Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
+#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
+#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
+
+/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
+#define MIN_ENCRYPT_CTXT_DATA_LEN 4
+struct smb2_encryption_neg_context {
+ __le16 ContextType; /* 2 */
+ __le16 DataLength;
+ __le32 Reserved;
+ /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
+ __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
+ __le16 Ciphers[];
+} __packed;
+
+/* See MS-SMB2 2.2.3.1.3 */
+#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
+#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
+#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
+#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
+/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
+#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */
+
+/* Compression Flags */
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
+
+struct smb2_compression_capabilities_context {
+ __le16 ContextType; /* 3 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 CompressionAlgorithmCount;
+ __le16 Padding;
+ __le32 Flags;
+ __le16 CompressionAlgorithms[3];
+ __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */
+ /* Check if pad needed */
+} __packed;
+
+/*
+ * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
+ * Its struct simply contains NetName, an array of Unicode characters
+ */
+struct smb2_netname_neg_context {
+ __le16 ContextType; /* 5 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 NetName[]; /* hostname of target converted to UCS-2 */
+} __packed;
+
+/*
+ * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
+ * and 2.2.4.1.5
+ */
+
+/* Flags */
+#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001
+
+struct smb2_transport_capabilities_context {
+ __le16 ContextType; /* 6 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le32 Flags;
+ __u32 Pad;
+} __packed;
+
+/*
+ * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
+ * and 2.2.4.1.6
+ */
+
+/* RDMA Transform IDs */
+#define SMB2_RDMA_TRANSFORM_NONE 0x0000
+#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
+#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002
+
+struct smb2_rdma_transform_capabilities_context {
+ __le16 ContextType; /* 7 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le16 TransformCount;
+ __u16 Reserved1;
+ __u32 Reserved2;
+ __le16 RDMATransformIds[];
+} __packed;
+
+/*
+ * For signing capabilities context see MS-SMB2 2.2.3.1.7
+ * and 2.2.4.1.7
+ */
+
+/* Signing algorithms */
+#define SIGNING_ALG_HMAC_SHA256 0
+#define SIGNING_ALG_HMAC_SHA256_LE cpu_to_le16(0)
+#define SIGNING_ALG_AES_CMAC 1
+#define SIGNING_ALG_AES_CMAC_LE cpu_to_le16(1)
+#define SIGNING_ALG_AES_GMAC 2
+#define SIGNING_ALG_AES_GMAC_LE cpu_to_le16(2)
+
+struct smb2_signing_capabilities {
+ __le16 ContextType; /* 8 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 SigningAlgorithmCount;
+ __le16 SigningAlgorithms[];
+ /* Followed by padding to 8 byte boundary (required by some servers) */
+} __packed;
+
+#define POSIX_CTXT_DATA_LEN 16
+struct smb2_posix_neg_context {
+ __le16 ContextType; /* 0x100 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
+} __packed;
+
+struct smb2_negotiate_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 DialectCount;
+ __le16 SecurityMode;
+ __le16 Reserved; /* MBZ */
+ __le32 Capabilities;
+ __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
+ /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
+ __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
+ __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
+ __le16 Reserved2;
+ __le16 Dialects[];
+} __packed;
+
+struct smb2_negotiate_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 65 */
+ __le16 SecurityMode;
+ __le16 DialectRevision;
+ __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
+ __u8 ServerGUID[16];
+ __le32 Capabilities;
+ __le32 MaxTransactSize;
+ __le32 MaxReadSize;
+ __le32 MaxWriteSize;
+ __le64 SystemTime; /* MBZ */
+ __le64 ServerStartTime;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_SESSION_SETUP See MS-SMB2 section 2.2.5
+ */
+/* Flags */
+#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
+#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
+
+struct smb2_sess_setup_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 25 */
+ __u8 Flags;
+ __u8 SecurityMode;
+ __le32 Capabilities;
+ __le32 Channel;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le64 PreviousSessionId;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+/* Currently defined SessionFlags */
+#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
+#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001)
+#define SMB2_SESSION_FLAG_IS_NULL 0x0002
+#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002)
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004)
+
+struct smb2_sess_setup_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 SessionFlags;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_LOGOFF See MS-SMB2 section 2.2.7
+ */
+struct smb2_logoff_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_logoff_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_CLOSE See MS-SMB2 section 2.2.15
+ */
+/* Currently defined values for close flags */
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
+struct smb2_close_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 PersistentFileId; /* opaque endianness */
+ __le64 VolatileFileId; /* opaque endianness */
+} __packed;
+
+/*
+ * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
+ */
+#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
+
+struct smb2_close_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* 60 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile;
+ __le32 Attributes;
+} __packed;
+
+
+/*
+ * SMB2_READ See MS-SMB2 section 2.2.19
+ */
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED 0x01
+#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
+#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001)
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002)
+#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003)
+
+/* SMB2 read request without RFC1001 length at the beginning */
+struct smb2_read_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __u8 Padding; /* offset from start of SMB2 header to place read */
+ __u8 Flags; /* MBZ unless SMB3.02 or later */
+ __le32 Length;
+ __le64 Offset;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 MinimumCount;
+ __le32 Channel; /* MBZ except for SMB3 or later */
+ __le32 RemainingBytes;
+ __le16 ReadChannelInfoOffset;
+ __le16 ReadChannelInfoLength;
+ __u8 Buffer[1];
+} __packed;
+
+/* Read flags */
+#define SMB2_READFLAG_RESPONSE_NONE cpu_to_le32(0x00000000)
+#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM cpu_to_le32(0x00000001)
+
+struct smb2_read_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_WRITE See MS-SMB2 section 2.2.21
+ */
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
+
+struct smb2_write_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 DataOffset; /* offset from start of SMB2 header to write data */
+ __le32 Length;
+ __le64 Offset;
+ __le64 PersistentFileId; /* opaque endianness */
+ __le64 VolatileFileId; /* opaque endianness */
+ __le32 Channel; /* MBZ unless SMB3.02 or later */
+ __le32 RemainingBytes;
+ __le16 WriteChannelInfoOffset;
+ __le16 WriteChannelInfoLength;
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_write_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __u32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_FLUSH See MS-SMB2 section 2.2.17
+ */
+struct smb2_flush_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Reserved1;
+ __le32 Reserved2;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+} __packed;
+
+struct smb2_flush_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_NOTIFY See MS-SMB2 section 2.2.35
+ */
+/* notify flags */
+#define SMB2_WATCH_TREE 0x0001
+
+/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
+#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
+#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
+#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
+#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
+#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
+#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
+#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
+#define FILE_NOTIFY_CHANGE_EA 0x00000080
+#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
+#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
+#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
+#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
+
+/* SMB2 Notify Action Flags */
+#define FILE_ACTION_ADDED 0x00000001
+#define FILE_ACTION_REMOVED 0x00000002
+#define FILE_ACTION_MODIFIED 0x00000003
+#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004
+#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005
+#define FILE_ACTION_ADDED_STREAM 0x00000006
+#define FILE_ACTION_REMOVED_STREAM 0x00000007
+#define FILE_ACTION_MODIFIED_STREAM 0x00000008
+#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009
+
+struct smb2_change_notify_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Flags;
+ __le32 OutputBufferLength;
+ __le64 PersistentFileId; /* opaque endianness */
+ __le64 VolatileFileId; /* opaque endianness */
+ __le32 CompletionFilter;
+ __u32 Reserved;
+} __packed;
+
+struct smb2_change_notify_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1]; /* array of file notify structs */
+} __packed;
+
+
+/*
+ * SMB2_CREATE See MS-SMB2 section 2.2.13
+ */
+/* Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE 0x00
+#define SMB2_OPLOCK_LEVEL_II 0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
+#define SMB2_OPLOCK_LEVEL_BATCH 0x09
+#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
+/* Non-spec internal type */
+#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
+
+/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
+#define IL_ANONYMOUS cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION cpu_to_le32(0x00000002)
+#define IL_DELEGATE cpu_to_le32(0x00000003)
+
+/* File Attrubutes */
+#define FILE_ATTRIBUTE_READONLY 0x00000001
+#define FILE_ATTRIBUTE_HIDDEN 0x00000002
+#define FILE_ATTRIBUTE_SYSTEM 0x00000004
+#define FILE_ATTRIBUTE_DIRECTORY 0x00000010
+#define FILE_ATTRIBUTE_ARCHIVE 0x00000020
+#define FILE_ATTRIBUTE_NORMAL 0x00000080
+#define FILE_ATTRIBUTE_TEMPORARY 0x00000100
+#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200
+#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400
+#define FILE_ATTRIBUTE_COMPRESSED 0x00000800
+#define FILE_ATTRIBUTE_OFFLINE 0x00001000
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
+#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
+#define FILE_ATTRIBUTE__MASK 0x00007FB7
+
+#define FILE_ATTRIBUTE_READONLY_LE cpu_to_le32(0x00000001)
+#define FILE_ATTRIBUTE_HIDDEN_LE cpu_to_le32(0x00000002)
+#define FILE_ATTRIBUTE_SYSTEM_LE cpu_to_le32(0x00000004)
+#define FILE_ATTRIBUTE_DIRECTORY_LE cpu_to_le32(0x00000010)
+#define FILE_ATTRIBUTE_ARCHIVE_LE cpu_to_le32(0x00000020)
+#define FILE_ATTRIBUTE_NORMAL_LE cpu_to_le32(0x00000080)
+#define FILE_ATTRIBUTE_TEMPORARY_LE cpu_to_le32(0x00000100)
+#define FILE_ATTRIBUTE_SPARSE_FILE_LE cpu_to_le32(0x00000200)
+#define FILE_ATTRIBUTE_REPARSE_POINT_LE cpu_to_le32(0x00000400)
+#define FILE_ATTRIBUTE_COMPRESSED_LE cpu_to_le32(0x00000800)
+#define FILE_ATTRIBUTE_OFFLINE_LE cpu_to_le32(0x00001000)
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED_LE cpu_to_le32(0x00002000)
+#define FILE_ATTRIBUTE_ENCRYPTED_LE cpu_to_le32(0x00004000)
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM_LE cpu_to_le32(0x00008000)
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000)
+#define FILE_ATTRIBUTE_MASK_LE cpu_to_le32(0x00007FB7)
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
+#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
+#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
+#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040)
+#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
+#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF)
+
+
+#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \
+ FILE_READ_EA_LE | \
+ FILE_GENERIC_READ_LE)
+#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \
+ FILE_APPEND_DATA_LE | \
+ FILE_WRITE_EA_LE | \
+ FILE_WRITE_ATTRIBUTES_LE | \
+ FILE_GENERIC_WRITE_LE)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
+#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
+#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
+#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
+
+/* CreateDisposition Flags */
+#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
+#define FILE_OPEN_LE cpu_to_le32(0x00000001)
+#define FILE_CREATE_LE cpu_to_le32(0x00000002)
+#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
+#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
+#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
+#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007)
+
+#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \
+ | FILE_READ_ATTRIBUTES)
+#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+ | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
+#define FILE_EXEC_RIGHTS (FILE_EXECUTE)
+
+/* CreateOptions Flags */
+#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
+/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */
+#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
+#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
+#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
+#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
+#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
+#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
+#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
+#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF)
+
+#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
+ | FILE_READ_ATTRIBUTES_LE)
+#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
+ | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
+#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
+
+/* Create Context Values */
+#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
+#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
+#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
+#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
+#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
+#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
+#define SMB2_CREATE_REQUEST_LEASE "RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
+#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+
+/* Flag (SMB3 open response) values */
+#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
+
+struct create_context {
+ __le32 Next;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le16 Reserved;
+ __le16 DataOffset;
+ __le32 DataLength;
+ __u8 Buffer[];
+} __packed;
+
+struct smb2_create_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u8 SecurityFlags;
+ __u8 RequestedOplockLevel;
+ __le32 ImpersonationLevel;
+ __le64 SmbCreateFlags;
+ __le64 Reserved;
+ __le32 DesiredAccess;
+ __le32 FileAttributes;
+ __le32 ShareAccess;
+ __le32 CreateDisposition;
+ __le32 CreateOptions;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[];
+} __packed;
+
+struct smb2_create_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 89 */
+ __u8 OplockLevel;
+ __u8 Flags; /* 0x01 if reparse point */
+ __le32 CreateAction;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndofFile;
+ __le32 FileAttributes;
+ __le32 Reserved2;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[1];
+} __packed;
+
+
+#endif /* _COMMON_SMB2PDU_H */
diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h
index d01e8c9d7a31..926f87cd6af0 100644
--- a/fs/smbfs_common/smbfsctl.h
+++ b/fs/smbfs_common/smbfsctl.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
/*
- * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ * SMB, CIFS, SMB2 FSCTL definitions
*
* Copyright (c) International Business Machines Corp., 2002,2013
* Author(s): Steve French (sfrench@us.ibm.com)
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 60d6951915f4..bb44ff4c5cc6 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -16,6 +16,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
@@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Check the filesystem does not extend beyond the end of the
block device */
msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
- if (msblk->bytes_used < 0 || msblk->bytes_used >
- i_size_read(sb->s_bdev->bd_inode))
+ if (msblk->bytes_used < 0 ||
+ msblk->bytes_used > bdev_nr_bytes(sb->s_bdev))
goto failed_mount;
/* Check block size for sanity */
diff --git a/fs/super.c b/fs/super.c
index bcef3a6f4c4b..3bfc0f8fbd5b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb)
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
if (sb->s_bdi != &noop_backing_dev_info) {
+ if (sb->s_iflags & SB_I_PERSB_BDI)
+ bdi_unregister(sb->s_bdi);
bdi_put(sb->s_bdi);
sb->s_bdi = &noop_backing_dev_info;
}
@@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
}
WARN_ON(sb->s_bdi != &noop_backing_dev_info);
sb->s_bdi = bdi;
+ sb->s_iflags |= SB_I_PERSB_BDI;
return 0;
}
diff --git a/fs/sync.c b/fs/sync.c
index 1373a610dc78..3ce8e2137f31 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -3,6 +3,7 @@
* High-level sync()-related operations
*/
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
@@ -22,25 +23,6 @@
SYNC_FILE_RANGE_WAIT_AFTER)
/*
- * Do the filesystem syncing work. For simple filesystems
- * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
- * submit IO for these buffers via __sync_blockdev(). This also speeds up the
- * wait == 1 case since in that case write_inode() functions do
- * sync_dirty_buffer() and thus effectively write one block at a time.
- */
-static int __sync_filesystem(struct super_block *sb, int wait)
-{
- if (wait)
- sync_inodes_sb(sb);
- else
- writeback_inodes_sb(sb, WB_REASON_SYNC);
-
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, wait);
- return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
* Write out and wait upon all dirty data associated with this
* superblock. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
@@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb)
if (sb_rdonly(sb))
return 0;
- ret = __sync_filesystem(sb, 0);
+ /*
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
+ * to submit I/O for these buffers via sync_blockdev(). This also
+ * speeds up the wait == 1 case since in that case write_inode()
+ * methods call sync_dirty_buffer() and thus effectively write one block
+ * at a time.
+ */
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 0);
+ ret = sync_blockdev_nowait(sb->s_bdev);
if (ret < 0)
return ret;
- return __sync_filesystem(sb, 1);
+
+ sync_inodes_sb(sb);
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 1);
+ return sync_blockdev(sb->s_bdev);
}
EXPORT_SYMBOL(sync_filesystem);
@@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
sb->s_op->sync_fs(sb, *(int *)arg);
}
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
- filemap_fdatawrite(bdev->bd_inode->i_mapping);
-}
-
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
-{
- /*
- * We keep the error status of individual mapping so that
- * applications can catch the writeback error using fsync(2).
- * See filemap_fdatawait_keep_errors() for details.
- */
- filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
-}
-
/*
* Sync everything. We start by waking flusher threads so that most of
* writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -114,8 +96,8 @@ void ksys_sync(void)
iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &wait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
- iterate_bdevs(fdatawait_one_bdev, NULL);
+ sync_bdevs(false);
+ sync_bdevs(true);
if (unlikely(laptop_mode))
laptop_sync_completion();
}
@@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work)
*/
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
printk("Emergency Sync complete\n");
kfree(work);
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 59dffd5ca517..b6b6796e1616 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -56,8 +56,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
kobject_get_ownership(kobj, &uid, &gid);
- kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
- S_IRWXU | S_IRUGO | S_IXUGO, uid, gid,
+ kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid,
kobj, ns);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d019d6ac6ad0..42dcf96881b6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -45,6 +45,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
ssize_t count;
char *buf;
+ if (WARN_ON_ONCE(!ops->show))
+ return -EINVAL;
+
/* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
count = seq_get_buf(sf, &buf);
if (count < PAGE_SIZE) {
@@ -53,15 +56,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
}
memset(buf, 0, PAGE_SIZE);
- /*
- * Invoke show(). Control may reach here via seq file lseek even
- * if @ops->show() isn't implemented.
- */
- if (ops->show) {
- count = ops->show(kobj, of->kn->priv, buf);
- if (count < 0)
- return count;
- }
+ count = ops->show(kobj, of->kn->priv, buf);
+ if (count < 0)
+ return count;
/*
* The code works fine with PAGE_SIZE return but it's likely to
@@ -255,59 +252,74 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
};
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
- const struct attribute *attr, bool is_bin,
- umode_t mode, kuid_t uid, kgid_t gid, const void *ns)
+ const struct attribute *attr, umode_t mode, kuid_t uid,
+ kgid_t gid, const void *ns)
{
+ struct kobject *kobj = parent->priv;
+ const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
struct lock_class_key *key = NULL;
- const struct kernfs_ops *ops;
+ const struct kernfs_ops *ops = NULL;
struct kernfs_node *kn;
- loff_t size;
-
- if (!is_bin) {
- struct kobject *kobj = parent->priv;
- const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
-
- /* every kobject with an attribute needs a ktype assigned */
- if (WARN(!sysfs_ops, KERN_ERR
- "missing sysfs attribute operations for kobject: %s\n",
- kobject_name(kobj)))
- return -EINVAL;
-
- if (sysfs_ops->show && sysfs_ops->store) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_rw;
- else
- ops = &sysfs_file_kfops_rw;
- } else if (sysfs_ops->show) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_ro;
- else
- ops = &sysfs_file_kfops_ro;
- } else if (sysfs_ops->store) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_wo;
- else
- ops = &sysfs_file_kfops_wo;
- } else
- ops = &sysfs_file_kfops_empty;
-
- size = PAGE_SIZE;
+
+ /* every kobject with an attribute needs a ktype assigned */
+ if (WARN(!sysfs_ops, KERN_ERR
+ "missing sysfs attribute operations for kobject: %s\n",
+ kobject_name(kobj)))
+ return -EINVAL;
+
+ if (mode & SYSFS_PREALLOC) {
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_prealloc_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_prealloc_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_prealloc_kfops_wo;
} else {
- struct bin_attribute *battr = (void *)attr;
-
- if (battr->mmap)
- ops = &sysfs_bin_kfops_mmap;
- else if (battr->read && battr->write)
- ops = &sysfs_bin_kfops_rw;
- else if (battr->read)
- ops = &sysfs_bin_kfops_ro;
- else if (battr->write)
- ops = &sysfs_bin_kfops_wo;
- else
- ops = &sysfs_file_kfops_empty;
-
- size = battr->size;
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_file_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_file_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_file_kfops_wo;
+ }
+
+ if (!ops)
+ ops = &sysfs_file_kfops_empty;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!attr->ignore_lockdep)
+ key = attr->key ?: (struct lock_class_key *)&attr->skey;
+#endif
+
+ kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
+ PAGE_SIZE, ops, (void *)attr, ns, key);
+ if (IS_ERR(kn)) {
+ if (PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(parent, attr->name);
+ return PTR_ERR(kn);
}
+ return 0;
+}
+
+int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
+ const struct bin_attribute *battr, umode_t mode,
+ kuid_t uid, kgid_t gid, const void *ns)
+{
+ const struct attribute *attr = &battr->attr;
+ struct lock_class_key *key = NULL;
+ const struct kernfs_ops *ops;
+ struct kernfs_node *kn;
+
+ if (battr->mmap)
+ ops = &sysfs_bin_kfops_mmap;
+ else if (battr->read && battr->write)
+ ops = &sysfs_bin_kfops_rw;
+ else if (battr->read)
+ ops = &sysfs_bin_kfops_ro;
+ else if (battr->write)
+ ops = &sysfs_bin_kfops_wo;
+ else
+ ops = &sysfs_file_kfops_empty;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (!attr->ignore_lockdep)
@@ -315,7 +327,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
#endif
kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
- size, ops, (void *)attr, ns, key);
+ battr->size, ops, (void *)attr, ns, key);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, attr->name);
@@ -340,9 +352,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
- return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode,
- uid, gid, ns);
-
+ return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns);
}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -385,8 +395,8 @@ int sysfs_add_file_to_group(struct kobject *kobj,
return -ENOENT;
kobject_get_ownership(kobj, &uid, &gid);
- error = sysfs_add_file_mode_ns(parent, attr, false,
- attr->mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid,
+ NULL);
kernfs_put(parent);
return error;
@@ -555,8 +565,8 @@ int sysfs_create_bin_file(struct kobject *kobj,
return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
- return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true,
- attr->attr.mode, uid, gid, NULL);
+ return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, uid,
+ gid, NULL);
}
EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index f29d62004527..eeb0e3099421 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -61,8 +61,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*attr)->name, mode);
mode &= SYSFS_PREALLOC | 0664;
- error = sysfs_add_file_mode_ns(parent, *attr, false,
- mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, *attr, mode, uid,
+ gid, NULL);
if (unlikely(error))
break;
}
@@ -90,10 +90,9 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*bin_attr)->attr.name, mode);
mode &= SYSFS_PREALLOC | 0664;
- error = sysfs_add_file_mode_ns(parent,
- &(*bin_attr)->attr, true,
- mode,
- uid, gid, NULL);
+ error = sysfs_add_bin_file_mode_ns(parent, *bin_attr,
+ mode, uid, gid,
+ NULL);
if (error)
break;
}
@@ -340,8 +339,8 @@ int sysfs_merge_group(struct kobject *kobj,
kobject_get_ownership(kobj, &uid, &gid);
for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
- error = sysfs_add_file_mode_ns(parent, *attr, false,
- (*attr)->mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode,
+ uid, gid, NULL);
if (error) {
while (--i >= 0)
kernfs_remove_by_name(parent, (*--attr)->name);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0050cc0c0236..3f28c9af5756 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -28,9 +28,11 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
* file.c
*/
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
- const struct attribute *attr, bool is_bin,
- umode_t amode, kuid_t uid, kgid_t gid,
- const void *ns);
+ const struct attribute *attr, umode_t amode, kuid_t uid,
+ kgid_t gid, const void *ns);
+int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
+ const struct bin_attribute *battr, umode_t mode,
+ kuid_t uid, kgid_t gid, const void *ns);
/*
* symlink.c
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index cc8e2ed155c8..d1def0771a40 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -474,10 +474,8 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
struct sysv_sb_info *sbi;
struct buffer_head *bh;
- if (440 != sizeof (struct v7_super_block))
- panic("V7 FS: bad super-block size");
- if (64 != sizeof (struct sysv_inode))
- panic("sysv fs: bad i-node size");
+ BUILD_BUG_ON(sizeof(struct v7_super_block) != 440);
+ BUILD_BUG_ON(sizeof(struct sysv_inode) != 64);
sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
if (!sbi)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 1261e8b41edb..925a621b432e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -432,7 +432,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ /* Do not set bits for OTH */
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP;
inode->i_op = ops;
inode->i_fop = &simple_dir_operations;
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 22be7aeb96c4..c57b46a352d8 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = {
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
- .max_namelen = UBIFS_MAX_NLEN,
};
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index f1094cdcd6cd..46d697172197 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
unsigned long udf_get_last_block(struct super_block *sb)
{
- struct block_device *bdev = sb->s_bdev;
- struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk);
+ struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
unsigned long lblock = 0;
/*
@@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb)
* Try using the device size...
*/
if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0)
- lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits;
+ lblock = sb_bdev_nr_blocks(sb);
if (lblock)
return lblock - 1;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b2d7c57d0688..34247fba6df9 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
struct udf_inode_info *vati;
uint32_t pos;
struct virtualAllocationTable20 *vat20;
- sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ sector_t blocks = sb_bdev_nr_blocks(sb);
udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
if (!sbi->s_vat_inode &&
@@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
int ret;
if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
- udf_fixed_to_variable(block) >=
- i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits)
+ udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb))
return -EAGAIN;
bh = udf_read_tagged(sb, block, block, &ident);
@@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
last[last_count++] = *lastblock - 152;
for (i = 0; i < last_count; i++) {
- if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits)
+ if (last[i] >= sb_bdev_nr_blocks(sb))
continue;
ret = udf_check_anchor_block(sb, last[i], fileset);
if (ret != -EAGAIN) {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 003f0d31743e..22bf14ab2d16 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1827,9 +1827,15 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
if (mode_wp && mode_dontwake)
return -EINVAL;
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp,
+ &ctx->mmap_changing);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
if (ret)
return ret;
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 4f5e59f06284..37dd3fe5b1e9 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -21,10 +21,7 @@
#define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */
-#define VBSF_MOUNT_SIGNATURE_BYTE_0 ('\000')
-#define VBSF_MOUNT_SIGNATURE_BYTE_1 ('\377')
-#define VBSF_MOUNT_SIGNATURE_BYTE_2 ('\376')
-#define VBSF_MOUNT_SIGNATURE_BYTE_3 ('\375')
+static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375";
static int follow_symlinks;
module_param(follow_symlinks, int, 0444);
@@ -386,12 +383,7 @@ fail_nomem:
static int vboxsf_parse_monolithic(struct fs_context *fc, void *data)
{
- unsigned char *options = data;
-
- if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 &&
- options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 &&
- options[2] == VBSF_MOUNT_SIGNATURE_BYTE_2 &&
- options[3] == VBSF_MOUNT_SIGNATURE_BYTE_3) {
+ if (data && !memcmp(data, VBSF_MOUNT_SIGNATURE, 4)) {
vbg_err("vboxsf: Old binary mount data not supported, remove obsolete mount.vboxsf and/or update your VBoxService.\n");
return -EINVAL;
}
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 77e159a0346b..60a4372aa4d7 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -177,7 +177,7 @@ static int build_merkle_tree(struct file *filp,
* (level 0) and ascending to the root node (level 'num_levels - 1').
* Then at the end (level 'num_levels'), calculate the root hash.
*/
- blocks = (inode->i_size + params->block_size - 1) >>
+ blocks = ((u64)inode->i_size + params->block_size - 1) >>
params->log_blocksize;
for (level = 0; level <= params->num_levels; level++) {
err = build_merkle_tree_level(filp, level, blocks, params,
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 60ff8af7219f..92df87f5fa38 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -89,7 +89,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
*/
/* Compute number of levels and the number of blocks in each level */
- blocks = (inode->i_size + params->block_size - 1) >> log_blocksize;
+ blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize;
pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks);
while (blocks > 1) {
if (params->num_levels >= FS_VERITY_MAX_LEVELS) {
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 54da6d717a06..b987dc2c6851 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -72,10 +72,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
/*
* Zone interfaces
*/
-
-#define kmem_zone kmem_cache
-#define kmem_zone_t struct kmem_cache
-
static inline struct page *
kmem_to_page(void *addr)
{
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 005abfd9fd34..d7d875cef07a 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -850,7 +850,7 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- __xfs_bmap_add_free(*tpp, args.fsbno, delta, NULL, true);
+ __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
/*
* Roll the transaction before trying to re-init the per-ag
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4c6f9045baca..3f597cad2c33 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -116,23 +116,29 @@ void xfs_perag_put(struct xfs_perag *pag);
/*
* Perag iteration APIs
- *
- * XXX: for_each_perag_range() usage really needs an iterator to clean up when
- * we terminate at end_agno because we may have taken a reference to the perag
- * beyond end_agno. Right now callers have to be careful to catch and clean that
- * up themselves. This is not necessary for the callers of for_each_perag() and
- * for_each_perag_from() because they terminate at sb_agcount where there are
- * no perag structures in tree beyond end_agno.
*/
-#define for_each_perag_range(mp, next_agno, end_agno, pag) \
- for ((pag) = xfs_perag_get((mp), (next_agno)); \
- (pag) != NULL && (next_agno) <= (end_agno); \
- (next_agno) = (pag)->pag_agno + 1, \
- xfs_perag_put(pag), \
- (pag) = xfs_perag_get((mp), (next_agno)))
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_perag *pag,
+ xfs_agnumber_t *agno,
+ xfs_agnumber_t end_agno)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ *agno = pag->pag_agno + 1;
+ xfs_perag_put(pag);
+ if (*agno > end_agno)
+ return NULL;
+ return xfs_perag_get(mp, *agno);
+}
+
+#define for_each_perag_range(mp, agno, end_agno, pag) \
+ for ((pag) = xfs_perag_get((mp), (agno)); \
+ (pag) != NULL; \
+ (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
-#define for_each_perag_from(mp, next_agno, pag) \
- for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag))
+#define for_each_perag_from(mp, agno, pag) \
+ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
#define for_each_perag(mp, agno, pag) \
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 2aa2b3484c28..fe94058d4e9e 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -91,7 +91,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+ return XFS_TEST_ERROR(avail < orig / 10 ||
+ avail < pag->pag_mount->m_agbtree_maxlevels,
pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 95157f5a5a6c..353e53b892e6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -27,7 +27,7 @@
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
-extern kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
@@ -426,8 +426,8 @@ xfs_alloc_fix_len(
*/
STATIC int /* error code */
xfs_alloc_fixup_trees(
- xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
- xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
+ struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
+ struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
xfs_agblock_t fbno, /* starting block of free extent */
xfs_extlen_t flen, /* length of free extent */
xfs_agblock_t rbno, /* starting block of returned extent */
@@ -488,8 +488,8 @@ xfs_alloc_fixup_trees(
struct xfs_btree_block *bnoblock;
struct xfs_btree_block *cntblock;
- bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
- cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
@@ -1200,8 +1200,8 @@ xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
- xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
- xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+ struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
+ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
int error;
xfs_agblock_t fbno; /* start block of found extent */
xfs_extlen_t flen; /* length of found extent */
@@ -1512,7 +1512,7 @@ xfs_alloc_ag_vextent_lastblock(
* than minlen.
*/
if (*len || args->alignment > 1) {
- acur->cnt->bc_ptrs[0] = 1;
+ acur->cnt->bc_levels[0].ptr = 1;
do {
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
@@ -1658,8 +1658,8 @@ xfs_alloc_ag_vextent_size(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
struct xfs_agf *agf = args->agbp->b_addr;
- xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
- xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
+ struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
+ struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
int error; /* error result */
xfs_agblock_t fbno; /* start of found freespace */
xfs_extlen_t flen; /* length of found freespace */
@@ -2190,14 +2190,15 @@ xfs_free_ag_extent(
*/
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
+ mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
(mp->m_sb.sb_agblocks + 1) / 2);
+ ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
}
/*
@@ -2255,14 +2256,14 @@ xfs_alloc_min_freelist(
const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
unsigned int min_free;
- ASSERT(mp->m_ag_maxlevels > 0);
+ ASSERT(mp->m_alloc_maxlevels > 0);
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
@@ -2439,7 +2440,7 @@ xfs_agfl_reset(
/*
* Defer an AGFL block free. This is effectively equivalent to
- * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
+ * xfs_free_extent_later() with some special handling particular to AGFL blocks.
*
* Deferring AGFL frees helps prevent log reservation overruns due to too many
* allocation operations in a transaction. AGFL frees are prone to this problem
@@ -2458,21 +2459,74 @@ xfs_defer_agfl_block(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent_free_item *new; /* new element */
- ASSERT(xfs_bmap_free_item_zone != NULL);
+ ASSERT(xfs_extfree_item_cache != NULL);
ASSERT(oinfo != NULL);
- new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
- new->xefi_oinfo = *oinfo;
- new->xefi_skip_discard = false;
+ new->xefi_owner = oinfo->oi_owner;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
}
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+__xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_extent_free_item *new; /* new element */
+#ifdef DEBUG
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= MAXEXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_extfree_item_cache != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (skip_discard)
+ new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (oinfo) {
+ ASSERT(oinfo->oi_offset == 0);
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ new->xefi_owner = oinfo->oi_owner;
+ } else {
+ new->xefi_owner = XFS_RMAP_OWN_NULL;
+ }
+ trace_xfs_bmap_free_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+}
+
#ifdef DEBUG
/*
* Check if an AGF has a free extent record whose length is equal to
@@ -2903,13 +2957,16 @@ xfs_agf_verify(
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > mp->m_ag_maxlevels ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels)
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
+ mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
+ mp->m_alloc_maxlevels)
return __this_address;
if (xfs_has_rmapbt(mp) &&
(be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels))
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
+ mp->m_rmap_maxlevels))
return __this_address;
if (xfs_has_rmapbt(mp) &&
@@ -3495,3 +3552,20 @@ xfs_agfl_walk(
return 0;
}
+
+int __init
+xfs_extfree_intent_init_cache(void)
+{
+ xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
+
+ return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_extfree_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_extfree_item_cache);
+ xfs_extfree_item_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index df4aefaf0046..1c14a0b1abea 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -98,7 +98,7 @@ unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
@@ -248,4 +248,40 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
+void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
+ bool skip_discard);
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+struct xfs_extent_free_item {
+ struct list_head xefi_list;
+ uint64_t xefi_owner;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ unsigned int xefi_flags;
+};
+
+#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
+#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
+#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+
+static inline void
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ __xfs_free_extent_later(tp, bno, len, oinfo, false);
+}
+
+
+extern struct kmem_cache *xfs_extfree_item_cache;
+
+int __init xfs_extfree_intent_init_cache(void);
+void xfs_extfree_intent_destroy_cache(void);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 6746fd735550..8c9f73cc0bee 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_allocbt_cur_cache;
STATIC struct xfs_btree_cur *
xfs_allocbt_dup_cursor(
@@ -316,7 +317,7 @@ xfs_allocbt_verify(
if (pag && pag->pagf_init) {
if (level >= pag->pagf_levels[btnum])
return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ } else if (level >= mp->m_alloc_maxlevels)
return __this_address;
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
@@ -477,12 +478,8 @@ xfs_allocbt_init_common(
ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
+ xfs_allocbt_cur_cache);
cur->bc_ag.abt.active = false;
if (btnum == XFS_BTNUM_CNT) {
@@ -571,6 +568,17 @@ xfs_allocbt_commit_staged_btree(
}
}
+/* Calculate number of records in an alloc btree block. */
+static inline unsigned int
+xfs_allocbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
+
/*
* Calculate number of records in an alloc btree block.
*/
@@ -581,10 +589,26 @@ xfs_allocbt_maxrecs(
int leaf)
{
blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+ return xfs_allocbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_alloc_rec_t);
- return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+/* Free space btrees are at their largest when every other block is free. */
+#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2)
+
+/* Compute the max possible height for free space btrees. */
+unsigned int
+xfs_allocbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS);
}
/* Calculate the freespace btree size for some records. */
@@ -595,3 +619,22 @@ xfs_allocbt_calc_size(
{
return xfs_btree_calc_size(mp->m_alloc_mnr, len);
}
+
+int __init
+xfs_allocbt_init_cur_cache(void)
+{
+ xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur",
+ xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_allocbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_allocbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_allocbt_cur_cache);
+ xfs_allocbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 2f6b816aaf9f..45df893ef6bb 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -60,4 +60,9 @@ extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_allocbt_maxlevels_ondisk(void);
+
+int __init xfs_allocbt_init_cur_cache(void);
+void xfs_allocbt_destroy_cur_cache(void);
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index e1d11e314228..014daa8c542d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -770,7 +770,7 @@ xfs_attr_fork_remove(
ASSERT(ip->i_afp->if_nextents == 0);
xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
ip->i_afp = NULL;
ip->i_forkoff = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b48230f1a361..4dccd4d90622 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -37,8 +37,7 @@
#include "xfs_icache.h"
#include "xfs_iomap.h"
-
-kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_bmap_intent_cache;
/*
* Miscellaneous helper functions
@@ -93,6 +92,7 @@ xfs_bmap_compute_maxlevels(
maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
}
mp->m_bm_maxlevels[whichfork] = level;
+ ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
}
unsigned int
@@ -239,11 +239,11 @@ xfs_bmap_get_bp(
if (!cur)
return NULL;
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- if (!cur->bc_bufs[i])
+ for (i = 0; i < cur->bc_maxlevels; i++) {
+ if (!cur->bc_levels[i].bp)
break;
- if (xfs_buf_daddr(cur->bc_bufs[i]) == bno)
- return cur->bc_bufs[i];
+ if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno)
+ return cur->bc_levels[i].bp;
}
/* Chase down all the log items to see if the bp is there */
@@ -316,7 +316,7 @@ xfs_check_block(
*/
STATIC void
xfs_bmap_check_leaf_extents(
- xfs_btree_cur_t *cur, /* btree cursor or null */
+ struct xfs_btree_cur *cur, /* btree cursor or null */
xfs_inode_t *ip, /* incore inode pointer */
int whichfork) /* data or attr fork */
{
@@ -522,56 +522,6 @@ xfs_bmap_validate_ret(
#endif /* DEBUG */
/*
- * bmap free list manipulation functions
- */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-__xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo,
- bool skip_discard)
-{
- struct xfs_extent_free_item *new; /* new element */
-#ifdef DEBUG
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
-
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
- ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_bmap_free_item_zone != NULL);
-
- new = kmem_cache_alloc(xfs_bmap_free_item_zone,
- GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = bno;
- new->xefi_blockcount = (xfs_extlen_t)len;
- if (oinfo)
- new->xefi_oinfo = *oinfo;
- else
- new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
- new->xefi_skip_discard = skip_discard;
- trace_xfs_bmap_free_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
-}
-
-/*
* Inode fork format manipulation functions
*/
@@ -625,12 +575,12 @@ xfs_bmap_btree_to_extents(
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
- xfs_bmap_add_free(cur->bc_tp, cbno, 1, &oinfo);
+ xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
ip->i_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
- if (cur->bc_bufs[0] == cbp)
- cur->bc_bufs[0] = NULL;
+ if (cur->bc_levels[0].bp == cbp)
+ cur->bc_levels[0].bp = NULL;
xfs_iroot_realloc(ip, -1, whichfork);
ASSERT(ifp->if_broot == NULL);
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
@@ -925,7 +875,7 @@ xfs_bmap_add_attrfork_btree(
int *flags) /* inode logging flags */
{
struct xfs_btree_block *block = ip->i_df.if_broot;
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
xfs_mount_t *mp; /* file system mount struct */
int stat; /* newroot status */
@@ -968,7 +918,7 @@ xfs_bmap_add_attrfork_extents(
struct xfs_inode *ip, /* incore inode pointer */
int *flags) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* bmap btree cursor */
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
int error; /* error return value */
if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
@@ -1988,11 +1938,11 @@ xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ struct xfs_btree_cur **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
int i; /* temp state */
struct xfs_ifork *ifp; /* inode fork pointer */
@@ -5045,7 +4995,7 @@ xfs_bmap_del_extent_real(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t *cur, /* if null, not a btree */
+ struct xfs_btree_cur *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
@@ -5296,7 +5246,7 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- __xfs_bmap_add_free(tp, del->br_startblock,
+ __xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
(bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN);
@@ -6189,7 +6139,7 @@ __xfs_bmap_add(
bmap->br_blockcount,
bmap->br_state);
- bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
@@ -6300,3 +6250,20 @@ xfs_bmap_validate_extent(
return __this_address;
return NULL;
}
+
+int __init
+xfs_bmap_intent_init_cache(void)
+{
+ xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent",
+ sizeof(struct xfs_bmap_intent),
+ 0, 0, NULL);
+
+ return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_bmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_bmap_intent_cache);
+ xfs_bmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 67641f669918..03d9aaf87413 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -13,8 +13,6 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
-extern kmem_zone_t *xfs_bmap_free_item_zone;
-
/*
* Argument structure for xfs_bmap_alloc.
*/
@@ -44,19 +42,6 @@ struct xfs_bmalloca {
int flags;
};
-/*
- * List of extents to be free "later".
- * The list is kept sorted on xbf_startblock.
- */
-struct xfs_extent_free_item
-{
- xfs_fsblock_t xefi_startblock;/* starting fs block number */
- xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
- bool xefi_skip_discard;
- struct list_head xefi_list;
- struct xfs_owner_info xefi_oinfo; /* extent owner */
-};
-
#define XFS_BMAP_MAX_NMAP 4
/*
@@ -189,9 +174,6 @@ unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork);
-void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
- xfs_filblks_t len, const struct xfs_owner_info *oinfo,
- bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -239,16 +221,6 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new, int *logflagsp);
-static inline void
-xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo)
-{
- __xfs_bmap_add_free(tp, bno, len, oinfo, false);
-}
-
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
XFS_BMAP_UNMAP,
@@ -257,8 +229,8 @@ enum xfs_bmap_intent_type {
struct xfs_bmap_intent {
struct list_head bi_list;
enum xfs_bmap_intent_type bi_type;
- struct xfs_inode *bi_owner;
int bi_whichfork;
+ struct xfs_inode *bi_owner;
struct xfs_bmbt_irec bi_bmap;
};
@@ -290,4 +262,9 @@ int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
int flags);
+extern struct kmem_cache *xfs_bmap_intent_cache;
+
+int __init xfs_bmap_intent_init_cache(void);
+void xfs_bmap_intent_destroy_cache(void);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 72444b8b38a6..453309fc85f2 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -22,6 +22,8 @@
#include "xfs_trace.h"
#include "xfs_rmap.h"
+static struct kmem_cache *xfs_bmbt_cur_cache;
+
/*
* Convert on-disk form of btree root to in-memory form.
*/
@@ -286,7 +288,7 @@ xfs_bmbt_free_block(
struct xfs_owner_info oinfo;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
- xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
ip->i_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -552,13 +554,9 @@ xfs_bmbt_init_cursor(
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
+ mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- cur->bc_btnum = XFS_BTNUM_BMAP;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
@@ -575,6 +573,17 @@ xfs_bmbt_init_cursor(
return cur;
}
+/* Calculate number of records in a block mapping btree block. */
+static inline unsigned int
+xfs_bmbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
/*
* Calculate number of records in a bmap btree block.
*/
@@ -585,10 +594,24 @@ xfs_bmbt_maxrecs(
int leaf)
{
blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+ return xfs_bmbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_bmbt_rec_t);
- return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+/* Compute the max possible height for block mapping btrees. */
+unsigned int
+xfs_bmbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
+
+ /* One extra level for the inode root. */
+ return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
}
/*
@@ -654,3 +677,22 @@ xfs_bmbt_calc_size(
{
return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
}
+
+int __init
+xfs_bmbt_init_cur_cache(void)
+{
+ xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur",
+ xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_bmbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_bmbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_bmbt_cur_cache);
+ xfs_bmbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 729e3bc569be..3e7a40a83835 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -110,4 +110,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+unsigned int xfs_bmbt_maxlevels_ondisk(void);
+
+int __init xfs_bmbt_init_cur_cache(void);
+void xfs_bmbt_destroy_cur_cache(void);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 298395481713..b4e19aacb9de 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -22,11 +22,11 @@
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
-
-/*
- * Cursor allocation zone.
- */
-kmem_zone_t *xfs_btree_cur_zone;
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
/*
* Btree magic numbers.
@@ -367,8 +367,8 @@ xfs_btree_del_cursor(
* way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+ if (cur->bc_levels[i].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
else if (!error)
break;
}
@@ -379,7 +379,7 @@ xfs_btree_del_cursor(
kmem_free(cur->bc_ops);
if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
xfs_perag_put(cur->bc_ag.pag);
- kmem_cache_free(xfs_btree_cur_zone, cur);
+ kmem_cache_free(cur->bc_cache, cur);
}
/*
@@ -388,14 +388,14 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur) /* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
{
struct xfs_buf *bp; /* btree block's buffer pointer */
int error; /* error return value */
int i; /* level number of btree block */
xfs_mount_t *mp; /* mount structure for filesystem */
- xfs_btree_cur_t *new; /* new cursor value */
+ struct xfs_btree_cur *new; /* new cursor value */
xfs_trans_t *tp; /* transaction pointer, can be NULL */
tp = cur->bc_tp;
@@ -415,9 +415,9 @@ xfs_btree_dup_cursor(
* For each level current, re-get the buffer and copy the ptr value.
*/
for (i = 0; i < new->bc_nlevels; i++) {
- new->bc_ptrs[i] = cur->bc_ptrs[i];
- new->bc_ra[i] = cur->bc_ra[i];
- bp = cur->bc_bufs[i];
+ new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
+ new->bc_levels[i].ra = cur->bc_levels[i].ra;
+ bp = cur->bc_levels[i].bp;
if (bp) {
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
xfs_buf_daddr(bp), mp->m_bsize,
@@ -429,7 +429,7 @@ xfs_btree_dup_cursor(
return error;
}
}
- new->bc_bufs[i] = bp;
+ new->bc_levels[i].bp = bp;
}
*ncur = new;
return 0;
@@ -681,7 +681,7 @@ xfs_btree_get_block(
return xfs_btree_get_iroot(cur);
}
- *bpp = cur->bc_bufs[level];
+ *bpp = cur->bc_levels[level].bp;
return XFS_BUF_TO_BLOCK(*bpp);
}
@@ -691,7 +691,7 @@ xfs_btree_get_block(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
@@ -711,7 +711,7 @@ xfs_btree_firstrec(
/*
* Set the ptr value to 1, that's the first record/key.
*/
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
return 1;
}
@@ -721,7 +721,7 @@ xfs_btree_firstrec(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_lastrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
@@ -741,7 +741,7 @@ xfs_btree_lastrec(
/*
* Set the ptr value to numrecs, that's the last record/key.
*/
- cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
return 1;
}
@@ -922,11 +922,11 @@ xfs_btree_readahead(
(lev == cur->bc_nlevels - 1))
return 0;
- if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+ if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
return 0;
- cur->bc_ra[lev] |= lr;
- block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+ cur->bc_levels[lev].ra |= lr;
+ block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return xfs_btree_readahead_lblock(cur, lr, block);
@@ -985,28 +985,28 @@ xfs_btree_readahead_ptr(
*/
STATIC void
xfs_btree_setbuf(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int lev, /* level in btree */
struct xfs_buf *bp) /* new buffer to set */
{
struct xfs_btree_block *b; /* btree block */
- if (cur->bc_bufs[lev])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
- cur->bc_bufs[lev] = bp;
- cur->bc_ra[lev] = 0;
+ if (cur->bc_levels[lev].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
+ cur->bc_levels[lev].bp = bp;
+ cur->bc_levels[lev].ra = 0;
b = XFS_BUF_TO_BLOCK(bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
} else {
if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
}
}
@@ -1548,7 +1548,7 @@ xfs_btree_increment(
#endif
/* We're done if we remain in the block after the increment. */
- if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
goto out1;
/* Fail if we just went off the right edge of the tree. */
@@ -1571,7 +1571,7 @@ xfs_btree_increment(
goto error0;
#endif
- if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
break;
/* Read-ahead the right block for the next loop. */
@@ -1598,14 +1598,14 @@ xfs_btree_increment(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = 1;
+ cur->bc_levels[lev].ptr = 1;
}
out1:
*stat = 1;
@@ -1641,7 +1641,7 @@ xfs_btree_decrement(
xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
/* We're done if we remain in the block after the decrement. */
- if (--cur->bc_ptrs[level] > 0)
+ if (--cur->bc_levels[level].ptr > 0)
goto out1;
/* Get a pointer to the btree block. */
@@ -1665,7 +1665,7 @@ xfs_btree_decrement(
* Stop when we don't go off the left edge of a block.
*/
for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
+ if (--cur->bc_levels[lev].ptr > 0)
break;
/* Read-ahead the left block for the next loop. */
xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
@@ -1691,13 +1691,13 @@ xfs_btree_decrement(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+ cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
}
out1:
*stat = 1;
@@ -1735,7 +1735,7 @@ xfs_btree_lookup_get_block(
*
* Otherwise throw it away and get a new one.
*/
- bp = cur->bc_bufs[level];
+ bp = cur->bc_levels[level].bp;
error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
if (error)
return error;
@@ -1864,7 +1864,7 @@ xfs_btree_lookup(
return -EFSCORRUPTED;
}
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+ cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
*stat = 0;
return 0;
}
@@ -1916,7 +1916,7 @@ xfs_btree_lookup(
if (error)
goto error0;
- cur->bc_ptrs[level] = keyno;
+ cur->bc_levels[level].ptr = keyno;
}
}
@@ -1933,7 +1933,7 @@ xfs_btree_lookup(
!xfs_btree_ptr_is_null(cur, &ptr)) {
int i;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
@@ -1944,7 +1944,7 @@ xfs_btree_lookup(
}
} else if (dir == XFS_LOOKUP_LE && diff > 0)
keyno--;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
@@ -2104,7 +2104,7 @@ __xfs_btree_updkeys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
nlkey = xfs_btree_key_addr(cur, ptr, block);
nhkey = xfs_btree_high_key_addr(cur, ptr, block);
if (!force_all &&
@@ -2171,7 +2171,7 @@ xfs_btree_update_keys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
kp = xfs_btree_key_addr(cur, ptr, block);
xfs_btree_copy_keys(cur, kp, &key, 1);
xfs_btree_log_keys(cur, bp, ptr, ptr);
@@ -2205,7 +2205,7 @@ xfs_btree_update(
goto error0;
#endif
/* Get the address of the rec to be updated. */
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
rp = xfs_btree_rec_addr(cur, ptr, block);
/* Fill in the new contents and log them. */
@@ -2280,7 +2280,7 @@ xfs_btree_lshift(
* If the cursor entry is the one that would be moved, don't
* do it... it's too complicated.
*/
- if (cur->bc_ptrs[level] <= 1)
+ if (cur->bc_levels[level].ptr <= 1)
goto out0;
/* Set up the left neighbor as "left". */
@@ -2414,7 +2414,7 @@ xfs_btree_lshift(
goto error0;
/* Slide the cursor value left one. */
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
*stat = 1;
return 0;
@@ -2476,7 +2476,7 @@ xfs_btree_rshift(
* do it... it's too complicated.
*/
lrecs = xfs_btree_get_numrecs(left);
- if (cur->bc_ptrs[level] >= lrecs)
+ if (cur->bc_levels[level].ptr >= lrecs)
goto out0;
/* Set up the right neighbor as "right". */
@@ -2664,7 +2664,7 @@ __xfs_btree_split(
*/
lrecs = xfs_btree_get_numrecs(left);
rrecs = lrecs / 2;
- if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+ if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
rrecs++;
src_index = (lrecs - rrecs + 1);
@@ -2760,9 +2760,9 @@ __xfs_btree_split(
* If it's just pointing past the last entry in left, then we'll
* insert there, so don't change anything in that case.
*/
- if (cur->bc_ptrs[level] > lrecs + 1) {
+ if (cur->bc_levels[level].ptr > lrecs + 1) {
xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= lrecs;
+ cur->bc_levels[level].ptr -= lrecs;
}
/*
* If there are more levels, we'll need another cursor which refers
@@ -2772,7 +2772,7 @@ __xfs_btree_split(
error = xfs_btree_dup_cursor(cur, curp);
if (error)
goto error0;
- (*curp)->bc_ptrs[level + 1]++;
+ (*curp)->bc_levels[level + 1].ptr++;
}
*ptrp = rptr;
*stat = 1;
@@ -2933,7 +2933,8 @@ xfs_btree_new_iroot(
be16_add_cpu(&block->bb_level, 1);
xfs_btree_set_numrecs(block, 1);
cur->bc_nlevels++;
- cur->bc_ptrs[level + 1] = 1;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ cur->bc_levels[level + 1].ptr = 1;
kp = xfs_btree_key_addr(cur, 1, block);
ckp = xfs_btree_key_addr(cur, 1, cblock);
@@ -3094,8 +3095,9 @@ xfs_btree_new_root(
/* Fix up the cursor. */
xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
+ cur->bc_levels[cur->bc_nlevels].ptr = nptr;
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
*stat = 1;
return 0;
error0:
@@ -3152,7 +3154,7 @@ xfs_btree_make_block_unfull(
return error;
if (*stat) {
- *oindex = *index = cur->bc_ptrs[level];
+ *oindex = *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3167,7 +3169,7 @@ xfs_btree_make_block_unfull(
return error;
- *index = cur->bc_ptrs[level];
+ *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3214,7 +3216,7 @@ xfs_btree_insrec(
}
/* If we're off the left edge, return failure. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3557,7 +3559,7 @@ xfs_btree_kill_iroot(
if (error)
return error;
- cur->bc_bufs[level - 1] = NULL;
+ cur->bc_levels[level - 1].bp = NULL;
be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
@@ -3590,8 +3592,8 @@ xfs_btree_kill_root(
if (error)
return error;
- cur->bc_bufs[level] = NULL;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = NULL;
+ cur->bc_levels[level].ra = 0;
cur->bc_nlevels--;
return 0;
@@ -3650,7 +3652,7 @@ xfs_btree_delrec(
tcur = NULL;
/* Get the index of the entry being deleted, check for nothing there. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3960,7 +3962,7 @@ xfs_btree_delrec(
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
tcur = NULL;
if (level == 0)
- cur->bc_ptrs[0]++;
+ cur->bc_levels[0].ptr++;
*stat = 1;
return 0;
@@ -4097,9 +4099,9 @@ xfs_btree_delrec(
* cursor to the left block, and fix up the index.
*/
if (bp != lbp) {
- cur->bc_bufs[level] = lbp;
- cur->bc_ptrs[level] += lrecs;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = lbp;
+ cur->bc_levels[level].ptr += lrecs;
+ cur->bc_levels[level].ra = 0;
}
/*
* If we joined with the right neighbor and there's a level above
@@ -4119,16 +4121,16 @@ xfs_btree_delrec(
* We can't use decrement because it would change the next level up.
*/
if (level > 0)
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
/*
* We combined blocks, so we have to update the parent keys if the
- * btree supports overlapped intervals. However, bc_ptrs[level + 1]
- * points to the old block so that the caller knows which record to
- * delete. Therefore, the caller must be savvy enough to call updkeys
- * for us if we return stat == 2. The other exit points from this
- * function don't require deletions further up the tree, so they can
- * call updkeys directly.
+ * btree supports overlapped intervals. However,
+ * bc_levels[level + 1].ptr points to the old block so that the caller
+ * knows which record to delete. Therefore, the caller must be savvy
+ * enough to call updkeys for us if we return stat == 2. The other
+ * exit points from this function don't require deletions further up
+ * the tree, so they can call updkeys directly.
*/
/* Return value means the next level up has something to do. */
@@ -4182,7 +4184,7 @@ xfs_btree_delete(
if (i == 0) {
for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
+ if (cur->bc_levels[level].ptr == 0) {
error = xfs_btree_decrement(cur, level, &i);
if (error)
goto error0;
@@ -4213,7 +4215,7 @@ xfs_btree_get_rec(
int error; /* error return value */
#endif
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
block = xfs_btree_get_block(cur, 0, &bp);
#ifdef DEBUG
@@ -4512,21 +4514,76 @@ xfs_btree_sblock_verify(
}
/*
- * Calculate the number of btree levels needed to store a given number of
- * records in a short-format btree.
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * height of the tree needed to index the number of leaf records.
*/
-uint
+unsigned int
xfs_btree_compute_maxlevels(
- uint *limits,
- unsigned long len)
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned int height = 1;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ height++;
+ }
+
+ return height;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * number of blocks needed to index the given number of leaf records.
+ */
+unsigned long long
+xfs_btree_calc_size(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned long long blocks = level_blocks;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ blocks += level_blocks;
+ }
+
+ return blocks;
+}
+
+/*
+ * Given a number of available blocks for the btree to consume with records and
+ * pointers, calculate the height of the tree needed to index all the records
+ * that space can hold based on the number of pointers each interior node
+ * holds.
+ *
+ * We start by assuming a single level tree consumes a single block, then track
+ * the number of blocks each node level consumes until we no longer have space
+ * to store the next node level. At this point, we are indexing all the leaf
+ * blocks in the space, and there's no more free space to split the tree any
+ * further. That's our maximum btree height.
+ */
+unsigned int
+xfs_btree_space_to_height(
+ const unsigned int *limits,
+ unsigned long long leaf_blocks)
{
- uint level;
- unsigned long maxblocks;
+ unsigned long long node_blocks = limits[1];
+ unsigned long long blocks_left = leaf_blocks - 1;
+ unsigned int height = 1;
+
+ if (leaf_blocks < 1)
+ return 0;
+
+ while (node_blocks < blocks_left) {
+ blocks_left -= node_blocks;
+ node_blocks *= limits[1];
+ height++;
+ }
- maxblocks = (len + limits[0] - 1) / limits[0];
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + limits[1] - 1) / limits[1];
- return level;
+ return height;
}
/*
@@ -4661,23 +4718,25 @@ xfs_btree_overlapped_query_range(
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
pop_up:
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
if (level == 0) {
/* Handle a leaf node. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
@@ -4700,14 +4759,15 @@ pop_up:
/* Record is larger than high key; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* Handle an internal node. */
- lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
- hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+ lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
+ block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
@@ -4730,13 +4790,13 @@ pop_up:
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
continue;
} else if (hdiff < 0) {
/* The low key is larger than the upper range; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
}
out:
@@ -4747,13 +4807,14 @@ out:
* with a zero-results range query, so release the buffers if we
* failed to return any results.
*/
- if (cur->bc_bufs[0] == NULL) {
+ if (cur->bc_levels[0].bp == NULL) {
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i]) {
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
- cur->bc_bufs[i] = NULL;
- cur->bc_ptrs[i] = 0;
- cur->bc_ra[i] = 0;
+ if (cur->bc_levels[i].bp) {
+ xfs_trans_brelse(cur->bc_tp,
+ cur->bc_levels[i].bp);
+ cur->bc_levels[i].bp = NULL;
+ cur->bc_levels[i].ptr = 0;
+ cur->bc_levels[i].ra = 0;
}
}
}
@@ -4816,29 +4877,6 @@ xfs_btree_query_all(
return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}
-/*
- * Calculate the number of blocks needed to store a given number of records
- * in a short-format (per-AG metadata) btree.
- */
-unsigned long long
-xfs_btree_calc_size(
- uint *limits,
- unsigned long long len)
-{
- int level;
- int maxrecs;
- unsigned long long rval;
-
- maxrecs = limits[0];
- for (level = 0, rval = 0; len > 1; level++) {
- len += maxrecs - 1;
- do_div(len, maxrecs);
- maxrecs = limits[1];
- rval += len;
- }
- return rval;
-}
-
static int
xfs_btree_count_blocks_helper(
struct xfs_btree_cur *cur,
@@ -4915,7 +4953,7 @@ xfs_btree_has_more_records(
block = xfs_btree_get_block(cur, 0, &bp);
/* There are still records in this block. */
- if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
+ if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
return true;
/* There are more record blocks. */
@@ -4924,3 +4962,42 @@ xfs_btree_has_more_records(
else
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
}
+
+/* Set up all the btree cursor caches. */
+int __init
+xfs_btree_init_cur_caches(void)
+{
+ int error;
+
+ error = xfs_allocbt_init_cur_cache();
+ if (error)
+ return error;
+ error = xfs_inobt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_bmbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_refcountbt_init_cur_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_btree_destroy_cur_caches();
+ return error;
+}
+
+/* Destroy all the btree cursor caches, if they've been allocated. */
+void
+xfs_btree_destroy_cur_caches(void)
+{
+ xfs_allocbt_destroy_cur_cache();
+ xfs_inobt_destroy_cur_cache();
+ xfs_bmbt_destroy_cur_cache();
+ xfs_rmapbt_destroy_cur_cache();
+ xfs_refcountbt_destroy_cur_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4eaf8517f850..22d9f411fde6 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -13,8 +13,6 @@ struct xfs_trans;
struct xfs_ifork;
struct xfs_perag;
-extern kmem_zone_t *xfs_btree_cur_zone;
-
/*
* Generic key, ptr and record wrapper structures.
*
@@ -92,8 +90,6 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
-#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
-
struct xfs_btree_ops {
/* size of the key and record structures */
size_t key_len;
@@ -181,18 +177,18 @@ union xfs_btree_irec {
/* Per-AG btree information. */
struct xfs_btree_cur_ag {
- struct xfs_perag *pag;
+ struct xfs_perag *pag;
union {
struct xfs_buf *agbp;
struct xbtree_afakeroot *afake; /* for staging cursor */
};
union {
struct {
- unsigned long nr_ops; /* # record updates */
- int shape_changes; /* # of extent splits */
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
} refc;
struct {
- bool active; /* allocation cursor state */
+ bool active; /* allocation cursor state */
} abt;
};
};
@@ -212,26 +208,35 @@ struct xfs_btree_cur_ino {
#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
};
+struct xfs_btree_level {
+ /* buffer pointer */
+ struct xfs_buf *bp;
+
+ /* key/record number */
+ uint16_t ptr;
+
+ /* readahead info */
+#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */
+ uint16_t ra;
+};
+
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
*/
-typedef struct xfs_btree_cur
+struct xfs_btree_cur
{
struct xfs_trans *bc_tp; /* transaction we're in, if any */
struct xfs_mount *bc_mp; /* file system mount struct */
const struct xfs_btree_ops *bc_ops;
- uint bc_flags; /* btree features - below */
+ struct kmem_cache *bc_cache; /* cursor cache */
+ unsigned int bc_flags; /* btree features - below */
+ xfs_btnum_t bc_btnum; /* identifies which btree type */
union xfs_btree_irec bc_rec; /* current insert/search record value */
- struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
- int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
- uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
-#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
-#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
- uint8_t bc_nlevels; /* number of levels in the tree */
- uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
- xfs_btnum_t bc_btnum; /* identifies which btree type */
- int bc_statoff; /* offset of btre stats array */
+ uint8_t bc_nlevels; /* number of levels in the tree */
+ uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ int bc_statoff; /* offset of btree stats array */
/*
* Short btree pointers need an agno to be able to turn the pointers
@@ -243,7 +248,21 @@ typedef struct xfs_btree_cur
struct xfs_btree_cur_ag bc_ag;
struct xfs_btree_cur_ino bc_ino;
};
-} xfs_btree_cur_t;
+
+ /* Must be at the end of the struct! */
+ struct xfs_btree_level bc_levels[];
+};
+
+/*
+ * Compute the size of a btree cursor that can handle a btree of a given
+ * height. The bc_levels array handles node and leaf blocks, so its size
+ * is exactly nlevels.
+ */
+static inline size_t
+xfs_btree_cur_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels);
+}
/* cursor flags */
#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
@@ -258,7 +277,6 @@ typedef struct xfs_btree_cur
*/
#define XFS_BTREE_STAGING (1<<5)
-
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
@@ -309,7 +327,7 @@ xfs_btree_check_sptr(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int error); /* del because of error */
/*
@@ -318,8 +336,8 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur);/* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur);/* output cursor */
/*
* Compute first and last byte offsets for the fields given.
@@ -460,8 +478,12 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
-unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
+unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
+ unsigned long long records);
+unsigned long long xfs_btree_calc_size(const unsigned int *limits,
+ unsigned long long records);
+unsigned int xfs_btree_space_to_height(const unsigned int *limits,
+ unsigned long long blocks);
/*
* Return codes for the query range iterator function are 0 to continue
@@ -527,7 +549,7 @@ struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
/* Does this cursor point to the last block in the given level? */
static inline bool
xfs_btree_islastblock(
- xfs_btree_cur_t *cur,
+ struct xfs_btree_cur *cur,
int level)
{
struct xfs_btree_block *block;
@@ -558,4 +580,27 @@ void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
const union xfs_btree_key *src_key, int numkeys);
+static inline struct xfs_btree_cur *
+xfs_btree_alloc_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_btnum_t btnum,
+ uint8_t maxlevels,
+ struct kmem_cache *cache)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_maxlevels = maxlevels;
+ cur->bc_cache = cache;
+
+ return cur;
+}
+
+int __init xfs_btree_init_cur_caches(void);
+void xfs_btree_destroy_cur_caches(void);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index ac9e80152b5c..dd75e208b543 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -657,12 +657,12 @@ xfs_btree_bload_compute_geometry(
* checking levels 0 and 1 here, so set bc_nlevels such that the btree
* code doesn't interpret either as the root level.
*/
- cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1;
+ cur->bc_nlevels = cur->bc_maxlevels - 1;
xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
bbl->nr_records = nr_this_level = nr_records;
- for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
+ for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
uint64_t level_blocks;
uint64_t dontcare64;
unsigned int level = cur->bc_nlevels - 1;
@@ -703,6 +703,7 @@ xfs_btree_bload_compute_geometry(
* block-based btree level.
*/
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
xfs_btree_bload_level_geometry(cur, bbl, level,
nr_this_level, &avg_per_block,
&level_blocks, &dontcare64);
@@ -718,13 +719,14 @@ xfs_btree_bload_compute_geometry(
/* Otherwise, we need another level of btree. */
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
}
nr_blocks += level_blocks;
nr_this_level = level_blocks;
}
- if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS)
+ if (cur->bc_nlevels > cur->bc_maxlevels)
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index c062e2c85178..dd7a2dbce1d1 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -72,7 +72,7 @@ STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *save_blk);
-kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */
/*
* Allocate a dir-state structure.
@@ -84,7 +84,7 @@ xfs_da_state_alloc(
{
struct xfs_da_state *state;
- state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+ state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
state->args = args;
state->mp = args->dp->i_mount;
return state;
@@ -113,7 +113,7 @@ xfs_da_state_free(xfs_da_state_t *state)
#ifdef DEBUG
memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
- kmem_cache_free(xfs_da_state_zone, state);
+ kmem_cache_free(xfs_da_state_cache, state);
}
static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ad5dd324631a..0faf7d9ac241 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -9,7 +9,6 @@
struct xfs_inode;
struct xfs_trans;
-struct zone;
/*
* Directory/attribute geometry information. There will be one of these for each
@@ -227,6 +226,6 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
-extern struct kmem_zone *xfs_da_state_zone;
+extern struct kmem_cache *xfs_da_state_cache;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eff4a127188e..0805ade2d300 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -18,6 +18,12 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+
+static struct kmem_cache *xfs_defer_pending_cache;
/*
* Deferred Operations in XFS
@@ -232,23 +238,20 @@ xfs_defer_trans_abort(
}
}
-/* Roll a transaction so we can do some deferred op processing. */
-STATIC int
-xfs_defer_trans_roll(
- struct xfs_trans **tpp)
+/*
+ * Capture resources that the caller said not to release ("held") when the
+ * transaction commits. Caller is responsible for zero-initializing @dres.
+ */
+static int
+xfs_defer_save_resources(
+ struct xfs_defer_resources *dres,
+ struct xfs_trans *tp)
{
- struct xfs_trans *tp = *tpp;
struct xfs_buf_log_item *bli;
struct xfs_inode_log_item *ili;
struct xfs_log_item *lip;
- struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS];
- struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES];
- unsigned int ordered = 0; /* bitmap */
- int bpcount = 0, ipcount = 0;
- int i;
- int error;
- BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS);
+ BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
list_for_each_entry(lip, &tp->t_items, li_trans) {
switch (lip->li_type) {
@@ -256,28 +259,29 @@ xfs_defer_trans_roll(
bli = container_of(lip, struct xfs_buf_log_item,
bli_item);
if (bli->bli_flags & XFS_BLI_HOLD) {
- if (bpcount >= XFS_DEFER_OPS_NR_BUFS) {
+ if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
ASSERT(0);
return -EFSCORRUPTED;
}
if (bli->bli_flags & XFS_BLI_ORDERED)
- ordered |= (1U << bpcount);
+ dres->dr_ordered |=
+ (1U << dres->dr_bufs);
else
xfs_trans_dirty_buf(tp, bli->bli_buf);
- bplist[bpcount++] = bli->bli_buf;
+ dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
}
break;
case XFS_LI_INODE:
ili = container_of(lip, struct xfs_inode_log_item,
ili_item);
if (ili->ili_lock_flags == 0) {
- if (ipcount >= XFS_DEFER_OPS_NR_INODES) {
+ if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
ASSERT(0);
return -EFSCORRUPTED;
}
xfs_trans_log_inode(tp, ili->ili_inode,
XFS_ILOG_CORE);
- iplist[ipcount++] = ili->ili_inode;
+ dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
}
break;
default:
@@ -285,7 +289,43 @@ xfs_defer_trans_roll(
}
}
- trace_xfs_defer_trans_roll(tp, _RET_IP_);
+ return 0;
+}
+
+/* Attach the held resources to the transaction. */
+static void
+xfs_defer_restore_resources(
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ /* Rejoin the joined inodes. */
+ for (i = 0; i < dres->dr_inos; i++)
+ xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
+
+ /* Rejoin the buffers and dirty them so the log moves forward. */
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_trans_bjoin(tp, dres->dr_bp[i]);
+ if (dres->dr_ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
+ xfs_trans_bhold(tp, dres->dr_bp[i]);
+ }
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tpp)
+{
+ struct xfs_defer_resources dres = { };
+ int error;
+
+ error = xfs_defer_save_resources(&dres, *tpp);
+ if (error)
+ return error;
+
+ trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
/*
* Roll the transaction. Rolling always given a new transaction (even
@@ -295,22 +335,11 @@ xfs_defer_trans_roll(
* happened.
*/
error = xfs_trans_roll(tpp);
- tp = *tpp;
- /* Rejoin the joined inodes. */
- for (i = 0; i < ipcount; i++)
- xfs_trans_ijoin(tp, iplist[i], 0);
-
- /* Rejoin the buffers and dirty them so the log moves forward. */
- for (i = 0; i < bpcount; i++) {
- xfs_trans_bjoin(tp, bplist[i]);
- if (ordered & (1U << i))
- xfs_trans_ordered_buf(tp, bplist[i]);
- xfs_trans_bhold(tp, bplist[i]);
- }
+ xfs_defer_restore_resources(*tpp, &dres);
if (error)
- trace_xfs_defer_trans_roll_error(tp, error);
+ trace_xfs_defer_trans_roll_error(*tpp, error);
return error;
}
@@ -342,7 +371,7 @@ xfs_defer_cancel_list(
ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
- kmem_free(dfp);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
}
}
@@ -439,7 +468,7 @@ xfs_defer_finish_one(
/* Done with the dfp, free it. */
list_del(&dfp->dfp_list);
- kmem_free(dfp);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
out:
if (ops->finish_cleanup)
ops->finish_cleanup(tp, state, error);
@@ -573,8 +602,8 @@ xfs_defer_add(
dfp = NULL;
}
if (!dfp) {
- dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
- KM_NOFS);
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
dfp->dfp_type = type;
dfp->dfp_intent = NULL;
dfp->dfp_done = NULL;
@@ -627,10 +656,11 @@ xfs_defer_move(
*/
static struct xfs_defer_capture *
xfs_defer_ops_capture(
- struct xfs_trans *tp,
- struct xfs_inode *capture_ip)
+ struct xfs_trans *tp)
{
struct xfs_defer_capture *dfc;
+ unsigned short i;
+ int error;
if (list_empty(&tp->t_dfops))
return NULL;
@@ -654,27 +684,48 @@ xfs_defer_ops_capture(
/* Preserve the log reservation size. */
dfc->dfc_logres = tp->t_log_res;
+ error = xfs_defer_save_resources(&dfc->dfc_held, tp);
+ if (error) {
+ /*
+ * Resource capture should never fail, but if it does, we
+ * still have to shut down the log and release things
+ * properly.
+ */
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
/*
- * Grab an extra reference to this inode and attach it to the capture
- * structure.
+ * Grab extra references to the inodes and buffers because callers are
+ * expected to release their held references after we commit the
+ * transaction.
*/
- if (capture_ip) {
- ihold(VFS_I(capture_ip));
- dfc->dfc_capture_ip = capture_ip;
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
+ ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
}
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
+
return dfc;
}
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_release(
+xfs_defer_ops_capture_free(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
+ unsigned short i;
+
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
- if (dfc->dfc_capture_ip)
- xfs_irele(dfc->dfc_capture_ip);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
+
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++)
+ xfs_irele(dfc->dfc_held.dr_ip[i]);
+
kmem_free(dfc);
}
@@ -689,24 +740,21 @@ xfs_defer_ops_release(
int
xfs_defer_ops_capture_and_commit(
struct xfs_trans *tp,
- struct xfs_inode *capture_ip,
struct list_head *capture_list)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_defer_capture *dfc;
int error;
- ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
-
/* If we don't capture anything, commit transaction and exit. */
- dfc = xfs_defer_ops_capture(tp, capture_ip);
+ dfc = xfs_defer_ops_capture(tp);
if (!dfc)
return xfs_trans_commit(tp);
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_release(mp, dfc);
+ xfs_defer_ops_capture_free(mp, dfc);
return error;
}
@@ -724,17 +772,19 @@ void
xfs_defer_ops_continue(
struct xfs_defer_capture *dfc,
struct xfs_trans *tp,
- struct xfs_inode **captured_ipp)
+ struct xfs_defer_resources *dres)
{
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
/* Lock and join the captured inode to the new transaction. */
- if (dfc->dfc_capture_ip) {
- xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
- }
- *captured_ipp = dfc->dfc_capture_ip;
+ if (dfc->dfc_held.dr_inos == 2)
+ xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
+ dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
+ else if (dfc->dfc_held.dr_inos == 1)
+ xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+ xfs_defer_restore_resources(tp, &dfc->dfc_held);
+ memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
/* Move captured dfops chain and state to the transaction. */
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -742,3 +792,82 @@ xfs_defer_ops_continue(
kmem_free(dfc);
}
+
+/* Release the resources captured and continued during recovery. */
+void
+xfs_defer_resources_rele(
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ for (i = 0; i < dres->dr_inos; i++) {
+ xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
+ xfs_irele(dres->dr_ip[i]);
+ dres->dr_ip[i] = NULL;
+ }
+
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_buf_relse(dres->dr_bp[i]);
+ dres->dr_bp[i] = NULL;
+ }
+
+ dres->dr_inos = 0;
+ dres->dr_bufs = 0;
+ dres->dr_ordered = 0;
+}
+
+static inline int __init
+xfs_defer_init_cache(void)
+{
+ xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
+ sizeof(struct xfs_defer_pending),
+ 0, 0, NULL);
+
+ return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
+}
+
+static inline void
+xfs_defer_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_defer_pending_cache);
+ xfs_defer_pending_cache = NULL;
+}
+
+/* Set up caches for deferred work items. */
+int __init
+xfs_defer_init_item_caches(void)
+{
+ int error;
+
+ error = xfs_defer_init_cache();
+ if (error)
+ return error;
+ error = xfs_rmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_refcount_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_bmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_extfree_intent_init_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_defer_destroy_item_caches();
+ return error;
+}
+
+/* Destroy all the deferred work item caches, if they've been allocated. */
+void
+xfs_defer_destroy_item_caches(void)
+{
+ xfs_extfree_intent_destroy_cache();
+ xfs_bmap_intent_destroy_cache();
+ xfs_refcount_intent_destroy_cache();
+ xfs_rmap_intent_destroy_cache();
+ xfs_defer_destroy_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 05472f71fffe..7bb8a31ad65b 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -65,6 +65,30 @@ extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
/*
+ * Deferred operation item relogging limits.
+ */
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
+
+/* Resources that must be held across a transaction roll. */
+struct xfs_defer_resources {
+ /* held buffers */
+ struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS];
+
+ /* inodes with no unlock flags */
+ struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES];
+
+ /* number of held buffers */
+ unsigned short dr_bufs;
+
+ /* bitmap of ordered buffers */
+ unsigned short dr_ordered;
+
+ /* number of held inodes */
+ unsigned short dr_inos;
+};
+
+/*
* This structure enables a dfops user to detach the chain of deferred
* operations from a transaction so that they can be continued later.
*/
@@ -83,11 +107,7 @@ struct xfs_defer_capture {
/* Log reservation saved from the transaction. */
unsigned int dfc_logres;
- /*
- * An inode reference that must be maintained to complete the deferred
- * work.
- */
- struct xfs_inode *dfc_capture_ip;
+ struct xfs_defer_resources dfc_held;
};
/*
@@ -95,9 +115,14 @@ struct xfs_defer_capture {
* This doesn't normally happen except log recovery.
*/
int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
- struct xfs_inode *capture_ip, struct list_head *capture_list);
+ struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
- struct xfs_inode **captured_ipp);
-void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+ struct xfs_defer_resources *dres);
+void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+ struct xfs_defer_capture *d);
+void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+
+int __init xfs_defer_init_item_caches(void);
+void xfs_defer_destroy_item_caches(void);
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index deeb74becabc..15a362e2f5ea 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -22,7 +22,7 @@ xfs_calc_dquots_per_chunk(
unsigned int nbblks) /* basic block units */
{
ASSERT(nbblks > 0);
- return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
+ return BBTOB(nbblks) / sizeof(struct xfs_dqblk);
}
/*
@@ -127,7 +127,7 @@ xfs_dqblk_repair(
* Typically, a repair is only requested by quotacheck.
*/
ASSERT(id != -1);
- memset(dqb, 0, sizeof(xfs_dqblk_t));
+ memset(dqb, 0, sizeof(struct xfs_dqblk));
dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 2d7057b7984b..d665c04e69dd 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -184,7 +184,7 @@ typedef struct xfs_sb {
* Superblock - on disk version. Must match the in core version above.
* Must be padded to 64 bit alignment.
*/
-typedef struct xfs_dsb {
+struct xfs_dsb {
__be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
__be32 sb_blocksize; /* logical block size, bytes */
__be64 sb_dblocks; /* number of data blocks */
@@ -263,7 +263,7 @@ typedef struct xfs_dsb {
uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
-} xfs_dsb_t;
+};
/*
* Misc. Flags - warning - these will be cleared by xfs_repair unless
@@ -780,7 +780,7 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
* padding field for v3 inodes.
*/
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-typedef struct xfs_dinode {
+struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
@@ -825,7 +825,7 @@ typedef struct xfs_dinode {
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
-} xfs_dinode_t;
+};
#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
@@ -1215,7 +1215,7 @@ struct xfs_disk_dquot {
* This is what goes on disk. This is separated from the xfs_disk_dquot because
* carrying the unnecessary padding would be a waste of memory.
*/
-typedef struct xfs_dqblk {
+struct xfs_dqblk {
struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
char dd_fill[4];/* filling for posterity */
@@ -1225,7 +1225,7 @@ typedef struct xfs_dqblk {
__be32 dd_crc; /* checksum */
__be64 dd_lsn; /* last modification in log */
uuid_t dd_uuid; /* location information */
-} xfs_dqblk_t;
+};
#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index bde2b4c64dbe..c43877c8a279 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -268,6 +268,8 @@ typedef struct xfs_fsop_resblks {
*/
#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */
#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */
+#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE)
+#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE)
/* keep the maximum size under 2^31 by a small amount */
#define XFS_MAX_LOG_BYTES \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 994ad783d407..b418fe0c0679 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1827,7 +1827,7 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks,
&XFS_RMAP_OINFO_INODES);
return;
@@ -1872,7 +1872,7 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
contigblk, &XFS_RMAP_OINFO_INODES);
/* reset range to current bit and carry on... */
@@ -2793,6 +2793,7 @@ xfs_ialloc_setup_geometry(
inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
inodes);
+ ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
/*
* Set the maximum inode count for this filesystem, being careful not
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 27190840c5d8..b2ad2fdc40f5 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -22,6 +22,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_inobt_cur_cache;
+
STATIC int
xfs_inobt_get_minrecs(
struct xfs_btree_cur *cur,
@@ -432,10 +434,8 @@ xfs_inobt_init_common(
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
if (btnum == XFS_BTNUM_INO) {
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
cur->bc_ops = &xfs_inobt_ops;
@@ -444,8 +444,6 @@ xfs_inobt_init_common(
cur->bc_ops = &xfs_finobt_ops;
}
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -530,6 +528,17 @@ xfs_inobt_commit_staged_btree(
}
}
+/* Calculate number of records in an inode btree block. */
+static inline unsigned int
+xfs_inobt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
+
/*
* Calculate number of records in an inobt btree block.
*/
@@ -540,10 +549,54 @@ xfs_inobt_maxrecs(
int leaf)
{
blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+ return xfs_inobt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_inobt_rec_t);
- return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+/*
+ * Maximum number of inode btree records per AG. Pretend that we can fill an
+ * entire AG completely full of inodes except for the AG headers.
+ */
+#define XFS_MAX_INODE_RECORDS \
+ ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \
+ XFS_INODES_PER_CHUNK
+
+/* Compute the max possible height for the inode btree. */
+static inline unsigned int
+xfs_inobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for the free inode btree. */
+static inline unsigned int
+xfs_finobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for either inode btree. */
+unsigned int
+xfs_iallocbt_maxlevels_ondisk(void)
+{
+ return max(xfs_inobt_maxlevels_ondisk(),
+ xfs_finobt_maxlevels_ondisk());
}
/*
@@ -761,3 +814,22 @@ xfs_iallocbt_calc_size(
{
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
}
+
+int __init
+xfs_inobt_init_cur_cache(void)
+{
+ xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur",
+ xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_inobt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_inobt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_inobt_cur_cache);
+ xfs_inobt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 8a322d402e61..26451cb76b98 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -75,4 +75,9 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_iallocbt_maxlevels_ondisk(void);
+
+int __init xfs_inobt_init_cur_cache(void);
+void xfs_inobt_destroy_cur_cache(void);
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 3932b4ebf903..cae9708c8587 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -51,9 +51,9 @@ xfs_inode_buf_verify(
agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
- xfs_agino_t unlinked_ino;
+ struct xfs_dinode *dip;
+ xfs_agino_t unlinked_ino;
+ int di_ok;
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1d174909f9bd..9149f4f796fc 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -26,7 +26,7 @@
#include "xfs_types.h"
#include "xfs_errortag.h"
-kmem_zone_t *xfs_ifork_zone;
+struct kmem_cache *xfs_ifork_cache;
void
xfs_init_local_fork(
@@ -67,10 +67,10 @@ xfs_init_local_fork(
*/
STATIC int
xfs_iformat_local(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork,
- int size)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork,
+ int size)
{
/*
* If the size is unreasonable, then something
@@ -162,8 +162,8 @@ xfs_iformat_extents(
*/
STATIC int
xfs_iformat_btree(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
int whichfork)
{
struct xfs_mount *mp = ip->i_mount;
@@ -284,7 +284,7 @@ xfs_ifork_alloc(
{
struct xfs_ifork *ifp;
- ifp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL);
ifp->if_format = format;
ifp->if_nextents = nextents;
return ifp;
@@ -325,7 +325,7 @@ xfs_iformat_attr_fork(
}
if (error) {
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
ip->i_afp = NULL;
}
return error;
@@ -580,8 +580,8 @@ xfs_iextents_copy(
*/
void
xfs_iflush_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
struct xfs_inode_log_item *iip,
int whichfork)
{
@@ -676,7 +676,7 @@ xfs_ifork_init_cow(
if (ip->i_cowfp)
return;
- ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone,
+ ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
GFP_NOFS | __GFP_NOFAIL);
ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index a6f7897b6887..3d64a3acb0ed 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -221,7 +221,7 @@ static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
xfs_iext_get_extent((ifp), (ext), (got)); \
xfs_iext_next((ifp), (ext)))
-extern struct kmem_zone *xfs_ifork_zone;
+extern struct kmem_cache *xfs_ifork_cache;
extern void xfs_ifork_init_cow(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index e5d767a7fc5d..327ba25e9e17 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -24,6 +24,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+struct kmem_cache *xfs_refcount_intent_cache;
+
/* Allowable refcount adjustment amounts. */
enum xfs_refc_adjust_op {
XFS_REFCOUNT_ADJUST_INCREASE = 1,
@@ -916,8 +918,7 @@ xfs_refcount_adjust_extents(
struct xfs_btree_cur *cur,
xfs_agblock_t *agbno,
xfs_extlen_t *aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
struct xfs_refcount_irec ext, tmp;
int error;
@@ -974,8 +975,8 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno,
- tmp.rc_blockcount, oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ tmp.rc_blockcount, NULL);
}
(*agbno) += tmp.rc_blockcount;
@@ -1019,8 +1020,8 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount,
- oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ ext.rc_blockcount, NULL);
}
skip:
@@ -1048,8 +1049,7 @@ xfs_refcount_adjust(
xfs_extlen_t aglen,
xfs_agblock_t *new_agbno,
xfs_extlen_t *new_aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
bool shape_changed;
int shape_changes = 0;
@@ -1092,8 +1092,7 @@ xfs_refcount_adjust(
cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
- adj, oinfo);
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
if (error)
goto out_error;
@@ -1188,12 +1187,12 @@ xfs_refcount_finish_one(
switch (type) {
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL);
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE);
*new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL);
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE);
*new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_ALLOC_COW:
@@ -1235,8 +1234,8 @@ __xfs_refcount_add(
type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
blockcount);
- ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
- KM_NOFS);
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
@@ -1742,7 +1741,7 @@ xfs_refcount_recover_cow_leftovers(
rr->rr_rrec.rc_blockcount);
/* Free the block. */
- xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+ xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
error = xfs_trans_commit(tp);
if (error)
@@ -1782,3 +1781,20 @@ xfs_refcount_has_record(
return xfs_btree_has_record(cur, &low, &high, exists);
}
+
+int __init
+xfs_refcount_intent_init_cache(void)
+{
+ xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent",
+ sizeof(struct xfs_refcount_intent),
+ 0, 0, NULL);
+
+ return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_refcount_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_refcount_intent_cache);
+ xfs_refcount_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 02cb3aa405be..9eb01edbd89d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -32,8 +32,8 @@ enum xfs_refcount_intent_type {
struct xfs_refcount_intent {
struct list_head ri_list;
enum xfs_refcount_intent_type ri_type;
- xfs_fsblock_t ri_startblock;
xfs_extlen_t ri_blockcount;
+ xfs_fsblock_t ri_startblock;
};
void xfs_refcount_increase_extent(struct xfs_trans *tp,
@@ -83,4 +83,9 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
+extern struct kmem_cache *xfs_refcount_intent_cache;
+
+int __init xfs_refcount_intent_init_cache(void);
+void xfs_refcount_intent_destroy_cache(void);
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 1ef9b99962ab..d14c1720b0fb 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -21,6 +21,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_refcountbt_cur_cache;
+
static struct xfs_btree_cur *
xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
@@ -322,11 +324,8 @@ xfs_refcountbt_init_common(
ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = XFS_BTNUM_REFC;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -396,6 +395,18 @@ xfs_refcountbt_commit_staged_btree(
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
}
+/* Calculate number of records in a refcount btree block. */
+static inline unsigned int
+xfs_refcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
+
/*
* Calculate the number of records in a refcount btree block.
*/
@@ -405,11 +416,22 @@ xfs_refcountbt_maxrecs(
bool leaf)
{
blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+ return xfs_refcountbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_refcount_rec);
- return blocklen / (sizeof(struct xfs_refcount_key) +
- sizeof(xfs_refcount_ptr_t));
+/* Compute the max possible height of the maximally sized refcount btree. */
+unsigned int
+xfs_refcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of a refcount btree. */
@@ -417,8 +439,14 @@ void
xfs_refcountbt_compute_maxlevels(
struct xfs_mount *mp)
{
+ if (!xfs_has_reflink(mp)) {
+ mp->m_refc_maxlevels = 0;
+ return;
+ }
+
mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(
mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+ ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -488,3 +516,22 @@ xfs_refcountbt_calc_reserves(
return error;
}
+
+int __init
+xfs_refcountbt_init_cur_cache(void)
+{
+ xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur",
+ xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_refcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_refcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_refcountbt_cur_cache);
+ xfs_refcountbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index bd9ed9e1e41f..d66b37259bed 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -65,4 +65,9 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_refcountbt_maxlevels_ondisk(void);
+
+int __init xfs_refcountbt_init_cur_cache(void);
+void xfs_refcountbt_destroy_cur_cache(void);
+
#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index f45929b1b94a..cd322174dbff 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -24,6 +24,8 @@
#include "xfs_inode.h"
#include "xfs_ag.h"
+struct kmem_cache *xfs_rmap_intent_cache;
+
/*
* Lookup the first record less than or equal to [bno, len, owner, offset]
* in the btree given by cur.
@@ -2485,7 +2487,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
@@ -2779,3 +2781,20 @@ const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
.oi_owner = XFS_RMAP_OWN_COW,
};
+
+int __init
+xfs_rmap_intent_init_cache(void)
+{
+ xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent",
+ sizeof(struct xfs_rmap_intent),
+ 0, 0, NULL);
+
+ return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_rmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_rmap_intent_cache);
+ xfs_rmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index fd67904ed446..b718ebeda372 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -159,8 +159,8 @@ enum xfs_rmap_intent_type {
struct xfs_rmap_intent {
struct list_head ri_list;
enum xfs_rmap_intent_type ri_type;
- uint64_t ri_owner;
int ri_whichfork;
+ uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
};
@@ -215,4 +215,9 @@ extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+extern struct kmem_cache *xfs_rmap_intent_cache;
+
+int __init xfs_rmap_intent_init_cache(void);
+void xfs_rmap_intent_destroy_cache(void);
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index b7dbbfb3aeed..69e104d0277f 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -22,6 +22,8 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+static struct kmem_cache *xfs_rmapbt_cur_cache;
+
/*
* Reverse map btree.
*
@@ -451,13 +453,10 @@ xfs_rmapbt_init_common(
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
/* Overlapping btree; 2 keys per pointer. */
- cur->bc_btnum = XFS_BTNUM_RMAP;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
cur->bc_ops = &xfs_rmapbt_ops;
@@ -522,6 +521,18 @@ xfs_rmapbt_commit_staged_btree(
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
}
+/* Calculate number of records in a reverse mapping btree block. */
+static inline unsigned int
+xfs_rmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
/*
* Calculate number of records in an rmap btree block.
*/
@@ -531,11 +542,33 @@ xfs_rmapbt_maxrecs(
int leaf)
{
blocklen -= XFS_RMAP_BLOCK_LEN;
+ return xfs_rmapbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_rmap_rec);
- return blocklen /
- (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+/* Compute the max possible height for reverse mapping btrees. */
+unsigned int
+xfs_rmapbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. However, we're likely to run
+ * out of blocks in the AG long before that happens, which means that
+ * we must compute the max height based on what the btree will look
+ * like if it consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of an rmap btree. */
@@ -543,26 +576,36 @@ void
xfs_rmapbt_compute_maxlevels(
struct xfs_mount *mp)
{
- /*
- * On a non-reflink filesystem, the maximum number of rmap
- * records is the number of blocks in the AG, hence the max
- * rmapbt height is log_$maxrecs($agblocks). However, with
- * reflink each AG block can have up to 2^32 (per the refcount
- * record format) owners, which means that theoretically we
- * could face up to 2^64 rmap records.
- *
- * That effectively means that the max rmapbt height must be
- * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
- * blocks to feed the rmapbt long before the rmapbt reaches
- * maximum height. The reflink code uses ag_resv_critical to
- * disallow reflinking when less than 10% of the per-AG metadata
- * block reservation since the fallback is a regular file copy.
- */
- if (xfs_has_reflink(mp))
- mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
- else
+ if (!xfs_has_rmapbt(mp)) {
+ mp->m_rmap_maxlevels = 0;
+ return;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * Compute the asymptotic maxlevels for an rmap btree on a
+ * filesystem that supports reflink.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32
+ * (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records.
+ * However, we're likely to run out of blocks in the AG long
+ * before that happens, which means that we must compute the
+ * max height based on what the btree will look like if it
+ * consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
+ mp->m_sb.sb_agblocks);
+ } else {
+ /*
+ * If there's no block sharing, compute the maximum rmapbt
+ * height assuming one rmap record per AG block.
+ */
mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ }
+ ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -633,3 +676,22 @@ xfs_rmapbt_calc_reserves(
return error;
}
+
+int __init
+xfs_rmapbt_init_cur_cache(void)
+{
+ xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rmapbt_cur_cache);
+ xfs_rmapbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index f2eee6572af4..3244715dd111 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -59,4 +59,9 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+unsigned int xfs_rmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rmapbt_init_cur_cache(void);
+void xfs_rmapbt_destroy_cur_cache(void);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index e58349be78bd..f4e84aa1d50a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -495,7 +495,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
static void
__xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from,
+ struct xfs_dsb *from,
bool convert_xquota)
{
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
@@ -571,7 +571,7 @@ __xfs_sb_from_disk(
void
xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from)
+ struct xfs_dsb *from)
{
__xfs_sb_from_disk(to, from, true);
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5e300daa2559..6f83d9b306ee 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -70,7 +70,7 @@ xfs_allocfree_log_count(
{
uint blocks;
- blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
+ blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
if (xfs_has_reflink(mp))
@@ -814,6 +814,19 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
@@ -916,4 +929,7 @@ xfs_trans_resv_calc(
resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 50332be34388..87b31c69a773 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -17,6 +17,13 @@
/* Adding one rmap could split every level up to the top of the tree. */
#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+/*
+ * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled,
+ * so we must preserve this behavior to avoid changing the transaction space
+ * reservations and minimum log size calculations for existing filesystems.
+ */
+#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9
+
/* Blocks we might need to add "b" rmaps to a tree. */
#define XFS_NRMAPADD_SPACE_RES(mp, b)\
(((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
@@ -74,7 +81,7 @@
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
- (2 * (mp)->m_ag_maxlevels)
+ (2 * (mp)->m_alloc_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
#define XFS_LINK_SPACE_RES(mp,nl) \
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index ae3c9f6e2c69..bed798792226 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -555,11 +555,11 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (xfs_has_rmapbt(mp)) {
@@ -568,7 +568,7 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -578,7 +578,7 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_refcount_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_refc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -850,6 +850,7 @@ xchk_agi(
struct xfs_mount *mp = sc->mp;
struct xfs_agi *agi;
struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(sc->mp);
xfs_agnumber_t agno = sc->sm->sm_agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
@@ -880,7 +881,7 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
if (xfs_has_finobt(mp)) {
@@ -889,7 +890,7 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_free_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 0f8deee66f15..d7bfed52f4cd 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -122,7 +122,7 @@ xrep_check_btree_root(
xfs_agnumber_t agno = sc->sm->sm_agno;
return xfs_verify_agbno(mp, agno, fab->root) &&
- fab->height <= XFS_BTREE_MAXLEVELS;
+ fab->height <= fab->maxlevels;
}
/*
@@ -339,18 +339,22 @@ xrep_agf(
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_bnobt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_cntbt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
+ .maxlevels = sc->mp->m_rmap_maxlevels,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
+ .maxlevels = sc->mp->m_refc_maxlevels,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -881,10 +885,12 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index d6d24c866bc4..b89bf9de9b1c 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -222,21 +222,21 @@ out:
* 1 2 3
*
* Pretend for this example that each leaf block has 100 btree records. For
- * the first btree record, we'll observe that bc_ptrs[0] == 1, so we record
- * that we saw block 1. Then we observe that bc_ptrs[1] == 1, so we record
- * block 4. The list is [1, 4].
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
*
- * For the second btree record, we see that bc_ptrs[0] == 2, so we exit the
- * loop. The list remains [1, 4].
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
*
* For the 101st btree record, we've moved onto leaf block 2. Now
- * bc_ptrs[0] == 1 again, so we record that we saw block 2. We see that
- * bc_ptrs[1] == 2, so we exit the loop. The list is now [1, 4, 2].
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
*
- * For the 102nd record, bc_ptrs[0] == 2, so we continue.
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
*
- * For the 201st record, we've moved on to leaf block 3. bc_ptrs[0] == 1, so
- * we add 3 to the list. Now it is [1, 4, 2, 3].
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
*
* For the 300th record we just exit, with the list being [1, 4, 2, 3].
*/
@@ -256,7 +256,7 @@ xbitmap_set_btcur_path(
int i;
int error;
- for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
xfs_btree_get_block(cur, i, &bp);
if (!bp)
continue;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 017da9ceaee9..a4cbbc346f60 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -402,7 +402,7 @@ xchk_bmapbt_rec(
* the root since the verifiers don't do that.
*/
if (xfs_has_crc(bs->cur->bc_mp) &&
- bs->cur->bc_ptrs[0] == 1) {
+ bs->cur->bc_levels[0].ptr == 1) {
for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
block = xfs_btree_get_block(bs->cur, i, &bp);
owner = be64_to_cpu(block->bb_u.l.bb_owner);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index eccb855dc904..39dd46f038fe 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -136,14 +136,14 @@ xchk_btree_rec(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, 0, &bp);
- rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
trace_xchk_btree_rec(bs->sc, cur, 0);
/* If this isn't the first record, are they in order? */
- if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ if (cur->bc_levels[0].ptr > 1 &&
+ !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
xchk_btree_set_corrupt(bs->sc, cur, 0);
- bs->firstrec = false;
memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
if (cur->bc_nlevels == 1)
@@ -152,7 +152,7 @@ xchk_btree_rec(
/* Is this at least as large as the parent low key? */
cur->bc_ops->init_key_from_rec(&key, rec);
keyblock = xfs_btree_get_block(cur, 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
@@ -161,7 +161,7 @@ xchk_btree_rec(
/* Is this no larger than the parent high key? */
cur->bc_ops->init_high_key_from_rec(&hkey, rec);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
}
@@ -183,23 +183,22 @@ xchk_btree_key(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, level, &bp);
- key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+ key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
trace_xchk_btree_key(bs->sc, cur, level);
/* If this isn't the first key, are they in order? */
- if (!bs->firstkey[level] &&
- !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key))
+ if (cur->bc_levels[level].ptr > 1 &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
xchk_btree_set_corrupt(bs->sc, cur, level);
- bs->firstkey[level] = false;
- memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len);
+ memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
if (level + 1 >= cur->bc_nlevels)
return;
/* Is this at least as large as the parent low key? */
keyblock = xfs_btree_get_block(cur, level + 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
@@ -207,8 +206,9 @@ xchk_btree_key(
return;
/* Is this no larger than the parent high key? */
- key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
}
@@ -291,7 +291,7 @@ xchk_btree_block_check_sibling(
/* Compare upper level pointer to sibling pointer. */
pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
- pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
if (!xchk_btree_ptr_ok(bs, level + 1, pp))
goto out;
if (pbp)
@@ -596,7 +596,7 @@ xchk_btree_block_keys(
/* Obtain the parent's copy of the keys for this block. */
parent_block = xfs_btree_get_block(cur, level + 1, &bp);
- parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
@@ -607,7 +607,7 @@ xchk_btree_block_keys(
/* Get high keys */
high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
- high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1],
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
@@ -627,35 +627,39 @@ xchk_btree(
const struct xfs_owner_info *oinfo,
void *private)
{
- struct xchk_btree bs = {
- .cur = cur,
- .scrub_rec = scrub_fn,
- .oinfo = oinfo,
- .firstrec = true,
- .private = private,
- .sc = sc,
- };
union xfs_btree_ptr ptr;
+ struct xchk_btree *bs;
union xfs_btree_ptr *pp;
union xfs_btree_rec *recp;
struct xfs_btree_block *block;
- int level;
struct xfs_buf *bp;
struct check_owner *co;
struct check_owner *n;
- int i;
+ size_t cur_sz;
+ int level;
int error = 0;
- /* Initialize scrub state */
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
- bs.firstkey[i] = true;
- INIT_LIST_HEAD(&bs.to_check);
-
- /* Don't try to check a tree with a height we can't handle. */
- if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) {
+ /*
+ * Allocate the btree scrub context from the heap, because this
+ * structure can get rather large. Don't let a caller feed us a
+ * totally absurd size.
+ */
+ cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
+ if (cur_sz > PAGE_SIZE) {
xchk_btree_set_corrupt(sc, cur, 0);
- goto out;
+ return 0;
}
+ bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ if (!bs)
+ return -ENOMEM;
+ bs->cur = cur;
+ bs->scrub_rec = scrub_fn;
+ bs->oinfo = oinfo;
+ bs->private = private;
+ bs->sc = sc;
+
+ /* Initialize scrub state */
+ INIT_LIST_HEAD(&bs->to_check);
/*
* Load the root of the btree. The helper function absorbs
@@ -663,79 +667,82 @@ xchk_btree(
*/
level = cur->bc_nlevels - 1;
cur->bc_ops->init_ptr_from_cur(cur, &ptr);
- if (!xchk_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr))
+ if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
- error = xchk_btree_get_block(&bs, level, &ptr, &block, &bp);
+ error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
if (level == 0) {
/* End of leaf, pop back towards the root. */
- if (cur->bc_ptrs[level] >
+ if (cur->bc_levels[level].ptr >
be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Records in order for scrub? */
- xchk_btree_rec(&bs);
+ xchk_btree_rec(bs);
/* Call out to the record checker. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
- error = bs.scrub_rec(&bs, recp);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
+ error = bs->scrub_rec(bs, recp);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Keys in order for scrub? */
- xchk_btree_key(&bs, level);
+ xchk_btree_key(bs, level);
/* Drill another level deeper. */
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
- if (!xchk_btree_ptr_ok(&bs, level, pp)) {
- cur->bc_ptrs[level]++;
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
+ if (!xchk_btree_ptr_ok(bs, level, pp)) {
+ cur->bc_levels[level].ptr++;
continue;
}
level--;
- error = xchk_btree_get_block(&bs, level, pp, &block, &bp);
+ error = xchk_btree_get_block(bs, level, pp, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
}
out:
/* Process deferred owner checks on btree blocks. */
- list_for_each_entry_safe(co, n, &bs.to_check, list) {
- if (!error && bs.cur)
- error = xchk_btree_check_block_owner(&bs,
- co->level, co->daddr);
+ list_for_each_entry_safe(co, n, &bs->to_check, list) {
+ if (!error && bs->cur)
+ error = xchk_btree_check_block_owner(bs, co->level,
+ co->daddr);
list_del(&co->list);
kmem_free(co);
}
+ kmem_free(bs);
return error;
}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index b7d2fc01fbf9..da61a53a0b61 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -39,11 +39,22 @@ struct xchk_btree {
/* internal scrub state */
union xfs_btree_rec lastrec;
- bool firstrec;
- union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
- bool firstkey[XFS_BTREE_MAXLEVELS];
struct list_head to_check;
+
+ /* this element must come last! */
+ union xfs_btree_key lastkey[];
};
+
+/*
+ * Calculate the size of a xchk_btree structure. There are nlevels-1 slots for
+ * keys because we track leaf records separately in lastrec.
+ */
+static inline size_t
+xchk_btree_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1);
+}
+
int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
void *private);
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 8a52514bc1ff..b962cfbbd92b 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -473,7 +473,7 @@ xchk_da_btree(
xchk_da_btree_rec_fn scrub_fn,
void *private)
{
- struct xchk_da_btree ds = {};
+ struct xchk_da_btree *ds;
struct xfs_mount *mp = sc->mp;
struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *key;
@@ -486,32 +486,35 @@ xchk_da_btree(
return 0;
/* Set up initial da state. */
- ds.dargs.dp = sc->ip;
- ds.dargs.whichfork = whichfork;
- ds.dargs.trans = sc->tp;
- ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
- ds.state = xfs_da_state_alloc(&ds.dargs);
- ds.sc = sc;
- ds.private = private;
+ ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ if (!ds)
+ return -ENOMEM;
+ ds->dargs.dp = sc->ip;
+ ds->dargs.whichfork = whichfork;
+ ds->dargs.trans = sc->tp;
+ ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->state = xfs_da_state_alloc(&ds->dargs);
+ ds->sc = sc;
+ ds->private = private;
if (whichfork == XFS_ATTR_FORK) {
- ds.dargs.geo = mp->m_attr_geo;
- ds.lowest = 0;
- ds.highest = 0;
+ ds->dargs.geo = mp->m_attr_geo;
+ ds->lowest = 0;
+ ds->highest = 0;
} else {
- ds.dargs.geo = mp->m_dir_geo;
- ds.lowest = ds.dargs.geo->leafblk;
- ds.highest = ds.dargs.geo->freeblk;
+ ds->dargs.geo = mp->m_dir_geo;
+ ds->lowest = ds->dargs.geo->leafblk;
+ ds->highest = ds->dargs.geo->freeblk;
}
- blkno = ds.lowest;
+ blkno = ds->lowest;
level = 0;
/* Find the root of the da tree, if present. */
- blks = ds.state->path.blk;
- error = xchk_da_btree_block(&ds, level, blkno);
+ blks = ds->state->path.blk;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out_state;
/*
- * We didn't find a block at ds.lowest, which means that there's
+ * We didn't find a block at ds->lowest, which means that there's
* no LEAF1/LEAFN tree (at least not where it's supposed to be),
* so jump out now.
*/
@@ -523,16 +526,16 @@ xchk_da_btree(
/* Handle leaf block. */
if (blks[level].magic != XFS_DA_NODE_MAGIC) {
/* End of leaf, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Dispatch record scrubbing. */
- error = scrub_fn(&ds, level);
+ error = scrub_fn(ds, level);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
@@ -545,17 +548,17 @@ xchk_da_btree(
/* End of node, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Hashes in order for scrub? */
- key = xchk_da_btree_node_entry(&ds, level);
- error = xchk_da_btree_hash(&ds, level, &key->hashval);
+ key = xchk_da_btree_node_entry(ds, level);
+ error = xchk_da_btree_hash(ds, level, &key->hashval);
if (error)
goto out;
@@ -564,11 +567,11 @@ xchk_da_btree(
level++;
if (level >= XFS_DA_NODE_MAXDEPTH) {
/* Too deep! */
- xchk_da_set_corrupt(&ds, level - 1);
+ xchk_da_set_corrupt(ds, level - 1);
break;
}
- ds.tree_level--;
- error = xchk_da_btree_block(&ds, level, blkno);
+ ds->tree_level--;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out;
if (blks[level].bp == NULL)
@@ -587,6 +590,7 @@ out:
}
out_state:
- xfs_da_state_free(ds.state);
+ xfs_da_state_free(ds->state);
+ kmem_free(ds);
return error;
}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 3bb152d52a07..840f74ec431c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -44,6 +44,9 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
+ /* in: maximum btree height */
+ unsigned int maxlevels;
+
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 51e4c61916d2..8d528d35b725 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -461,15 +461,10 @@ xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
{
- struct xfs_scrub sc = {
- .file = file,
- .sm = sm,
- };
+ struct xfs_scrub *sc;
struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
int error = 0;
- sc.mp = mp;
-
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
(sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
@@ -489,59 +484,68 @@ xfs_scrub_metadata(
xchk_experimental_warning(mp);
- sc.ops = &meta_scrub_ops[sm->sm_type];
- sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+ sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ if (!sc) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ sc->mp = mp;
+ sc->file = file;
+ sc->sm = sm;
+ sc->ops = &meta_scrub_ops[sm->sm_type];
+ sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
/*
* When repairs are allowed, prevent freezing or readonly remount while
* scrub is running with a real transaction.
*/
if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
- error = mnt_want_write_file(sc.file);
+ error = mnt_want_write_file(sc->file);
if (error)
- goto out;
+ goto out_sc;
}
/* Set up for the operation. */
- error = sc.ops->setup(&sc);
+ error = sc->ops->setup(sc);
if (error)
goto out_teardown;
/* Scrub for errors. */
- error = sc.ops->scrub(&sc);
- if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
+ error = sc->ops->scrub(sc);
+ if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
/*
* Scrubbers return -EDEADLOCK to mean 'try harder'.
* Tear down everything we hold, then set up again with
* preparation for worst-case scenarios.
*/
- error = xchk_teardown(&sc, 0);
+ error = xchk_teardown(sc, 0);
if (error)
- goto out;
- sc.flags |= XCHK_TRY_HARDER;
+ goto out_sc;
+ sc->flags |= XCHK_TRY_HARDER;
goto retry_op;
} else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
goto out_teardown;
- xchk_update_health(&sc);
+ xchk_update_health(sc);
- if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc.flags & XREP_ALREADY_FIXED)) {
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED)) {
bool needs_fix;
/* Let debug users force us into the repair routines. */
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
+ needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN));
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
*/
if (!needs_fix) {
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
goto out_nofix;
}
@@ -549,26 +553,28 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(&sc);
+ error = xrep_attempt(sc);
if (error == -EAGAIN) {
/*
* Either the repair function succeeded or it couldn't
* get all the resources it needs; either way, we go
* back to the beginning and call the scrub function.
*/
- error = xchk_teardown(&sc, 0);
+ error = xchk_teardown(sc, 0);
if (error) {
xrep_failure(mp);
- goto out;
+ goto out_sc;
}
goto retry_op;
}
}
out_nofix:
- xchk_postmortem(&sc);
+ xchk_postmortem(sc);
out_teardown:
- error = xchk_teardown(&sc, error);
+ error = xchk_teardown(sc, error);
+out_sc:
+ kmem_free(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index c0ef53fe6611..b5f94676c37c 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -21,13 +21,14 @@ xchk_btree_cur_fsbno(
struct xfs_btree_cur *cur,
int level)
{
- if (level < cur->bc_nlevels && cur->bc_bufs[level])
+ if (level < cur->bc_nlevels && cur->bc_levels[level].bp)
return XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(cur->bc_bufs[level]));
- if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ xfs_buf_daddr(cur->bc_levels[level].bp));
+
+ if (level == cur->bc_nlevels - 1 &&
+ (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
- if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
- return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0);
+
return NULLFSBLOCK;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index a7bbb84f91a7..93ece6df02e3 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -348,7 +348,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->error = error;
__entry->ret_ip = ret_ip;
),
@@ -389,7 +389,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->error = error;
@@ -431,7 +431,7 @@ TRACE_EVENT(xchk_btree_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
@@ -471,7 +471,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
@@ -511,7 +511,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
),
TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 34fc6148032a..c8c15c3c3147 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -82,6 +82,7 @@ xfs_end_ioend(
struct iomap_ioend *ioend)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -97,18 +98,26 @@ xfs_end_ioend(
/*
* Just clean up the in-memory structures if the fs has been shut down.
*/
- if (xfs_is_shutdown(ip->i_mount)) {
+ if (xfs_is_shutdown(mp)) {
error = -EIO;
goto done;
}
/*
- * Clean up any COW blocks on an I/O error.
+ * Clean up all COW blocks and underlying data fork delalloc blocks on
+ * I/O error. The delalloc punch is required because this ioend was
+ * mapped to blocks in the COW fork and the associated pages are no
+ * longer dirty. If we don't remove delalloc blocks here, they become
+ * stale and can corrupt free space accounting on unmount.
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
+ xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, offset),
+ XFS_B_TO_FSB(mp, size));
+ }
goto done;
}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2b5da6218977..27265771f247 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -390,7 +390,7 @@ out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
if (dp->i_afp) {
xfs_idestroy_fork(dp->i_afp);
- kmem_cache_free(xfs_ifork_zone, dp->i_afp);
+ kmem_cache_free(xfs_ifork_cache, dp->i_afp);
dp->i_afp = NULL;
}
if (lock_mode)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 03159970133f..e1f4d7d5a011 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -25,8 +25,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_bui_zone;
-kmem_zone_t *xfs_bud_zone;
+struct kmem_cache *xfs_bui_cache;
+struct kmem_cache *xfs_bud_cache;
static const struct xfs_item_ops xfs_bui_item_ops;
@@ -39,7 +39,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_cache_free(xfs_bui_zone, buip);
+ kmem_cache_free(xfs_bui_cache, buip);
}
/*
@@ -138,7 +138,7 @@ xfs_bui_init(
{
struct xfs_bui_log_item *buip;
- buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL);
+ buip = kmem_cache_zalloc(xfs_bui_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -198,7 +198,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_cache_free(xfs_bud_zone, budp);
+ kmem_cache_free(xfs_bud_cache, budp);
}
static const struct xfs_item_ops xfs_bud_item_ops = {
@@ -215,7 +215,7 @@ xfs_trans_get_bud(
{
struct xfs_bud_log_item *budp;
- budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
&xfs_bud_item_ops);
budp->bud_buip = buip;
@@ -384,7 +384,7 @@ xfs_bmap_update_finish_item(
bmap->bi_bmap.br_blockcount = count;
return -EAGAIN;
}
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
return error;
}
@@ -404,7 +404,7 @@ xfs_bmap_update_cancel_item(
struct xfs_bmap_intent *bmap;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
}
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
@@ -532,7 +532,7 @@ xfs_bui_item_recover(
* Commit transaction, which frees the transaction and saves the inode
* for later replay activities.
*/
- error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
if (error)
goto err_unlock;
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index b9be62f8bd52..3fafd3881a0b 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -25,7 +25,7 @@
/* kernel only BUI/BUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -65,7 +65,7 @@ struct xfs_bud_log_item {
struct xfs_bud_log_format bud_format;
};
-extern struct kmem_zone *xfs_bui_zone;
-extern struct kmem_zone *xfs_bud_zone;
+extern struct kmem_cache *xfs_bui_cache;
+extern struct kmem_cache *xfs_bud_cache;
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 5fa6cd947dd4..631c5a61d89b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -20,7 +20,7 @@
#include "xfs_error.h"
#include "xfs_ag.h"
-static kmem_zone_t *xfs_buf_zone;
+static struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
@@ -220,7 +220,7 @@ _xfs_buf_alloc(
int i;
*bpp = NULL;
- bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
+ bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -247,7 +247,7 @@ _xfs_buf_alloc(
*/
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
- kmem_cache_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_cache, bp);
return error;
}
@@ -307,7 +307,7 @@ xfs_buf_free(
kmem_free(bp->b_addr);
xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_cache, bp);
}
static int
@@ -2258,12 +2258,12 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD,
NULL);
- if (!xfs_buf_zone)
+ if (!xfs_buf_cache)
goto out;
return 0;
@@ -2275,7 +2275,7 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- kmem_cache_destroy(xfs_buf_zone);
+ kmem_cache_destroy(xfs_buf_cache);
}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index b1ab100c09e1..a7a8e4528881 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -23,7 +23,7 @@
#include "xfs_log.h"
-kmem_zone_t *xfs_buf_item_zone;
+struct kmem_cache *xfs_buf_item_cache;
static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
@@ -804,7 +804,7 @@ xfs_buf_item_init(
return 0;
}
- bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
+ bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
@@ -825,7 +825,7 @@ xfs_buf_item_init(
map_size = DIV_ROUND_UP(chunks, NBWORD);
if (map_size > XFS_BLF_DATAMAP_SIZE) {
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
xfs_err(mp,
"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
map_size,
@@ -1002,7 +1002,7 @@ xfs_buf_item_free(
{
xfs_buf_item_free_format(bip);
kmem_free(bip->bli_item.li_lv_shadow);
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
}
/*
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 50aa0f5ef959..e11e9ef2338f 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -71,6 +71,6 @@ static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
-extern kmem_zone_t *xfs_buf_item_zone;
+extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index a476c7ef5d53..70ca5751b13e 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -603,7 +603,7 @@ xlog_recover_do_inode_buffer(
inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
- offsetof(xfs_dinode_t, di_next_unlinked);
+ offsetof(struct xfs_dinode, di_next_unlinked);
while (next_unlinked_offset >=
(reg_buf_offset + reg_buf_bytes)) {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index c15d61d47a06..e48ae227bb11 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -38,8 +38,8 @@
* otherwise by the lowest id first, see xfs_dqlock2.
*/
-struct kmem_zone *xfs_qm_dqtrxzone;
-static struct kmem_zone *xfs_qm_dqzone;
+struct kmem_cache *xfs_dqtrx_cache;
+static struct kmem_cache *xfs_dquot_cache;
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
@@ -57,7 +57,7 @@ xfs_qm_dqdestroy(
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
- kmem_cache_free(xfs_qm_dqzone, dqp);
+ kmem_cache_free(xfs_dquot_cache, dqp);
}
/*
@@ -458,7 +458,7 @@ xfs_dquot_alloc(
{
struct xfs_dquot *dqp;
- dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL);
+ dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL);
dqp->q_type = type;
dqp->q_id = id;
@@ -471,7 +471,7 @@ xfs_dquot_alloc(
* Offset of dquot in the (fixed sized) dquot chunk.
*/
dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
/*
* Because we want to use a counting completion, complete
@@ -1363,22 +1363,22 @@ xfs_dqlock2(
int __init
xfs_qm_init(void)
{
- xfs_qm_dqzone = kmem_cache_create("xfs_dquot",
+ xfs_dquot_cache = kmem_cache_create("xfs_dquot",
sizeof(struct xfs_dquot),
0, 0, NULL);
- if (!xfs_qm_dqzone)
+ if (!xfs_dquot_cache)
goto out;
- xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx",
+ xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx",
sizeof(struct xfs_dquot_acct),
0, 0, NULL);
- if (!xfs_qm_dqtrxzone)
- goto out_free_dqzone;
+ if (!xfs_dqtrx_cache)
+ goto out_free_dquot_cache;
return 0;
-out_free_dqzone:
- kmem_cache_destroy(xfs_qm_dqzone);
+out_free_dquot_cache:
+ kmem_cache_destroy(xfs_dquot_cache);
out:
return -ENOMEM;
}
@@ -1386,8 +1386,8 @@ out:
void
xfs_qm_exit(void)
{
- kmem_cache_destroy(xfs_qm_dqtrxzone);
- kmem_cache_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_dqtrx_cache);
+ kmem_cache_destroy(xfs_dquot_cache);
}
/*
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3f8a0713573a..47ef9c9c5c17 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -25,8 +25,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_efi_zone;
-kmem_zone_t *xfs_efd_zone;
+struct kmem_cache *xfs_efi_cache;
+struct kmem_cache *xfs_efd_cache;
static const struct xfs_item_ops xfs_efi_item_ops;
@@ -43,7 +43,7 @@ xfs_efi_item_free(
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
- kmem_cache_free(xfs_efi_zone, efip);
+ kmem_cache_free(xfs_efi_cache, efip);
}
/*
@@ -161,7 +161,7 @@ xfs_efi_init(
((nextents - 1) * sizeof(xfs_extent_t)));
efip = kmem_zalloc(size, 0);
} else {
- efip = kmem_cache_zalloc(xfs_efi_zone,
+ efip = kmem_cache_zalloc(xfs_efi_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -241,7 +241,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
- kmem_cache_free(xfs_efd_zone, efdp);
+ kmem_cache_free(xfs_efd_cache, efdp);
}
/*
@@ -333,7 +333,7 @@ xfs_trans_get_efd(
(nextents - 1) * sizeof(struct xfs_extent),
0);
} else {
- efdp = kmem_cache_zalloc(xfs_efd_zone,
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -474,15 +474,21 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_extent_free_item *free;
int error;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ oinfo.oi_owner = free->xefi_owner;
+ if (free->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (free->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
error = xfs_trans_free_extent(tp, EFD_ITEM(done),
free->xefi_startblock,
free->xefi_blockcount,
- &free->xefi_oinfo, free->xefi_skip_discard);
- kmem_free(free);
+ &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
@@ -502,7 +508,7 @@ xfs_extent_free_cancel_item(
struct xfs_extent_free_item *free;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
}
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
@@ -525,6 +531,7 @@ xfs_agfl_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efd_log_item *efdp = EFD_ITEM(done);
struct xfs_extent_free_item *free;
@@ -539,13 +546,13 @@ xfs_agfl_free_finish_item(
ASSERT(free->xefi_blockcount == 1);
agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+ oinfo.oi_owner = free->xefi_owner;
trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp,
- &free->xefi_oinfo);
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -564,7 +571,7 @@ xfs_agfl_free_finish_item(
extp->ext_len = free->xefi_blockcount;
efdp->efd_next_extent++;
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
@@ -637,7 +644,7 @@ xfs_efi_item_recover(
}
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index cd2860c875bf..186d0f2137f1 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -9,7 +9,7 @@
/* kernel only EFI/EFD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -69,7 +69,7 @@ struct xfs_efd_log_item {
*/
#define XFS_EFD_MAX_FAST_EXTENTS 16
-extern struct kmem_zone *xfs_efi_zone;
-extern struct kmem_zone *xfs_efd_zone;
+extern struct kmem_cache *xfs_efi_cache;
+extern struct kmem_cache *xfs_efd_cache;
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7aa943edfc02..27594738b0d1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -259,7 +259,7 @@ xfs_file_dio_read(
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
if (ret)
return ret;
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned(
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0);
+ &xfs_dio_write_ops, 0, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -647,7 +647,7 @@ retry_exclusive:
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, flags);
+ &xfs_dio_write_ops, flags, 0);
/*
* Retry unaligned I/O with exclusive blocking semantics if the DIO
@@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f2210d927481..e1472004170e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -77,10 +77,10 @@ xfs_inode_alloc(
* XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
* and return NULL here on ENOMEM.
*/
- ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+ ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
return NULL;
}
@@ -130,11 +130,11 @@ xfs_inode_free_callback(
if (ip->i_afp) {
xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
}
if (ip->i_cowfp) {
xfs_idestroy_fork(ip->i_cowfp);
- kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
}
if (ip->i_itemp) {
ASSERT(!test_bit(XFS_LI_IN_AIL,
@@ -143,7 +143,7 @@ xfs_inode_free_callback(
ip->i_itemp = NULL;
}
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
}
static void
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 017904a34c02..508e184e3b8f 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -20,7 +20,7 @@
#include "xfs_ialloc.h"
#include "xfs_trace.h"
-kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+struct kmem_cache *xfs_icreate_cache; /* inode create item */
static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
{
@@ -63,7 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
+ kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
static const struct xfs_item_ops xfs_icreate_item_ops = {
@@ -97,7 +97,7 @@ xfs_icreate_log(
{
struct xfs_icreate_item *icp;
- icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL);
+ icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
&xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
index a50d0b01e15a..64992823108a 100644
--- a/fs/xfs/xfs_icreate_item.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -12,7 +12,7 @@ struct xfs_icreate_item {
struct xfs_icreate_log ic_format;
};
-extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+extern struct kmem_cache *xfs_icreate_cache; /* inode create item */
void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t agbno, unsigned int count,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a4f6f034fb81..64b9bf334806 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -36,7 +36,7 @@
#include "xfs_reflink.h"
#include "xfs_ag.h"
-kmem_zone_t *xfs_inode_zone;
+struct kmem_cache *xfs_inode_cache;
/*
* Used in xfs_itruncate_extents(). This is the maximum number of extents
@@ -564,8 +564,6 @@ xfs_lock_two_inodes(
struct xfs_inode *ip1,
uint ip1_mode)
{
- struct xfs_inode *temp;
- uint mode_temp;
int attempts = 0;
struct xfs_log_item *lp;
@@ -578,12 +576,8 @@ xfs_lock_two_inodes(
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
- temp = ip0;
- ip0 = ip1;
- ip1 = temp;
- mode_temp = ip0_mode;
- ip0_mode = ip1_mode;
- ip1_mode = mode_temp;
+ swap(ip0, ip1);
+ swap(ip0_mode, ip1_mode);
}
again:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b21b177832d1..e635a3d64cba 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,7 @@ static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
void xfs_irele(struct xfs_inode *ip);
-extern struct kmem_zone *xfs_inode_zone;
+extern struct kmem_cache *xfs_inode_cache;
/* The default CoW extent size hint. */
#define XFS_DEFAULT_COWEXTSZ_HINT 32
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0659d19c211e..90d8e591baf8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -21,7 +21,7 @@
#include <linux/iversion.h>
-kmem_zone_t *xfs_ili_zone; /* inode log item zone */
+struct kmem_cache *xfs_ili_cache; /* inode log item */
static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
{
@@ -672,7 +672,7 @@ xfs_inode_item_init(
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
- iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone,
+ iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_cache,
GFP_KERNEL | __GFP_NOFAIL);
iip->ili_inode = ip;
@@ -694,7 +694,7 @@ xfs_inode_item_destroy(
ip->i_itemp = NULL;
kmem_free(iip->ili_item.li_lv_shadow);
- kmem_cache_free(xfs_ili_zone, iip);
+ kmem_cache_free(xfs_ili_cache, iip);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 403b45ab9aa2..1a302000d604 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -47,6 +47,6 @@ extern void xfs_iflush_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
-extern struct kmem_zone *xfs_ili_zone;
+extern struct kmem_cache *xfs_ili_cache;
#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0c795dc093ef..174cd8950cb6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1547,7 +1547,7 @@ xfs_ioc_getbmap(
if (bmx.bmv_count > ULONG_MAX / recsize)
return -ENOMEM;
- buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
+ buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1601,11 +1601,11 @@ xfs_ioc_getfsmap(
*/
count = min_t(unsigned int, head.fmh_count,
131072 / sizeof(struct fsmap));
- recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
if (!recs) {
count = min_t(unsigned int, head.fmh_count,
PAGE_SIZE / sizeof(struct fsmap));
- recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
if (!recs)
return -ENOMEM;
}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f6cd2d4aa770..89fec9a18c34 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -21,7 +21,7 @@
#include "xfs_sb.h"
#include "xfs_health.h"
-kmem_zone_t *xfs_log_ticket_zone;
+struct kmem_cache *xfs_log_ticket_cache;
/* Local miscellaneous function prototypes */
STATIC struct xlog *
@@ -3487,7 +3487,7 @@ xfs_log_ticket_put(
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
- kmem_cache_free(xfs_log_ticket_zone, ticket);
+ kmem_cache_free(xfs_log_ticket_cache, ticket);
}
xlog_ticket_t *
@@ -3611,7 +3611,7 @@ xlog_ticket_alloc(
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
unit_res = xlog_calc_unit_res(log, unit_bytes);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 844fbeec3545..23103d68423c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -497,7 +497,7 @@ xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
-extern kmem_zone_t *xfs_log_ticket_zone;
+extern struct kmem_cache *xfs_log_ticket_cache;
struct xlog_ticket *
xlog_ticket_alloc(
struct xlog *log,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 10562ecbd9ea..53366cc0bc9e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2466,11 +2466,11 @@ xlog_finish_defer_ops(
{
struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
- struct xfs_inode *ip;
int error = 0;
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
struct xfs_trans_res resv;
+ struct xfs_defer_resources dres;
/*
* Create a new transaction reservation from the captured
@@ -2494,13 +2494,9 @@ xlog_finish_defer_ops(
* from recovering a single intent item.
*/
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_continue(dfc, tp, &ip);
-
+ xfs_defer_ops_continue(dfc, tp, &dres);
error = xfs_trans_commit(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- }
+ xfs_defer_resources_rele(&dres);
if (error)
return error;
}
@@ -2520,7 +2516,7 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_release(mp, dfc);
+ xfs_defer_ops_capture_free(mp, dfc);
}
}
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 06dac09eddbd..359109b6f0d3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -567,6 +567,18 @@ xfs_mount_setup_inode_geom(
xfs_ialloc_setup_geometry(mp);
}
+/* Compute maximum possible height for per-AG btree types for this fs. */
+static inline void
+xfs_agbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int levels;
+
+ levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
+ levels = max(levels, mp->m_rmap_maxlevels);
+ mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
+}
+
/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
@@ -638,6 +650,8 @@ xfs_mountfs(
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ xfs_agbtree_compute_maxlevels(mp);
+
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
* is NOT aligned turn off m_dalign since allocator alignment is within
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e091f3b3fa15..00720a02e761 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -128,10 +128,11 @@ typedef struct xfs_mount {
uint m_rmap_mnr[2]; /* min rmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
- uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
- uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+ uint m_alloc_maxlevels; /* max alloc btree levels */
+ uint m_bm_maxlevels[2]; /* max bmap btree levels */
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refcount btree level */
+ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 34c3b16f834f..f85e3b07ab44 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -219,7 +219,7 @@ _xfs_mru_cache_list_insert(
* When destroying or reaping, all the elements that were migrated to the reap
* list need to be deleted. For each element this involves removing it from the
* data store, removing it from the reap list, calling the client's free
- * function and deleting the element from the element zone.
+ * function and deleting the element from the element cache.
*
* We get called holding the mru->lock, which we drop and then reacquire.
* Sparse need special help with this to tell it we know what we are doing.
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5608066d6e53..32ac8d9c8940 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -850,7 +850,7 @@ xfs_qm_reset_dqcounts(
*/
#ifdef DEBUG
j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
dqb = bp->b_addr;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 442a0f97a9d4..5bb12717ea28 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -11,7 +11,7 @@
struct xfs_inode;
-extern struct kmem_zone *xfs_qm_dqtrxzone;
+extern struct kmem_cache *xfs_dqtrx_cache;
/*
* Number of bmaps that we ask from bmapi when doing a quotacheck.
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 46904b793bd4..d3da67772d57 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -21,8 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_cui_zone;
-kmem_zone_t *xfs_cud_zone;
+struct kmem_cache *xfs_cui_cache;
+struct kmem_cache *xfs_cud_cache;
static const struct xfs_item_ops xfs_cui_item_ops;
@@ -38,7 +38,7 @@ xfs_cui_item_free(
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
- kmem_cache_free(xfs_cui_zone, cuip);
+ kmem_cache_free(xfs_cui_cache, cuip);
}
/*
@@ -143,7 +143,7 @@ xfs_cui_init(
cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
0);
else
- cuip = kmem_cache_zalloc(xfs_cui_zone,
+ cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
@@ -204,7 +204,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_cache_free(xfs_cud_zone, cudp);
+ kmem_cache_free(xfs_cud_cache, cudp);
}
static const struct xfs_item_ops xfs_cud_item_ops = {
@@ -221,7 +221,7 @@ xfs_trans_get_cud(
{
struct xfs_cud_log_item *cudp;
- cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
&xfs_cud_item_ops);
cudp->cud_cuip = cuip;
@@ -384,7 +384,7 @@ xfs_refcount_update_finish_item(
refc->ri_blockcount = new_aglen;
return -EAGAIN;
}
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
return error;
}
@@ -404,7 +404,7 @@ xfs_refcount_update_cancel_item(
struct xfs_refcount_intent *refc;
refc = container_of(item, struct xfs_refcount_intent, ri_list);
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
}
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
@@ -557,7 +557,7 @@ xfs_cui_item_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index f4f2e836540b..eb0ab13682d0 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -25,7 +25,7 @@
/* kernel only CUI/CUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -68,7 +68,7 @@ struct xfs_cud_log_item {
struct xfs_cud_log_format cud_format;
};
-extern struct kmem_zone *xfs_cui_zone;
-extern struct kmem_zone *xfs_cud_zone;
+extern struct kmem_cache *xfs_cui_cache;
+extern struct kmem_cache *xfs_cud_cache;
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 76355f293488..cb0edb1d68ef 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -484,7 +484,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
- xfs_bmap_add_free(*tpp, del.br_startblock,
+ xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL);
/* Roll the transaction */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 5f0695980467..c3966b4c58ef 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -21,8 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_rui_zone;
-kmem_zone_t *xfs_rud_zone;
+struct kmem_cache *xfs_rui_cache;
+struct kmem_cache *xfs_rud_cache;
static const struct xfs_item_ops xfs_rui_item_ops;
@@ -38,7 +38,7 @@ xfs_rui_item_free(
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
- kmem_cache_free(xfs_rui_zone, ruip);
+ kmem_cache_free(xfs_rui_cache, ruip);
}
/*
@@ -141,7 +141,7 @@ xfs_rui_init(
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
else
- ruip = kmem_cache_zalloc(xfs_rui_zone,
+ ruip = kmem_cache_zalloc(xfs_rui_cache,
GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
@@ -227,7 +227,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_cache_free(xfs_rud_zone, rudp);
+ kmem_cache_free(xfs_rud_cache, rudp);
}
static const struct xfs_item_ops xfs_rud_item_ops = {
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
{
struct xfs_rud_log_item *rudp;
- rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
&xfs_rud_item_ops);
rudp->rud_ruip = ruip;
@@ -427,7 +427,7 @@ xfs_rmap_update_finish_item(
rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
state);
- kmem_free(rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
return error;
}
@@ -447,7 +447,7 @@ xfs_rmap_update_cancel_item(
struct xfs_rmap_intent *rmap;
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- kmem_free(rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
}
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
@@ -587,7 +587,7 @@ xfs_rui_item_recover(
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 31e6cdfff71f..802e5119eaca 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -28,7 +28,7 @@
/* kernel only RUI/RUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -68,7 +68,7 @@ struct xfs_rud_log_item {
struct xfs_rud_log_format rud_format;
};
-extern struct kmem_zone *xfs_rui_zone;
-extern struct kmem_zone *xfs_rud_zone;
+extern struct kmem_cache *xfs_rui_cache;
+extern struct kmem_cache *xfs_rud_cache;
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c4e0cd1c1c8c..e21459f9923a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -37,6 +37,7 @@
#include "xfs_reflink.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"
+#include "xfs_defer.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -1951,196 +1952,194 @@ static struct file_system_type xfs_fs_type = {
MODULE_ALIAS_FS("xfs");
STATIC int __init
-xfs_init_zones(void)
+xfs_init_caches(void)
{
- xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket",
+ int error;
+
+ xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
sizeof(struct xlog_ticket),
0, 0, NULL);
- if (!xfs_log_ticket_zone)
+ if (!xfs_log_ticket_cache)
goto out;
- xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item",
- sizeof(struct xfs_extent_free_item),
- 0, 0, NULL);
- if (!xfs_bmap_free_item_zone)
- goto out_destroy_log_ticket_zone;
+ error = xfs_btree_init_cur_caches();
+ if (error)
+ goto out_destroy_log_ticket_cache;
- xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur",
- sizeof(struct xfs_btree_cur),
- 0, 0, NULL);
- if (!xfs_btree_cur_zone)
- goto out_destroy_bmap_free_item_zone;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_btree_cur_cache;
- xfs_da_state_zone = kmem_cache_create("xfs_da_state",
+ xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
- if (!xfs_da_state_zone)
- goto out_destroy_btree_cur_zone;
+ if (!xfs_da_state_cache)
+ goto out_destroy_defer_item_cache;
- xfs_ifork_zone = kmem_cache_create("xfs_ifork",
+ xfs_ifork_cache = kmem_cache_create("xfs_ifork",
sizeof(struct xfs_ifork),
0, 0, NULL);
- if (!xfs_ifork_zone)
- goto out_destroy_da_state_zone;
+ if (!xfs_ifork_cache)
+ goto out_destroy_da_state_cache;
- xfs_trans_zone = kmem_cache_create("xfs_trans",
+ xfs_trans_cache = kmem_cache_create("xfs_trans",
sizeof(struct xfs_trans),
0, 0, NULL);
- if (!xfs_trans_zone)
- goto out_destroy_ifork_zone;
+ if (!xfs_trans_cache)
+ goto out_destroy_ifork_cache;
/*
- * The size of the zone allocated buf log item is the maximum
+ * The size of the cache-allocated buf log item is the maximum
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_cache_create("xfs_buf_item",
+ xfs_buf_item_cache = kmem_cache_create("xfs_buf_item",
sizeof(struct xfs_buf_log_item),
0, 0, NULL);
- if (!xfs_buf_item_zone)
- goto out_destroy_trans_zone;
+ if (!xfs_buf_item_cache)
+ goto out_destroy_trans_cache;
- xfs_efd_zone = kmem_cache_create("xfs_efd_item",
+ xfs_efd_cache = kmem_cache_create("xfs_efd_item",
(sizeof(struct xfs_efd_log_item) +
(XFS_EFD_MAX_FAST_EXTENTS - 1) *
sizeof(struct xfs_extent)),
0, 0, NULL);
- if (!xfs_efd_zone)
- goto out_destroy_buf_item_zone;
+ if (!xfs_efd_cache)
+ goto out_destroy_buf_item_cache;
- xfs_efi_zone = kmem_cache_create("xfs_efi_item",
+ xfs_efi_cache = kmem_cache_create("xfs_efi_item",
(sizeof(struct xfs_efi_log_item) +
(XFS_EFI_MAX_FAST_EXTENTS - 1) *
sizeof(struct xfs_extent)),
0, 0, NULL);
- if (!xfs_efi_zone)
- goto out_destroy_efd_zone;
+ if (!xfs_efi_cache)
+ goto out_destroy_efd_cache;
- xfs_inode_zone = kmem_cache_create("xfs_inode",
+ xfs_inode_cache = kmem_cache_create("xfs_inode",
sizeof(struct xfs_inode), 0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD | SLAB_ACCOUNT),
xfs_fs_inode_init_once);
- if (!xfs_inode_zone)
- goto out_destroy_efi_zone;
+ if (!xfs_inode_cache)
+ goto out_destroy_efi_cache;
- xfs_ili_zone = kmem_cache_create("xfs_ili",
+ xfs_ili_cache = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
NULL);
- if (!xfs_ili_zone)
- goto out_destroy_inode_zone;
+ if (!xfs_ili_cache)
+ goto out_destroy_inode_cache;
- xfs_icreate_zone = kmem_cache_create("xfs_icr",
+ xfs_icreate_cache = kmem_cache_create("xfs_icr",
sizeof(struct xfs_icreate_item),
0, 0, NULL);
- if (!xfs_icreate_zone)
- goto out_destroy_ili_zone;
+ if (!xfs_icreate_cache)
+ goto out_destroy_ili_cache;
- xfs_rud_zone = kmem_cache_create("xfs_rud_item",
+ xfs_rud_cache = kmem_cache_create("xfs_rud_item",
sizeof(struct xfs_rud_log_item),
0, 0, NULL);
- if (!xfs_rud_zone)
- goto out_destroy_icreate_zone;
+ if (!xfs_rud_cache)
+ goto out_destroy_icreate_cache;
- xfs_rui_zone = kmem_cache_create("xfs_rui_item",
+ xfs_rui_cache = kmem_cache_create("xfs_rui_item",
xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_rui_zone)
- goto out_destroy_rud_zone;
+ if (!xfs_rui_cache)
+ goto out_destroy_rud_cache;
- xfs_cud_zone = kmem_cache_create("xfs_cud_item",
+ xfs_cud_cache = kmem_cache_create("xfs_cud_item",
sizeof(struct xfs_cud_log_item),
0, 0, NULL);
- if (!xfs_cud_zone)
- goto out_destroy_rui_zone;
+ if (!xfs_cud_cache)
+ goto out_destroy_rui_cache;
- xfs_cui_zone = kmem_cache_create("xfs_cui_item",
+ xfs_cui_cache = kmem_cache_create("xfs_cui_item",
xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_cui_zone)
- goto out_destroy_cud_zone;
+ if (!xfs_cui_cache)
+ goto out_destroy_cud_cache;
- xfs_bud_zone = kmem_cache_create("xfs_bud_item",
+ xfs_bud_cache = kmem_cache_create("xfs_bud_item",
sizeof(struct xfs_bud_log_item),
0, 0, NULL);
- if (!xfs_bud_zone)
- goto out_destroy_cui_zone;
+ if (!xfs_bud_cache)
+ goto out_destroy_cui_cache;
- xfs_bui_zone = kmem_cache_create("xfs_bui_item",
+ xfs_bui_cache = kmem_cache_create("xfs_bui_item",
xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_bui_zone)
- goto out_destroy_bud_zone;
+ if (!xfs_bui_cache)
+ goto out_destroy_bud_cache;
return 0;
- out_destroy_bud_zone:
- kmem_cache_destroy(xfs_bud_zone);
- out_destroy_cui_zone:
- kmem_cache_destroy(xfs_cui_zone);
- out_destroy_cud_zone:
- kmem_cache_destroy(xfs_cud_zone);
- out_destroy_rui_zone:
- kmem_cache_destroy(xfs_rui_zone);
- out_destroy_rud_zone:
- kmem_cache_destroy(xfs_rud_zone);
- out_destroy_icreate_zone:
- kmem_cache_destroy(xfs_icreate_zone);
- out_destroy_ili_zone:
- kmem_cache_destroy(xfs_ili_zone);
- out_destroy_inode_zone:
- kmem_cache_destroy(xfs_inode_zone);
- out_destroy_efi_zone:
- kmem_cache_destroy(xfs_efi_zone);
- out_destroy_efd_zone:
- kmem_cache_destroy(xfs_efd_zone);
- out_destroy_buf_item_zone:
- kmem_cache_destroy(xfs_buf_item_zone);
- out_destroy_trans_zone:
- kmem_cache_destroy(xfs_trans_zone);
- out_destroy_ifork_zone:
- kmem_cache_destroy(xfs_ifork_zone);
- out_destroy_da_state_zone:
- kmem_cache_destroy(xfs_da_state_zone);
- out_destroy_btree_cur_zone:
- kmem_cache_destroy(xfs_btree_cur_zone);
- out_destroy_bmap_free_item_zone:
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- out_destroy_log_ticket_zone:
- kmem_cache_destroy(xfs_log_ticket_zone);
+ out_destroy_bud_cache:
+ kmem_cache_destroy(xfs_bud_cache);
+ out_destroy_cui_cache:
+ kmem_cache_destroy(xfs_cui_cache);
+ out_destroy_cud_cache:
+ kmem_cache_destroy(xfs_cud_cache);
+ out_destroy_rui_cache:
+ kmem_cache_destroy(xfs_rui_cache);
+ out_destroy_rud_cache:
+ kmem_cache_destroy(xfs_rud_cache);
+ out_destroy_icreate_cache:
+ kmem_cache_destroy(xfs_icreate_cache);
+ out_destroy_ili_cache:
+ kmem_cache_destroy(xfs_ili_cache);
+ out_destroy_inode_cache:
+ kmem_cache_destroy(xfs_inode_cache);
+ out_destroy_efi_cache:
+ kmem_cache_destroy(xfs_efi_cache);
+ out_destroy_efd_cache:
+ kmem_cache_destroy(xfs_efd_cache);
+ out_destroy_buf_item_cache:
+ kmem_cache_destroy(xfs_buf_item_cache);
+ out_destroy_trans_cache:
+ kmem_cache_destroy(xfs_trans_cache);
+ out_destroy_ifork_cache:
+ kmem_cache_destroy(xfs_ifork_cache);
+ out_destroy_da_state_cache:
+ kmem_cache_destroy(xfs_da_state_cache);
+ out_destroy_defer_item_cache:
+ xfs_defer_destroy_item_caches();
+ out_destroy_btree_cur_cache:
+ xfs_btree_destroy_cur_caches();
+ out_destroy_log_ticket_cache:
+ kmem_cache_destroy(xfs_log_ticket_cache);
out:
return -ENOMEM;
}
STATIC void
-xfs_destroy_zones(void)
+xfs_destroy_caches(void)
{
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(xfs_bui_zone);
- kmem_cache_destroy(xfs_bud_zone);
- kmem_cache_destroy(xfs_cui_zone);
- kmem_cache_destroy(xfs_cud_zone);
- kmem_cache_destroy(xfs_rui_zone);
- kmem_cache_destroy(xfs_rud_zone);
- kmem_cache_destroy(xfs_icreate_zone);
- kmem_cache_destroy(xfs_ili_zone);
- kmem_cache_destroy(xfs_inode_zone);
- kmem_cache_destroy(xfs_efi_zone);
- kmem_cache_destroy(xfs_efd_zone);
- kmem_cache_destroy(xfs_buf_item_zone);
- kmem_cache_destroy(xfs_trans_zone);
- kmem_cache_destroy(xfs_ifork_zone);
- kmem_cache_destroy(xfs_da_state_zone);
- kmem_cache_destroy(xfs_btree_cur_zone);
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- kmem_cache_destroy(xfs_log_ticket_zone);
+ kmem_cache_destroy(xfs_bui_cache);
+ kmem_cache_destroy(xfs_bud_cache);
+ kmem_cache_destroy(xfs_cui_cache);
+ kmem_cache_destroy(xfs_cud_cache);
+ kmem_cache_destroy(xfs_rui_cache);
+ kmem_cache_destroy(xfs_rud_cache);
+ kmem_cache_destroy(xfs_icreate_cache);
+ kmem_cache_destroy(xfs_ili_cache);
+ kmem_cache_destroy(xfs_inode_cache);
+ kmem_cache_destroy(xfs_efi_cache);
+ kmem_cache_destroy(xfs_efd_cache);
+ kmem_cache_destroy(xfs_buf_item_cache);
+ kmem_cache_destroy(xfs_trans_cache);
+ kmem_cache_destroy(xfs_ifork_cache);
+ kmem_cache_destroy(xfs_da_state_cache);
+ xfs_defer_destroy_item_caches();
+ xfs_btree_destroy_cur_caches();
+ kmem_cache_destroy(xfs_log_ticket_cache);
}
STATIC int __init
@@ -2233,13 +2232,13 @@ init_xfs_fs(void)
if (error)
goto out;
- error = xfs_init_zones();
+ error = xfs_init_caches();
if (error)
goto out_destroy_hp;
error = xfs_init_workqueues();
if (error)
- goto out_destroy_zones;
+ goto out_destroy_caches;
error = xfs_mru_cache_init();
if (error)
@@ -2314,8 +2313,8 @@ init_xfs_fs(void)
xfs_mru_cache_uninit();
out_destroy_wq:
xfs_destroy_workqueues();
- out_destroy_zones:
- xfs_destroy_zones();
+ out_destroy_caches:
+ xfs_destroy_caches();
out_destroy_hp:
xfs_cpu_hotplug_destroy();
out:
@@ -2338,7 +2337,7 @@ exit_xfs_fs(void)
xfs_buf_terminate();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
- xfs_destroy_zones();
+ xfs_destroy_caches();
xfs_uuid_table_free();
xfs_cpu_hotplug_destroy();
}
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 18dc5eca6c04..8608f804388f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -105,7 +105,7 @@ bug_on_assert_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0);
+ return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert);
}
XFS_SYSFS_ATTR_RW(bug_on_assert);
@@ -135,7 +135,7 @@ log_recovery_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay);
}
XFS_SYSFS_ATTR_RW(log_recovery_delay);
@@ -165,7 +165,7 @@ mount_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay);
}
XFS_SYSFS_ATTR_RW(mount_delay);
@@ -188,7 +188,7 @@ always_cow_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+ return sysfs_emit(buf, "%d\n", xfs_globals.always_cow);
}
XFS_SYSFS_ATTR_RW(always_cow);
@@ -224,7 +224,7 @@ pwork_threads_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+ return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
#endif /* DEBUG */
@@ -327,7 +327,7 @@ log_head_lsn_show(
block = log->l_curr_block;
spin_unlock(&log->l_icloglock);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_head_lsn);
@@ -341,7 +341,7 @@ log_tail_lsn_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_tail_lsn);
@@ -356,7 +356,7 @@ reserve_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(reserve_grant_head);
@@ -370,7 +370,7 @@ write_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(write_grant_head);
@@ -425,7 +425,7 @@ max_retries_show(
else
retries = cfg->max_retries;
- return snprintf(buf, PAGE_SIZE, "%d\n", retries);
+ return sysfs_emit(buf, "%d\n", retries);
}
static ssize_t
@@ -466,7 +466,7 @@ retry_timeout_seconds_show(
else
timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
- return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
+ return sysfs_emit(buf, "%d\n", timeout);
}
static ssize_t
@@ -504,7 +504,7 @@ fail_at_unmount_show(
{
struct xfs_mount *mp = err_to_mp(kobject);
- return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+ return sysfs_emit(buf, "%d\n", mp->m_fail_unmount);
}
static ssize_t
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1033a95fbf8e..4a8076ef8cb4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2476,7 +2476,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
__entry->btnum = cur->bc_btnum;
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 67dec11e34c7..234a9d9c2f43 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -25,7 +25,7 @@
#include "xfs_dquot.h"
#include "xfs_icache.h"
-kmem_zone_t *xfs_trans_zone;
+struct kmem_cache *xfs_trans_cache;
#if defined(CONFIG_TRACEPOINTS)
static void
@@ -76,7 +76,7 @@ xfs_trans_free(
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
- kmem_cache_free(xfs_trans_zone, tp);
+ kmem_cache_free(xfs_trans_cache, tp);
}
/*
@@ -95,7 +95,7 @@ xfs_trans_dup(
trace_xfs_trans_dup(tp, _RET_IP_);
- ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
+ ntp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
/*
* Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
xfs_trans_set_context(tp);
@@ -477,7 +477,7 @@ STATIC void
xfs_trans_apply_sb_deltas(
xfs_trans_t *tp)
{
- xfs_dsb_t *sbp;
+ struct xfs_dsb *sbp;
struct xfs_buf *bp;
int whole = 0;
@@ -541,14 +541,14 @@ xfs_trans_apply_sb_deltas(
/*
* Log the whole thing, the fields are noncontiguous.
*/
- xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
else
/*
* Since all the modifiable fields are contiguous, we
* can get away with this.
*/
- xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
- offsetof(xfs_dsb_t, sb_frextents) +
+ xfs_trans_log_buf(tp, bp, offsetof(struct xfs_dsb, sb_icount),
+ offsetof(struct xfs_dsb, sb_frextents) +
sizeof(sbp->sb_frextents) - 1);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 50da47f23a07..a487b264a9eb 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -113,12 +113,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
#define XFS_ITEM_FLUSHING 3
/*
- * Deferred operation item relogging limits.
- */
-#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
-#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
-
-/*
* This is the structure maintained for every active transaction.
*/
typedef struct xfs_trans {
@@ -243,7 +237,7 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
struct xfs_buf *src_bp);
-extern kmem_zone_t *xfs_trans_zone;
+extern struct kmem_cache *xfs_trans_cache;
static inline struct xfs_log_item *
xfs_trans_item_relog(
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 3872ce671411..9ba7e6b9bed3 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -846,7 +846,7 @@ STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+ tp->t_dqinfo = kmem_cache_zalloc(xfs_dqtrx_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -856,6 +856,6 @@ xfs_trans_free_dqinfo(
{
if (!tp->t_dqinfo)
return;
- kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+ kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index ddc346a9df9b..259ee2bda492 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, 0);
+ &zonefs_write_dio_ops, 0, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, 0);
+ &zonefs_read_dio_ops, 0, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
@@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = {
.write_iter = zonefs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
};
static struct kmem_cache *zonefs_inode_cachep;