Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_dentry.c | 9
-rw-r--r--  fs/9p/vfs_inode.c | 1
-rw-r--r--  fs/afs/inode.c | 1
-rw-r--r--  fs/afs/mntpt.c | 5
-rw-r--r--  fs/aio.c | 4
-rw-r--r--  fs/backing-file.c | 23
-rw-r--r--  fs/bcachefs/trace.h | 6
-rw-r--r--  fs/bcachefs/util.h | 5
-rw-r--r--  fs/btrfs/dev-replace.c | 2
-rw-r--r--  fs/btrfs/disk-io.c | 6
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/btrfs/qgroup.c | 32
-rw-r--r--  fs/btrfs/scrub.c | 2
-rw-r--r--  fs/btrfs/super.c | 8
-rw-r--r--  fs/btrfs/volumes.c | 15
-rw-r--r--  fs/btrfs/zoned.c | 15
-rw-r--r--  fs/buffer.c | 26
-rw-r--r--  fs/cachefiles/namei.c | 3
-rw-r--r--  fs/ceph/dir.c | 28
-rw-r--r--  fs/ceph/file.c | 66
-rw-r--r--  fs/ceph/inode.c | 46
-rw-r--r--  fs/ceph/mds_client.c | 270
-rw-r--r--  fs/ceph/mds_client.h | 28
-rw-r--r--  fs/coredump.c | 4
-rw-r--r--  fs/cramfs/inode.c | 2
-rw-r--r--  fs/erofs/data.c | 25
-rw-r--r--  fs/erofs/decompressor_deflate.c | 55
-rw-r--r--  fs/erofs/dir.c | 4
-rw-r--r--  fs/erofs/fscache.c | 12
-rw-r--r--  fs/erofs/inode.c | 4
-rw-r--r--  fs/erofs/internal.h | 9
-rw-r--r--  fs/erofs/namei.c | 6
-rw-r--r--  fs/erofs/super.c | 44
-rw-r--r--  fs/erofs/xattr.c | 37
-rw-r--r--  fs/erofs/zdata.c | 8
-rw-r--r--  fs/erofs/zmap.c | 24
-rw-r--r--  fs/ext2/Kconfig | 1
-rw-r--r--  fs/ext2/dir.c | 1
-rw-r--r--  fs/ext2/file.c | 8
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext4/dir.c | 2
-rw-r--r--  fs/ext4/ext4_jbd2.c | 2
-rw-r--r--  fs/ext4/super.c | 26
-rw-r--r--  fs/f2fs/checkpoint.c | 13
-rw-r--r--  fs/f2fs/compress.c | 96
-rw-r--r--  fs/f2fs/data.c | 231
-rw-r--r--  fs/f2fs/f2fs.h | 57
-rw-r--r--  fs/f2fs/file.c | 256
-rw-r--r--  fs/f2fs/gc.c | 11
-rw-r--r--  fs/f2fs/gc.h | 1
-rw-r--r--  fs/f2fs/inline.c | 36
-rw-r--r--  fs/f2fs/inode.c | 22
-rw-r--r--  fs/f2fs/node.c | 20
-rw-r--r--  fs/f2fs/recovery.c | 3
-rw-r--r--  fs/f2fs/segment.c | 132
-rw-r--r--  fs/f2fs/super.c | 80
-rw-r--r--  fs/f2fs/sysfs.c | 21
-rw-r--r--  fs/file.c | 19
-rw-r--r--  fs/fuse/dev.c | 3
-rw-r--r--  fs/fuse/file.c | 10
-rw-r--r--  fs/fuse/ioctl.c | 60
-rw-r--r--  fs/fuse/virtio_fs.c | 74
-rw-r--r--  fs/gfs2/glock.c | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 2
-rw-r--r--  fs/internal.h | 3
-rw-r--r--  fs/iomap/buffered-io.c | 2
-rw-r--r--  fs/isofs/Makefile | 7
-rw-r--r--  fs/isofs/compress.c | 4
-rw-r--r--  fs/isofs/inode.c | 473
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jffs2/background.c | 4
-rw-r--r--  fs/jffs2/malloc.c | 32
-rw-r--r--  fs/jffs2/nodemgmt.c | 24
-rw-r--r--  fs/jffs2/super.c | 1
-rw-r--r--  fs/kernfs/mount.c | 2
-rw-r--r--  fs/namei.c | 6
-rw-r--r--  fs/netfs/buffered_write.c | 2
-rw-r--r--  fs/netfs/direct_write.c | 5
-rw-r--r--  fs/netfs/objects.c | 5
-rw-r--r--  fs/netfs/write_collect.c | 7
-rw-r--r--  fs/netfs/write_issue.c | 9
-rw-r--r--  fs/nfs/Kconfig | 4
-rw-r--r--  fs/nfs/dir.c | 54
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 24
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 12
-rw-r--r--  fs/nfs/fs_context.c | 11
-rw-r--r--  fs/nfs/internal.h | 11
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs4proc.c | 2
-rw-r--r--  fs/nfs/nfs4state.c | 12
-rw-r--r--  fs/nfs/nfs4trace.h | 42
-rw-r--r--  fs/nfs/nfstrace.h | 41
-rw-r--r--  fs/nfs/pnfs.c | 29
-rw-r--r--  fs/nfs/pnfs.h | 3
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/super.c | 10
-rw-r--r--  fs/nfsd/filecache.c | 4
-rw-r--r--  fs/nfsd/trace.h | 40
-rw-r--r--  fs/nilfs2/recovery.c | 4
-rw-r--r--  fs/nilfs2/segment.c | 65
-rw-r--r--  fs/notify/dnotify/dnotify.c | 4
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 141
-rw-r--r--  fs/notify/fdinfo.c | 20
-rw-r--r--  fs/notify/fsnotify.c | 27
-rw-r--r--  fs/notify/fsnotify.h | 39
-rw-r--r--  fs/notify/inotify/inotify_user.c | 2
-rw-r--r--  fs/notify/mark.c | 174
-rw-r--r--  fs/ntfs3/attrib.c | 32
-rw-r--r--  fs/ntfs3/dir.c | 1
-rw-r--r--  fs/ntfs3/file.c | 9
-rw-r--r--  fs/ntfs3/frecord.c | 2
-rw-r--r--  fs/ntfs3/fslog.c | 5
-rw-r--r--  fs/ntfs3/index.c | 6
-rw-r--r--  fs/ntfs3/inode.c | 46
-rw-r--r--  fs/ntfs3/namei.c | 121
-rw-r--r--  fs/ntfs3/ntfs.h | 2
-rw-r--r--  fs/ntfs3/ntfs_fs.h | 10
-rw-r--r--  fs/ntfs3/record.c | 11
-rw-r--r--  fs/ntfs3/super.c | 2
-rw-r--r--  fs/ntfs3/xattr.c | 5
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 60
-rw-r--r--  fs/open.c | 11
-rw-r--r--  fs/overlayfs/dir.c | 152
-rw-r--r--  fs/overlayfs/file.c | 3
-rw-r--r--  fs/overlayfs/inode.c | 1
-rw-r--r--  fs/overlayfs/overlayfs.h | 3
-rw-r--r--  fs/overlayfs/util.c | 2
-rw-r--r--  fs/pidfs.c | 28
-rw-r--r--  fs/proc/fd.c | 4
-rw-r--r--  fs/proc/task_mmu.c | 9
-rw-r--r--  fs/quota/dquot.c | 33
-rw-r--r--  fs/read_write.c | 13
-rw-r--r--  fs/reiserfs/README | 16
-rw-r--r--  fs/reiserfs/inode.c | 16
-rw-r--r--  fs/reiserfs/journal.c | 5
-rw-r--r--  fs/remap_range.c | 4
-rw-r--r--  fs/signalfd.c | 6
-rw-r--r--  fs/smb/client/cifsfs.c | 7
-rw-r--r--  fs/smb/client/cifsfs.h | 4
-rw-r--r--  fs/smb/client/file.c | 23
-rw-r--r--  fs/smb/client/smb2ops.c | 2
-rw-r--r--  fs/smb/client/trace.h | 18
-rw-r--r--  fs/smb/server/mgmt/share_config.c | 6
-rw-r--r--  fs/smb/server/oplock.c | 21
-rw-r--r--  fs/splice.c | 4
-rw-r--r--  fs/super.c | 1
-rw-r--r--  fs/sysfs/file.c | 27
-rw-r--r--  fs/tracefs/event_inode.c | 223
-rw-r--r--  fs/tracefs/inode.c | 48
-rw-r--r--  fs/udf/file.c | 20
-rw-r--r--  fs/udf/inode.c | 65
-rw-r--r--  fs/udf/super.c | 8
-rw-r--r--  fs/udf/symlink.c | 34
-rw-r--r--  fs/udf/udftime.c | 11
-rw-r--r--  fs/xfs/Makefile | 18
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 24
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 233
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h | 46
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 154
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 102
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_attr_sf.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 377
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 13
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 189
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 34
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 37
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 281
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 23
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 42
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 18
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c | 100
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 44
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_priv.h | 15
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_exchmaps.c | 1235
-rw-r--r--  fs/xfs/libxfs/xfs_exchmaps.h | 124
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 34
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 158
-rw-r--r--  fs/xfs/libxfs/xfs_health.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 56
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 57
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 89
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_log_rlimit.c | 46
-rw-r--r--  fs/xfs/libxfs/xfs_ondisk.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_parent.c | 379
-rw-r--r--  fs/xfs/libxfs/xfs_parent.h | 110
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 57
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.h | 17
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 54
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 326
-rw-r--r--  fs/xfs/libxfs/xfs_trans_space.c | 121
-rw-r--r--  fs/xfs/libxfs/xfs_trans_space.h | 29
-rw-r--r--  fs/xfs/scrub/agheader.c | 43
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 879
-rw-r--r--  fs/xfs/scrub/agino_bitmap.h | 49
-rw-r--r--  fs/xfs/scrub/alloc_repair.c | 2
-rw-r--r--  fs/xfs/scrub/attr.c | 214
-rw-r--r--  fs/xfs/scrub/attr.h | 7
-rw-r--r--  fs/xfs/scrub/attr_repair.c | 1663
-rw-r--r--  fs/xfs/scrub/attr_repair.h | 15
-rw-r--r--  fs/xfs/scrub/bitmap.c | 22
-rw-r--r--  fs/xfs/scrub/common.c | 41
-rw-r--r--  fs/xfs/scrub/common.h | 27
-rw-r--r--  fs/xfs/scrub/dab_bitmap.h | 37
-rw-r--r--  fs/xfs/scrub/dabtree.c | 24
-rw-r--r--  fs/xfs/scrub/dabtree.h | 3
-rw-r--r--  fs/xfs/scrub/dir.c | 377
-rw-r--r--  fs/xfs/scrub/dir_repair.c | 1958
-rw-r--r--  fs/xfs/scrub/dirtree.c | 985
-rw-r--r--  fs/xfs/scrub/dirtree.h | 178
-rw-r--r--  fs/xfs/scrub/dirtree_repair.c | 821
-rw-r--r--  fs/xfs/scrub/findparent.c | 454
-rw-r--r--  fs/xfs/scrub/findparent.h | 56
-rw-r--r--  fs/xfs/scrub/fscounters.c | 14
-rw-r--r--  fs/xfs/scrub/fscounters.h | 1
-rw-r--r--  fs/xfs/scrub/fscounters_repair.c | 12
-rw-r--r--  fs/xfs/scrub/health.c | 1
-rw-r--r--  fs/xfs/scrub/ino_bitmap.h | 37
-rw-r--r--  fs/xfs/scrub/inode.c | 19
-rw-r--r--  fs/xfs/scrub/inode_repair.c | 153
-rw-r--r--  fs/xfs/scrub/iscan.c | 67
-rw-r--r--  fs/xfs/scrub/iscan.h | 16
-rw-r--r--  fs/xfs/scrub/listxattr.c | 320
-rw-r--r--  fs/xfs/scrub/listxattr.h | 19
-rw-r--r--  fs/xfs/scrub/nlinks.c | 133
-rw-r--r--  fs/xfs/scrub/nlinks.h | 7
-rw-r--r--  fs/xfs/scrub/nlinks_repair.c | 186
-rw-r--r--  fs/xfs/scrub/orphanage.c | 627
-rw-r--r--  fs/xfs/scrub/orphanage.h | 86
-rw-r--r--  fs/xfs/scrub/parent.c | 700
-rw-r--r--  fs/xfs/scrub/parent_repair.c | 1612
-rw-r--r--  fs/xfs/scrub/quota_repair.c | 6
-rw-r--r--  fs/xfs/scrub/readdir.c | 140
-rw-r--r--  fs/xfs/scrub/readdir.h | 3
-rw-r--r--  fs/xfs/scrub/reap.c | 445
-rw-r--r--  fs/xfs/scrub/reap.h | 21
-rw-r--r--  fs/xfs/scrub/repair.c | 127
-rw-r--r--  fs/xfs/scrub/repair.h | 31
-rw-r--r--  fs/xfs/scrub/rmap_repair.c | 24
-rw-r--r--  fs/xfs/scrub/rtbitmap_repair.c | 2
-rw-r--r--  fs/xfs/scrub/rtsummary.c | 33
-rw-r--r--  fs/xfs/scrub/rtsummary.h | 37
-rw-r--r--  fs/xfs/scrub/rtsummary_repair.c | 175
-rw-r--r--  fs/xfs/scrub/scrub.c | 310
-rw-r--r--  fs/xfs/scrub/scrub.h | 91
-rw-r--r--  fs/xfs/scrub/stats.c | 1
-rw-r--r--  fs/xfs/scrub/symlink.c | 13
-rw-r--r--  fs/xfs/scrub/symlink_repair.c | 509
-rw-r--r--  fs/xfs/scrub/tempexch.h | 22
-rw-r--r--  fs/xfs/scrub/tempfile.c | 851
-rw-r--r--  fs/xfs/scrub/tempfile.h | 48
-rw-r--r--  fs/xfs/scrub/trace.c | 6
-rw-r--r--  fs/xfs/scrub/trace.h | 1317
-rw-r--r--  fs/xfs/scrub/xfarray.c | 27
-rw-r--r--  fs/xfs/scrub/xfarray.h | 6
-rw-r--r--  fs/xfs/scrub/xfblob.c | 168
-rw-r--r--  fs/xfs/scrub/xfblob.h | 50
-rw-r--r--  fs/xfs/scrub/xfile.c | 14
-rw-r--r--  fs/xfs/scrub/xfile.h | 6
-rw-r--r--  fs/xfs/scrub/xfs_scrub.h | 6
-rw-r--r--  fs/xfs/xfs_acl.c | 17
-rw-r--r--  fs/xfs/xfs_aops.c | 60
-rw-r--r--  fs/xfs/xfs_attr_item.c | 554
-rw-r--r--  fs/xfs/xfs_attr_item.h | 10
-rw-r--r--  fs/xfs/xfs_attr_list.c | 120
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 4
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 67
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 2
-rw-r--r--  fs/xfs/xfs_buf.c | 5
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 25
-rw-r--r--  fs/xfs/xfs_discard.c | 153
-rw-r--r--  fs/xfs/xfs_dquot.c | 47
-rw-r--r--  fs/xfs/xfs_dquot.h | 1
-rw-r--r--  fs/xfs/xfs_error.c | 3
-rw-r--r--  fs/xfs/xfs_exchmaps_item.c | 614
-rw-r--r--  fs/xfs/xfs_exchmaps_item.h | 64
-rw-r--r--  fs/xfs/xfs_exchrange.c | 804
-rw-r--r--  fs/xfs/xfs_exchrange.h | 38
-rw-r--r--  fs/xfs/xfs_export.c | 4
-rw-r--r--  fs/xfs/xfs_export.h | 2
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 80
-rw-r--r--  fs/xfs/xfs_file.c | 90
-rw-r--r--  fs/xfs/xfs_file.h | 15
-rw-r--r--  fs/xfs/xfs_fsmap.c | 4
-rw-r--r--  fs/xfs/xfs_fsops.c | 29
-rw-r--r--  fs/xfs/xfs_fsops.h | 2
-rw-r--r--  fs/xfs/xfs_handle.c | 952
-rw-r--r--  fs/xfs/xfs_handle.h | 33
-rw-r--r--  fs/xfs/xfs_health.c | 1
-rw-r--r--  fs/xfs/xfs_icache.c | 4
-rw-r--r--  fs/xfs/xfs_inode.c | 496
-rw-r--r--  fs/xfs/xfs_inode.h | 41
-rw-r--r--  fs/xfs/xfs_ioctl.c | 625
-rw-r--r--  fs/xfs/xfs_ioctl.h | 28
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 1
-rw-r--r--  fs/xfs/xfs_iomap.c | 105
-rw-r--r--  fs/xfs/xfs_iops.c | 23
-rw-r--r--  fs/xfs/xfs_iops.h | 7
-rw-r--r--  fs/xfs/xfs_itable.c | 8
-rw-r--r--  fs/xfs/xfs_iwalk.c | 4
-rw-r--r--  fs/xfs/xfs_linux.h | 5
-rw-r--r--  fs/xfs/xfs_log.c | 28
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 2
-rw-r--r--  fs/xfs/xfs_log_priv.h | 8
-rw-r--r--  fs/xfs/xfs_log_recover.c | 85
-rw-r--r--  fs/xfs/xfs_mount.c | 109
-rw-r--r--  fs/xfs/xfs_mount.h | 88
-rw-r--r--  fs/xfs/xfs_qm.c | 4
-rw-r--r--  fs/xfs/xfs_qm.h | 2
-rw-r--r--  fs/xfs/xfs_quota.h | 23
-rw-r--r--  fs/xfs/xfs_reflink.c | 48
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 29
-rw-r--r--  fs/xfs/xfs_super.c | 76
-rw-r--r--  fs/xfs/xfs_symlink.c | 91
-rw-r--r--  fs/xfs/xfs_trace.c | 3
-rw-r--r--  fs/xfs/xfs_trace.h | 466
-rw-r--r--  fs/xfs/xfs_trans.c | 72
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 15
-rw-r--r--  fs/xfs/xfs_xattr.c | 92
-rw-r--r--  fs/xfs/xfs_xattr.h | 3
336 files changed, 27475 insertions(+), 4564 deletions(-)
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f16f73581634..01338d4c2d9e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -48,12 +48,17 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
struct hlist_node *p, *n;
+ struct hlist_head head;
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
- hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
+
+ spin_lock(&dentry->d_lock);
+ hlist_move_list((struct hlist_head *)&dentry->d_fsdata, &head);
+ spin_unlock(&dentry->d_lock);
+
+ hlist_for_each_safe(p, n, &head)
p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
- dentry->d_fsdata = NULL;
}
static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
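
[Editor's note: the v9fs_dentry_release() fix above replaces an unlocked walk of the fid list hanging off ->d_fsdata with a detach-then-iterate pattern: the whole list is moved onto an on-stack head under d_lock, and only then torn down, so the teardown itself needs no locking. A minimal userspace sketch of the same pattern, with a pthread mutex standing in for d_lock; all names here are hypothetical, not 9p code:

	#include <pthread.h>
	#include <stdlib.h>

	struct node { struct node *next; };

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct node *shared_head;	/* plays the role of ->d_fsdata */

	static void release_all(void)
	{
		struct node *head, *p, *n;

		/* Detach the entire list while holding the lock... */
		pthread_mutex_lock(&lock);
		head = shared_head;
		shared_head = NULL;
		pthread_mutex_unlock(&lock);

		/* ...then free it lock-free: concurrent observers see either
		 * the full list or an empty one, never a half-freed walk. */
		for (p = head; p; p = n) {
			n = p->next;
			free(p);
		}
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++) {
			struct node *node = malloc(sizeof(*node));
			node->next = shared_head;
			shared_head = node;
		}
		release_all();
		return 0;
	}
]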
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7a3308d77606..fd72fc38c8f5 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode)
__le32 __maybe_unused version;
if (!is_bad_inode(inode)) {
+ netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
version = cpu_to_le32(v9inode->qid.version);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 94fc049aff58..15bb7989c387 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+ netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
afs_set_cache_aux(vnode, &aux);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 97f50e9fd9eb..297487ee8323 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -140,6 +140,11 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
put_page(page);
if (ret < 0)
return ret;
+
+ /* Don't cross a backup volume mountpoint from a backup volume */
+ if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL &&
+ ctx->type == AFSVL_BACKVOL)
+ return -ENODEV;
}
return 0;
diff --git a/fs/aio.c b/fs/aio.c
index 6ed5507cd330..57c9f7c077e6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1608,7 +1608,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
- aio_rw_done(req, call_read_iter(file, req, &iter));
+ aio_rw_done(req, file->f_op->read_iter(req, &iter));
kfree(iovec);
return ret;
}
@@ -1639,7 +1639,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
if (S_ISREG(file_inode(file)->i_mode))
kiocb_start_write(req);
req->ki_flags |= IOCB_WRITE;
- aio_rw_done(req, call_write_iter(file, req, &iter));
+ aio_rw_done(req, file->f_op->write_iter(req, &iter));
}
kfree(iovec);
return ret;
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 740185198db3..afb557446c27 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -52,6 +52,29 @@ struct file *backing_file_open(const struct path *user_path, int flags,
}
EXPORT_SYMBOL_GPL(backing_file_open);
+struct file *backing_tmpfile_open(const struct path *user_path, int flags,
+ const struct path *real_parentpath,
+ umode_t mode, const struct cred *cred)
+{
+ struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt);
+ struct file *f;
+ int error;
+
+ f = alloc_empty_backing_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+ return f;
+}
+EXPORT_SYMBOL(backing_tmpfile_open);
+
struct backing_aio {
struct kiocb iocb;
refcount_t ref;
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 362e1fc7ef6a..84fcf26e306e 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str,
TP_fast_assign(
__entry->dev = c->dev;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
@@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str,
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %pS %s",
@@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller,
TP_fast_assign(
__entry->dev = trans->c->dev;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(str, str);
+ __assign_str(str);
),
TP_printk("%d,%d %s %s",
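
[Editor's note: the three __assign_str(str, str) -> __assign_str(str) conversions above track the tracing-core change that dropped the source argument from __assign_str(): the source expression is now taken from the matching __string() declaration, so repeating it was redundant. A hedged usage sketch; the event and its field are hypothetical, not bcachefs code:

	TRACE_EVENT(example_str,
		TP_PROTO(const char *msg),
		TP_ARGS(msg),
		TP_STRUCT__entry(
			__string(msg, msg)	/* source expression is recorded here */
		),
		TP_fast_assign(
			__assign_str(msg);	/* one argument: source comes from __string() */
		),
		TP_printk("%s", __get_str(msg))
	);
]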
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 5cf885b09986..5d2c470a49ac 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -445,11 +445,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
void bch2_bio_map(struct bio *bio, void *base, size_t);
int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> 9;
-}
-
#define closure_bio_submit(bio, cl) \
do { \
closure_get(cl); \
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7696beec4c21..7130040d92ab 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,7 +316,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
ret = btrfs_get_dev_zone_info(device, false);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a91a8056758a..1b20b3e390df 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3656,7 +3656,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
struct btrfs_super_block *super;
struct page *page;
u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
int ret;
bytenr_orig = btrfs_sb_offset(copy_num);
@@ -3743,7 +3743,7 @@ static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct address_space *mapping = device->bdev->bd_inode->i_mapping;
+ struct address_space *mapping = device->bdev->bd_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
int ret;
@@ -3861,7 +3861,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
device->commit_total_bytes)
break;
- folio = filemap_get_folio(device->bdev->bd_inode->i_mapping,
+ folio = filemap_get_folio(device->bdev->bd_mapping,
bytenr >> PAGE_SHIFT);
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
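
[Editor's note: this hunk, like the cramfs, fs/buffer.c, and erofs conversions further down, is a mechanical switch to the new bd_mapping field: struct block_device now caches its backing inode's address space, so filesystems no longer reach through bdev->bd_inode for page-cache access. Roughly, as a hypothetical wrapper just to contrast the two forms:

	static inline struct address_space *metadata_mapping(struct block_device *bdev)
	{
		return bdev->bd_mapping;	/* was: bdev->bd_inode->i_mapping */
	}
]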
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47d48233b592..3774c191e36d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -477,7 +477,7 @@ again:
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret) {
- if (ret > 1)
+ if (ret > 0)
return -ENOENT;
return ret;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index eb28141d5c37..fc2a7ea26354 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -468,6 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
if (!qgroup) {
struct btrfs_qgroup *prealloc;
+ struct btrfs_root *tree_root = fs_info->tree_root;
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
if (!prealloc) {
@@ -475,6 +476,25 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+ /*
+ * If a qgroup exists for a subvolume ID, it is possible
+ * that subvolume has been deleted, in which case
+ * re-using that ID would lead to incorrect accounting.
+ *
+ * Ensure that we skip any such subvol ids.
+ *
+ * We don't need to lock because this is only called
+ * during mount before we start doing things like creating
+ * subvolumes.
+ */
+ if (is_fstree(qgroup->qgroupid) &&
+ qgroup->qgroupid > tree_root->free_objectid)
+ /*
+ * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
+ * as it will get checked on the next call to
+ * btrfs_get_free_objectid.
+ */
+ tree_root->free_objectid = qgroup->qgroupid + 1;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
@@ -3820,14 +3840,14 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
/* we're resuming qgroup rescan at mount time */
if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
- ret = -EINVAL;
+ ret = -ENOTCONN;
}
if (ret)
@@ -3838,14 +3858,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- btrfs_warn(fs_info,
- "qgroup rescan is already in progress");
ret = -EINPROGRESS;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
- ret = -EINVAL;
+ ret = -ENOTCONN;
} else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
/* Quota disable is in progress */
ret = -EBUSY;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4b22cfe9a98c..afd6932f5e89 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2100,7 +2100,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
const u64 logical_end = logical_start + logical_length;
u64 cur_logical = logical_start;
- int ret;
+ int ret = 0;
/* The range must be inside the bg */
ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2dbc930a20f7..f05cce7c8b8d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -119,6 +119,7 @@ enum {
Opt_thread_pool,
Opt_treelog,
Opt_user_subvol_rm_allowed,
+ Opt_norecovery,
/* Rescue options */
Opt_rescue,
@@ -245,6 +246,8 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
__fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
__fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
+ /* For compatibility only, alias for "rescue=nologreplay". */
+ fsparam_flag("norecovery", Opt_norecovery),
/* Debugging options. */
fsparam_flag_no("enospc_debug", Opt_enospc_debug),
@@ -438,6 +441,11 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
+ case Opt_norecovery:
+ btrfs_info(NULL,
+"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'");
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
case Opt_flushoncommit:
if (result.negated)
btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b6a701011fb0..c39145e8c4ad 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -482,10 +482,12 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
if (flush)
sync_blockdev(bdev);
- ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
- if (ret) {
- fput(*bdev_file);
- goto error;
+ if (holder) {
+ ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
+ if (ret) {
+ fput(*bdev_file);
+ goto error;
+ }
}
invalidate_bdev(bdev);
*disk_super = btrfs_read_dev_super(bdev);
@@ -498,6 +500,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
return 0;
error:
+ *disk_super = NULL;
*bdev_file = NULL;
return ret;
}
@@ -1287,7 +1290,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
return ERR_PTR(-EINVAL);
/* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
+ page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -2714,7 +2717,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
btrfs_clear_sb_rdonly(sb);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 4cba80b34387..947a87576f6c 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -118,7 +118,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
return -ENOENT;
} else if (full[0] && full[1]) {
/* Compare two super blocks */
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
struct page *page[BTRFS_NR_SB_LOG_ZONES];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
int i;
@@ -1290,7 +1290,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct btrfs_chunk_map *map)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- struct btrfs_device *device = map->stripes[zone_idx].dev;
+ struct btrfs_device *device;
int dev_replace_is_ongoing = 0;
unsigned int nofs_flag;
struct blk_zone zone;
@@ -1298,7 +1298,11 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
info->physical = map->stripes[zone_idx].physical;
+ down_read(&dev_replace->rwsem);
+ device = map->stripes[zone_idx].dev;
+
if (!device->bdev) {
+ up_read(&dev_replace->rwsem);
info->alloc_offset = WP_MISSING_DEV;
return 0;
}
@@ -1308,6 +1312,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
__set_bit(zone_idx, active);
if (!btrfs_dev_is_sequential(device, info->physical)) {
+ up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
return 0;
}
@@ -1315,11 +1320,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
- down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
- up_read(&dev_replace->rwsem);
/*
* The group is mapped to a sequential zone. Get the zone write pointer
@@ -1330,6 +1333,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ up_read(&dev_replace->rwsem);
if (ret != -EIO && ret != -EOPNOTSUPP)
return ret;
info->alloc_offset = WP_MISSING_DEV;
@@ -1341,6 +1345,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
device->devid);
+ up_read(&dev_replace->rwsem);
return -EIO;
}
@@ -1368,6 +1373,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
break;
}
+ up_read(&dev_replace->rwsem);
+
return 0;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index ed698caa8834..8c19e705b9c3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -189,8 +189,8 @@ EXPORT_SYMBOL(end_buffer_write_sync);
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
- struct inode *bd_inode = bdev->bd_inode;
- struct address_space *bd_mapping = bd_inode->i_mapping;
+ struct address_space *bd_mapping = bdev->bd_mapping;
+ const int blkbits = bd_mapping->host->i_blkbits;
struct buffer_head *ret = NULL;
pgoff_t index;
struct buffer_head *bh;
@@ -199,7 +199,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
- index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
+ index = ((loff_t)block << blkbits) / PAGE_SIZE;
folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
if (IS_ERR(folio))
goto out;
@@ -233,7 +233,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
(unsigned long long)block,
(unsigned long long)bh->b_blocknr,
bh->b_state, bh->b_size, bdev,
- 1 << bd_inode->i_blkbits);
+ 1 << blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->i_private_lock);
@@ -1041,12 +1041,12 @@ static sector_t folio_init_buffers(struct folio *folio,
static bool grow_dev_folio(struct block_device *bdev, sector_t block,
pgoff_t index, unsigned size, gfp_t gfp)
{
- struct inode *inode = bdev->bd_inode;
+ struct address_space *mapping = bdev->bd_mapping;
struct folio *folio;
struct buffer_head *bh;
sector_t end_block = 0;
- folio = __filemap_get_folio(inode->i_mapping, index,
+ folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
if (IS_ERR(folio))
return false;
@@ -1080,10 +1080,10 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block,
* lock to be atomic wrt __find_get_block(), which does not
* run under the folio lock.
*/
- spin_lock(&inode->i_mapping->i_private_lock);
+ spin_lock(&mapping->i_private_lock);
link_dev_buffers(folio, bh);
end_block = folio_init_buffers(folio, bdev, size);
- spin_unlock(&inode->i_mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
unlock:
folio_unlock(folio);
folio_put(folio);
@@ -1486,7 +1486,7 @@ struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
{
struct buffer_head *bh;
- gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+ gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
/*
* Prefer looping in the allocator rather than here, at least that
@@ -1719,16 +1719,16 @@ EXPORT_SYMBOL(create_empty_buffers);
*/
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
- struct inode *bd_inode = bdev->bd_inode;
- struct address_space *bd_mapping = bd_inode->i_mapping;
+ struct address_space *bd_mapping = bdev->bd_mapping;
+ const int blkbits = bd_mapping->host->i_blkbits;
struct folio_batch fbatch;
- pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
+ pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
pgoff_t end;
int i, count;
struct buffer_head *bh;
struct buffer_head *head;
- end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE;
+ end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
folio_batch_init(&fbatch);
while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
count = folio_batch_count(&fbatch);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7ade836beb58..f53977169db4 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -563,8 +563,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
*/
path.mnt = cache->mnt;
path.dentry = dentry;
- file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
- d_backing_inode(dentry), cache->cache_cred);
+ file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred);
if (IS_ERR(file)) {
trace_cachefiles_vfs_error(object, d_backing_inode(dentry),
PTR_ERR(file),
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0e9f56eaba1e..82a2e2a06a65 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1336,8 +1336,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
+ struct dentry *dn;
int err = -EROFS;
int op;
+ char *path;
+ int pathlen;
+ u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1351,6 +1355,30 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
+
+ dn = d_find_alias(dir);
+ if (!dn) {
+ try_async = false;
+ } else {
+ path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ if (IS_ERR(path)) {
+ try_async = false;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
+ }
+ ceph_mdsc_free_path(path, pathlen);
+ dput(dn);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ try_async = false;
+ err = 0;
+ }
+ }
+
retry:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 16873d07692f..4b8d59ebda00 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -366,6 +366,12 @@ int ceph_open(struct inode *inode, struct file *file)
struct ceph_file_info *fi = file->private_data;
int err;
int flags, fmode, wanted;
+ struct dentry *dentry;
+ char *path;
+ int pathlen;
+ u64 pathbase;
+ bool do_sync = false;
+ int mask = MAY_READ;
if (fi) {
doutc(cl, "file %p is already opened\n", file);
@@ -387,6 +393,31 @@ int ceph_open(struct inode *inode, struct file *file)
fmode = ceph_flags_to_mode(flags);
wanted = ceph_caps_for_mode(fmode);
+ if (fmode & CEPH_FILE_MODE_WR)
+ mask |= MAY_WRITE;
+ dentry = d_find_alias(inode);
+ if (!dentry) {
+ do_sync = true;
+ } else {
+ path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ if (IS_ERR(path)) {
+ do_sync = true;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, mask);
+ }
+ ceph_mdsc_free_path(path, pathlen);
+ dput(dentry);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ do_sync = true;
+ err = 0;
+ }
+ }
+
/* snapped files are read-only */
if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
return -EROFS;
@@ -402,7 +433,7 @@ int ceph_open(struct inode *inode, struct file *file)
* asynchronously.
*/
spin_lock(&ci->i_ceph_lock);
- if (__ceph_is_any_real_caps(ci) &&
+ if (!do_sync && __ceph_is_any_real_caps(ci) &&
(((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
int mds_wanted = __ceph_caps_mds_wanted(ci, true);
int issued = __ceph_caps_issued(ci, NULL);
@@ -420,7 +451,7 @@ int ceph_open(struct inode *inode, struct file *file)
ceph_check_caps(ci, 0);
return ceph_init_file(inode, file, fmode);
- } else if (ceph_snap(inode) != CEPH_NOSNAP &&
+ } else if (!do_sync && ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
@@ -759,6 +790,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int mask;
int err;
+ char *path;
+ int pathlen;
+ u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -776,6 +810,34 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
*/
flags &= ~O_TRUNC;
+ dn = d_find_alias(dir);
+ if (!dn) {
+ try_async = false;
+ } else {
+ path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ if (IS_ERR(path)) {
+ try_async = false;
+ err = 0;
+ } else {
+ int fmode = ceph_flags_to_mode(flags);
+
+ mask = MAY_READ;
+ if (fmode & CEPH_FILE_MODE_WR)
+ mask |= MAY_WRITE;
+ err = ceph_mds_check_access(mdsc, path, mask);
+ }
+ ceph_mdsc_free_path(path, pathlen);
+ dput(dn);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ try_async = false;
+ err = 0;
+ }
+ }
+
retry:
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 99561fddcb38..249ddfbb1b03 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2482,6 +2482,34 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
bool lock_snap_rwsem = false;
bool fill_fscrypt;
int truncate_retry = 20; /* The RMW will take around 50ms */
+ struct dentry *dentry;
+ char *path;
+ int pathlen;
+ u64 pathbase;
+ bool do_sync = false;
+
+ dentry = d_find_alias(inode);
+ if (!dentry) {
+ do_sync = true;
+ } else {
+ path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ if (IS_ERR(path)) {
+ do_sync = true;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
+ }
+ ceph_mdsc_free_path(path, pathlen);
+ dput(dentry);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ do_sync = true;
+ err = 0;
+ }
+ }
retry:
prealloc_cf = ceph_alloc_cap_flush();
@@ -2528,7 +2556,7 @@ retry:
/* It should never be re-set once set */
WARN_ON_ONCE(ci->fscrypt_auth);
- if (issued & CEPH_CAP_AUTH_EXCL) {
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
dirtied |= CEPH_CAP_AUTH_EXCL;
kfree(ci->fscrypt_auth);
ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
@@ -2557,7 +2585,7 @@ retry:
ceph_vinop(inode),
from_kuid(&init_user_ns, inode->i_uid),
from_kuid(&init_user_ns, attr->ia_uid));
- if (issued & CEPH_CAP_AUTH_EXCL) {
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
inode->i_uid = fsuid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
@@ -2575,7 +2603,7 @@ retry:
ceph_vinop(inode),
from_kgid(&init_user_ns, inode->i_gid),
from_kgid(&init_user_ns, attr->ia_gid));
- if (issued & CEPH_CAP_AUTH_EXCL) {
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
inode->i_gid = fsgid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
@@ -2589,7 +2617,7 @@ retry:
if (ia_valid & ATTR_MODE) {
doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
ceph_vinop(inode), inode->i_mode, attr->ia_mode);
- if (issued & CEPH_CAP_AUTH_EXCL) {
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
inode->i_mode = attr->ia_mode;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
@@ -2608,11 +2636,11 @@ retry:
inode, ceph_vinop(inode),
atime.tv_sec, atime.tv_nsec,
attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
- if (issued & CEPH_CAP_FILE_EXCL) {
+ if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_WR) &&
+ } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
timespec64_compare(&atime,
&attr->ia_atime) < 0) {
inode_set_atime_to_ts(inode, attr->ia_atime);
@@ -2648,7 +2676,7 @@ retry:
CEPH_FSCRYPT_BLOCK_SIZE));
req->r_fscrypt_file = attr->ia_size;
fill_fscrypt = true;
- } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+ } else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
if (attr->ia_size > isize) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
@@ -2685,11 +2713,11 @@ retry:
inode, ceph_vinop(inode),
mtime.tv_sec, mtime.tv_nsec,
attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
- if (issued & CEPH_CAP_FILE_EXCL) {
+ if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_WR) &&
+ } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_WR;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 360b686c3c67..c2157f6e0c69 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4112,10 +4112,13 @@ static void handle_session(struct ceph_mds_session *session,
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
struct ceph_mds_session_head *h;
- u32 op;
+ struct ceph_mds_cap_auth *cap_auths = NULL;
+ u32 op, cap_auths_num = 0;
u64 seq, features = 0;
int wake = 0;
bool blocklisted = false;
+ u32 i;
+
/* decode */
ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -4160,7 +4163,101 @@ static void handle_session(struct ceph_mds_session *session,
}
}
+ if (msg_version >= 6) {
+ ceph_decode_32_safe(&p, end, cap_auths_num, bad);
+ doutc(cl, "cap_auths_num %d\n", cap_auths_num);
+
+ if (cap_auths_num && op != CEPH_SESSION_OPEN) {
+ WARN_ON_ONCE(op != CEPH_SESSION_OPEN);
+ goto skip_cap_auths;
+ }
+
+ cap_auths = kcalloc(cap_auths_num,
+ sizeof(struct ceph_mds_cap_auth),
+ GFP_KERNEL);
+ if (!cap_auths) {
+ pr_err_client(cl, "No memory for cap_auths\n");
+ return;
+ }
+
+ for (i = 0; i < cap_auths_num; i++) {
+ u32 _len, j;
+
+ /* struct_v, struct_compat, and struct_len in MDSCapAuth */
+ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad);
+
+ /* struct_v, struct_compat, and struct_len in MDSCapMatch */
+ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad);
+ ceph_decode_64_safe(&p, end, cap_auths[i].match.uid, bad);
+ ceph_decode_32_safe(&p, end, _len, bad);
+ if (_len) {
+ cap_auths[i].match.gids = kcalloc(_len, sizeof(u32),
+ GFP_KERNEL);
+ if (!cap_auths[i].match.gids) {
+ pr_err_client(cl, "No memory for gids\n");
+ goto fail;
+ }
+
+ cap_auths[i].match.num_gids = _len;
+ for (j = 0; j < _len; j++)
+ ceph_decode_32_safe(&p, end,
+ cap_auths[i].match.gids[j],
+ bad);
+ }
+
+ ceph_decode_32_safe(&p, end, _len, bad);
+ if (_len) {
+ cap_auths[i].match.path = kcalloc(_len + 1, sizeof(char),
+ GFP_KERNEL);
+ if (!cap_auths[i].match.path) {
+ pr_err_client(cl, "No memory for path\n");
+ goto fail;
+ }
+ ceph_decode_copy(&p, cap_auths[i].match.path, _len);
+
+ /* Remove the tailing '/' */
+ while (_len && cap_auths[i].match.path[_len - 1] == '/') {
+ cap_auths[i].match.path[_len - 1] = '\0';
+ _len -= 1;
+ }
+ }
+
+ ceph_decode_32_safe(&p, end, _len, bad);
+ if (_len) {
+ cap_auths[i].match.fs_name = kcalloc(_len + 1, sizeof(char),
+ GFP_KERNEL);
+ if (!cap_auths[i].match.fs_name) {
+ pr_err_client(cl, "No memory for fs_name\n");
+ goto fail;
+ }
+ ceph_decode_copy(&p, cap_auths[i].match.fs_name, _len);
+ }
+
+ ceph_decode_8_safe(&p, end, cap_auths[i].match.root_squash, bad);
+ ceph_decode_8_safe(&p, end, cap_auths[i].readable, bad);
+ ceph_decode_8_safe(&p, end, cap_auths[i].writeable, bad);
+ doutc(cl, "uid %lld, num_gids %u, path %s, fs_name %s, root_squash %d, readable %d, writeable %d\n",
+ cap_auths[i].match.uid, cap_auths[i].match.num_gids,
+ cap_auths[i].match.path, cap_auths[i].match.fs_name,
+ cap_auths[i].match.root_squash,
+ cap_auths[i].readable, cap_auths[i].writeable);
+ }
+ }
+
+skip_cap_auths:
mutex_lock(&mdsc->mutex);
+ if (op == CEPH_SESSION_OPEN) {
+ if (mdsc->s_cap_auths) {
+ for (i = 0; i < mdsc->s_cap_auths_num; i++) {
+ kfree(mdsc->s_cap_auths[i].match.gids);
+ kfree(mdsc->s_cap_auths[i].match.path);
+ kfree(mdsc->s_cap_auths[i].match.fs_name);
+ }
+ kfree(mdsc->s_cap_auths);
+ }
+ mdsc->s_cap_auths_num = cap_auths_num;
+ mdsc->s_cap_auths = cap_auths;
+ }
if (op == CEPH_SESSION_CLOSE) {
ceph_get_mds_session(session);
__unregister_session(mdsc, session);
@@ -4290,6 +4387,13 @@ bad:
pr_err_client(cl, "corrupt message mds%d len %d\n", mds,
(int)msg->front.iov_len);
ceph_msg_dump(msg);
+fail:
+ for (i = 0; i < cap_auths_num; i++) {
+ kfree(cap_auths[i].match.gids);
+ kfree(cap_auths[i].match.path);
+ kfree(cap_auths[i].match.fs_name);
+ }
+ kfree(cap_auths);
return;
}
@@ -5499,6 +5603,170 @@ void send_flush_mdlog(struct ceph_mds_session *s)
mutex_unlock(&s->s_mutex);
}
+static int ceph_mds_auth_match(struct ceph_mds_client *mdsc,
+ struct ceph_mds_cap_auth *auth,
+ char *tpath)
+{
+ const struct cred *cred = get_current_cred();
+ u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid);
+ u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid);
+ struct ceph_client *cl = mdsc->fsc->client;
+ const char *spath = mdsc->fsc->mount_options->server_path;
+ bool gid_matched = false;
+ u32 gid, tlen, len;
+ int i, j;
+
+ doutc(cl, "match.uid %lld\n", auth->match.uid);
+ if (auth->match.uid != MDS_AUTH_UID_ANY) {
+ if (auth->match.uid != caller_uid)
+ return 0;
+ if (auth->match.num_gids) {
+ for (i = 0; i < auth->match.num_gids; i++) {
+ if (caller_gid == auth->match.gids[i])
+ gid_matched = true;
+ }
+ if (!gid_matched && cred->group_info->ngroups) {
+ for (i = 0; i < cred->group_info->ngroups; i++) {
+ gid = from_kgid(&init_user_ns,
+ cred->group_info->gid[i]);
+ for (j = 0; j < auth->match.num_gids; j++) {
+ if (gid == auth->match.gids[j]) {
+ gid_matched = true;
+ break;
+ }
+ }
+ if (gid_matched)
+ break;
+ }
+ }
+ if (!gid_matched)
+ return 0;
+ }
+ }
+
+ /* path match */
+ if (auth->match.path) {
+ if (!tpath)
+ return 0;
+
+ tlen = strlen(tpath);
+ len = strlen(auth->match.path);
+ if (len) {
+ char *_tpath = tpath;
+ bool free_tpath = false;
+ int m, n;
+
+ doutc(cl, "server path %s, tpath %s, match.path %s\n",
+ spath, tpath, auth->match.path);
+ if (spath && (m = strlen(spath)) != 1) {
+ /* mount path + '/' + tpath + an extra space */
+ n = m + 1 + tlen + 1;
+ _tpath = kmalloc(n, GFP_NOFS);
+ if (!_tpath)
+ return -ENOMEM;
+ /* remove the leading '/' */
+ snprintf(_tpath, n, "%s/%s", spath + 1, tpath);
+ free_tpath = true;
+ tlen = strlen(_tpath);
+ }
+
+ /*
+ * Please note the tailing '/' for match.path has already
+ * been removed when parsing.
+ *
+ * Remove the tailing '/' for the target path.
+ */
+ while (tlen && _tpath[tlen - 1] == '/') {
+ _tpath[tlen - 1] = '\0';
+ tlen -= 1;
+ }
+ doutc(cl, "_tpath %s\n", _tpath);
+
+ /*
+ * In case first == _tpath && tlen == len:
+ * match.path=/foo --> /foo _path=/foo --> match
+ * match.path=/foo/ --> /foo _path=/foo --> match
+ *
+ * In case first == _tmatch.path && tlen > len:
+ * match.path=/foo/ --> /foo _path=/foo/ --> match
+ * match.path=/foo --> /foo _path=/foo/ --> match
+ * match.path=/foo/ --> /foo _path=/foo/d --> match
+ * match.path=/foo --> /foo _path=/food --> mismatch
+ *
+ * All the other cases --> mismatch
+ */
+ char *first = strstr(_tpath, auth->match.path);
+ if (first != _tpath) {
+ if (free_tpath)
+ kfree(_tpath);
+ return 0;
+ }
+
+ if (tlen > len && _tpath[len] != '/') {
+ if (free_tpath)
+ kfree(_tpath);
+ return 0;
+ }
+ }
+ }
+
+ doutc(cl, "matched\n");
+ return 1;
+}
+
+int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask)
+{
+ const struct cred *cred = get_current_cred();
+ u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid);
+ u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid);
+ struct ceph_mds_cap_auth *rw_perms_s = NULL;
+ struct ceph_client *cl = mdsc->fsc->client;
+ bool root_squash_perms = true;
+ int i, err;
+
+ doutc(cl, "tpath '%s', mask %d, caller_uid %d, caller_gid %d\n",
+ tpath, mask, caller_uid, caller_gid);
+
+ for (i = 0; i < mdsc->s_cap_auths_num; i++) {
+ struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i];
+
+ err = ceph_mds_auth_match(mdsc, s, tpath);
+ if (err < 0) {
+ return err;
+ } else if (err > 0) {
+ /* always follow the last auth caps' permision */
+ root_squash_perms = true;
+ rw_perms_s = NULL;
+ if ((mask & MAY_WRITE) && s->writeable &&
+ s->match.root_squash && (!caller_uid || !caller_gid))
+ root_squash_perms = false;
+
+ if (((mask & MAY_WRITE) && !s->writeable) ||
+ ((mask & MAY_READ) && !s->readable))
+ rw_perms_s = s;
+ }
+ }
+
+ doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms,
+ rw_perms_s);
+ if (root_squash_perms && rw_perms_s == NULL) {
+ doutc(cl, "access allowed\n");
+ return 0;
+ }
+
+ if (!root_squash_perms) {
+ doutc(cl, "root_squash is enabled and user(%d %d) isn't allowed to write",
+ caller_uid, caller_gid);
+ }
+ if (rw_perms_s) {
+ doutc(cl, "mds auth caps readable/writeable %d/%d while request r/w %d/%d",
+ rw_perms_s->readable, rw_perms_s->writeable,
+ !!(mask & MAY_READ), !!(mask & MAY_WRITE));
+ }
+ doutc(cl, "access denied\n");
+ return -EACCES;
+}
+
/*
* called before mount is ro, and before dentries are torn down.
* (hmm, does this still race with new lookups?)
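
[Editor's note: the path check in ceph_mds_auth_match() above reduces to a prefix match with a '/'-boundary rule, exactly as the long comment spells out. A standalone userspace model of just that rule; the names are hypothetical, and it assumes trailing slashes were already stripped from both paths, as the kernel code arranges:

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	/* "mpath" grants access to "tpath" iff tpath equals mpath or
	 * descends into it across a '/' boundary, so "/foo" matches
	 * "/foo" and "/foo/d" but not "/food". */
	static bool path_prefix_match(const char *mpath, const char *tpath)
	{
		size_t len = strlen(mpath);

		if (strncmp(tpath, mpath, len) != 0)
			return false;
		return tpath[len] == '\0' || tpath[len] == '/';
	}

	int main(void)
	{
		printf("%d\n", path_prefix_match("/foo", "/foo"));	/* 1 */
		printf("%d\n", path_prefix_match("/foo", "/foo/d"));	/* 1 */
		printf("%d\n", path_prefix_match("/foo", "/food"));	/* 0 */
		return 0;
	}
]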
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index b88e80415224..cfa18cf915a0 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,8 +35,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_32BITS_RETRY_FWD,
CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
CEPHFS_FEATURE_HAS_OWNER_UIDGID,
+ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -52,6 +53,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_OP_GETVXATTR, \
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
+ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \
}
/*
@@ -71,6 +73,24 @@ enum ceph_feature_type {
struct ceph_fs_client;
struct ceph_cap;
+#define MDS_AUTH_UID_ANY -1
+
+struct ceph_mds_cap_match {
+ s64 uid; /* default to MDS_AUTH_UID_ANY */
+ u32 num_gids;
+ u32 *gids; /* use these GIDs */
+ char *path; /* require path to be child of this
+ (may be "" or "/" for any) */
+ char *fs_name;
+ bool root_squash; /* default to false */
+};
+
+struct ceph_mds_cap_auth {
+ struct ceph_mds_cap_match match;
+ bool readable;
+ bool writeable;
+};
+
/*
* parsed info about a single inode. pointers are into the encoded
* on-wire structures within the mds reply message payload.
@@ -513,6 +533,9 @@ struct ceph_mds_client {
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
+ u32 s_cap_auths_num;
+ struct ceph_mds_cap_auth *s_cap_auths;
+
char nodename[__NEW_UTS_LEN + 1];
};
@@ -581,6 +604,9 @@ extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *, int mds, void *),
void *arg);
+extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
+ int mask);
+
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
static inline void ceph_mdsc_free_path(char *path, int len)
diff --git a/fs/coredump.c b/fs/coredump.c
index 317065e3eb9b..a57a06b80f57 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -376,9 +376,7 @@ static int zap_process(struct task_struct *start, int exit_code)
if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
- /* The vhost_worker does not particpate in coredumps */
- if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != PF_USER_WORKER)
- nr++;
+ nr++;
}
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9901057a15ba..460690ca0174 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -183,7 +183,7 @@ static int next_buffer;
static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
unsigned int len)
{
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *mapping = sb->s_bdev->bd_mapping;
struct file_ra_state ra = {};
struct page *pages[BLKS_PER_BUF];
unsigned i, blocknr, buffer;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 52524bd9698b..8be60797ea2f 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -29,11 +29,9 @@ void erofs_put_metabuf(struct erofs_buf *buf)
* Derive the block size from inode->i_blkbits to make compatible with
* anonymous inode in fscache mode.
*/
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type)
{
- struct inode *inode = buf->inode;
- erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits;
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
struct folio *folio;
@@ -43,7 +41,7 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
erofs_put_metabuf(buf);
nofs_flag = memalloc_nofs_save();
- folio = read_cache_folio(inode->i_mapping, index, NULL, NULL);
+ folio = read_cache_folio(buf->mapping, index, NULL, NULL);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(folio))
return folio;
@@ -68,16 +66,16 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
if (erofs_is_fscache_mode(sb))
- buf->inode = EROFS_SB(sb)->s_fscache->inode;
+ buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping;
else
- buf->inode = sb->s_bdev->bd_inode;
+ buf->mapping = sb->s_bdev->bd_mapping;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+ erofs_off_t offset, enum erofs_kmap_type type)
{
erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, blkaddr, type);
+ return erofs_bread(buf, offset, type);
}
static int erofs_map_blocks_flatmode(struct inode *inode,
@@ -154,7 +152,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
+ kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
if (IS_ERR(kaddr)) {
err = PTR_ERR(kaddr);
goto out;
@@ -165,7 +163,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
/* handle block map */
if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos);
+ __le32 *blkaddr = kaddr;
if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
map->m_flags = 0;
@@ -176,7 +174,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
goto out_unlock;
}
/* parse chunk indexes */
- idx = kaddr + erofs_blkoff(sb, pos);
+ idx = kaddr;
switch (le32_to_cpu(idx->blkaddr)) {
case EROFS_NULL_ADDR:
map->m_flags = 0;
@@ -296,11 +294,10 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb,
- erofs_blknr(sb, mdev.m_pa), EROFS_KMAP);
+ ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
- iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa);
+ iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
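
[Editor's note: the data.c rework above switches erofs_bread() and erofs_read_metabuf() from taking a block address to taking a raw byte offset, which is why each erofs_blkoff() re-adjustment at the call sites disappears; callers that still think in blocks convert with erofs_pos(), as the dir.c and namei.c hunks below show. The relationship between the helpers, paraphrased from the erofs headers (treat the exact definitions as an assumption):

	#define erofs_blknr(sb, pos)	((pos) >> (sb)->s_blocksize_bits)	/* byte -> block */
	#define erofs_blkoff(sb, pos)	((pos) & ((sb)->s_blocksize - 1))	/* byte -> offset in block */
	#define erofs_pos(sb, blk)	((erofs_off_t)(blk) << (sb)->s_blocksize_bits)	/* block -> byte */

	/* so for any byte position:
	 *	pos == erofs_pos(sb, erofs_blknr(sb, pos)) + erofs_blkoff(sb, pos)
	 */
]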
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 81e65c453ef0..3a3461561a3c 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -46,39 +46,15 @@ int __init z_erofs_deflate_init(void)
/* by default, use # of possible CPUs instead */
if (!z_erofs_deflate_nstrms)
z_erofs_deflate_nstrms = num_possible_cpus();
-
- for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
- ++z_erofs_deflate_avail_strms) {
- struct z_erofs_deflate *strm;
-
- strm = kzalloc(sizeof(*strm), GFP_KERNEL);
- if (!strm)
- goto out_failed;
-
- /* XXX: in-kernel zlib cannot shrink windowbits currently */
- strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
- if (!strm->z.workspace) {
- kfree(strm);
- goto out_failed;
- }
-
- spin_lock(&z_erofs_deflate_lock);
- strm->next = z_erofs_deflate_head;
- z_erofs_deflate_head = strm;
- spin_unlock(&z_erofs_deflate_lock);
- }
return 0;
-
-out_failed:
- erofs_err(NULL, "failed to allocate zlib workspace");
- z_erofs_deflate_exit();
- return -ENOMEM;
}
int z_erofs_load_deflate_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
struct z_erofs_deflate_cfgs *dfl = data;
+ static DEFINE_MUTEX(deflate_resize_mutex);
+ static bool inited;
if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
erofs_err(sb, "invalid deflate cfgs, size=%u", size);
@@ -89,9 +65,36 @@ int z_erofs_load_deflate_config(struct super_block *sb,
erofs_err(sb, "unsupported windowbits %u", dfl->windowbits);
return -EOPNOTSUPP;
}
+ mutex_lock(&deflate_resize_mutex);
+ if (!inited) {
+ for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
+ ++z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm)
+ goto failed;
+ /* XXX: in-kernel zlib cannot customize windowbits */
+ strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!strm->z.workspace) {
+ kfree(strm);
+ goto failed;
+ }
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ }
+ inited = true;
+ }
+ mutex_unlock(&deflate_resize_mutex);
erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!");
return 0;
+failed:
+ mutex_unlock(&deflate_resize_mutex);
+ z_erofs_deflate_exit();
+ return -ENOMEM;
}
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
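
[Editor's note: the deflate change moves the stream-pool allocation from module init into the first z_erofs_load_deflate_config() call, guarded by a local mutex plus an `inited` flag so concurrent mounts set the pool up exactly once, while a failed attempt leaves the flag clear for a later retry. A userspace model of that once-only-with-retry pattern, assuming pthreads; all names hypothetical:

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool inited;

	/* Returns 0 on success. On failure `inited` stays false, so the
	 * next caller retries the allocation instead of failing forever. */
	static int lazy_pool_init(int (*alloc_pool)(void))
	{
		int ret = 0;

		pthread_mutex_lock(&init_lock);
		if (!inited) {
			ret = alloc_pool();
			if (ret == 0)
				inited = true;
		}
		pthread_mutex_unlock(&init_lock);
		return ret;
	}
]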
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index b80abec0531a..2193a6710c8f 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -58,12 +58,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
int err = 0;
bool initial = true;
- buf.inode = dir;
+ buf.mapping = dir->i_mapping;
while (ctx->pos < dirsize) {
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, i, EROFS_KMAP);
+ de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP);
if (IS_ERR(de)) {
erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
i, EROFS_I(dir)->nid);
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 62da538d91cb..fda16eedafb5 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -273,21 +273,15 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
if (map.m_flags & EROFS_MAP_META) {
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct iov_iter iter;
- erofs_blk_t blknr;
- size_t offset, size;
+ size_t size = map.m_llen;
void *src;
- /* For tail packing layout, the offset may be non-zero. */
- offset = erofs_blkoff(sb, map.m_pa);
- blknr = erofs_blknr(sb, map.m_pa);
- size = map.m_llen;
-
- src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP);
if (IS_ERR(src))
return PTR_ERR(src);
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
- if (copy_to_iter(src + offset, size, &iter) != size) {
+ if (copy_to_iter(src, size, &iter) != size) {
erofs_put_metabuf(&buf);
return -EFAULT;
}
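
Note: beyond taking a byte offset, erofs_read_metabuf() now appears to return a pointer already advanced to that offset within the mapped block, which is why the separate blknr/offset bookkeeping and the `src + offset` fix-up disappear above. A toy model of that assumed contract:

    #include <stdio.h>
    #include <string.h>

    #define BLKSZ 4096ULL

    /* Toy metabuf: one cached block plus the block number it holds. */
    struct metabuf {
        unsigned long long blknr;
        char page[BLKSZ];
    };

    /* Map the block containing 'pos' (stubbed here) and return a pointer
     * at 'pos' itself, so callers never re-apply the in-block offset. */
    static void *read_metabuf(struct metabuf *b, unsigned long long pos)
    {
        unsigned long long blk = pos / BLKSZ;

        if (b->blknr != blk) {
            memset(b->page, 0, sizeof(b->page));   /* pretend disk read */
            b->blknr = blk;
        }
        return b->page + (pos % BLKSZ);
    }

    int main(void)
    {
        struct metabuf b = { .blknr = ~0ULL };
        char *p = read_metabuf(&b, 5 * BLKSZ + 128);

        printf("in-block offset: %td\n", p - b.page);  /* 128 */
        return 0;
    }
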
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 0eb0e6f933c3..5f6439a63af7 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -26,7 +26,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
blkaddr = erofs_blknr(sb, inode_loc);
*ofs = erofs_blkoff(sb, inode_loc);
- kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
+ kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
vi->nid, PTR_ERR(kaddr));
@@ -66,7 +66,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
goto err_out;
}
memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
+ kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr + 1),
EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 21def866a482..0c1b44ac9524 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -64,15 +64,12 @@ enum {
};
struct erofs_mount_opts {
-#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
/* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
unsigned int sync_decompress;
-
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
-#endif
unsigned int mount_opt;
};
@@ -216,7 +213,7 @@ enum erofs_kmap_type {
};
struct erofs_buf {
- struct inode *inode;
+ struct address_space *mapping;
struct page *page;
void *base;
enum erofs_kmap_type kmap_type;
@@ -402,11 +399,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type);
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type);
+ erofs_off_t offset, enum erofs_kmap_type type);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index f0110a78acb2..c94d0c1608a8 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -99,8 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
- buf.inode = dir;
- de = erofs_bread(&buf, mid, EROFS_KMAP);
+ buf.mapping = dir->i_mapping;
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP);
if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
@@ -171,7 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
qn.name = name->name;
qn.end = name->name + name->len;
- buf.inode = dir;
+ buf.mapping = dir->i_mapping;
ndirents = 0;
de = erofs_find_target_block(&buf, dir, &qn, &ndirents);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 044c79229a78..c93bd24d2771 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -132,11 +132,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr))
return ptr;
- len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]);
+ len = le16_to_cpu(*(__le16 *)ptr);
if (!len)
len = U16_MAX + 1;
buffer = kmalloc(len, GFP_KERNEL);
@@ -148,12 +148,12 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
for (i = 0; i < len; i += cnt) {
cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
len - i);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
}
- memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt);
+ memcpy(buffer + i, ptr, cnt);
*offset += cnt;
}
return buffer;
@@ -178,12 +178,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
struct file *bdev_file;
- void *ptr;
- ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
- dis = ptr + erofs_blkoff(sb, *pos);
+ dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
+ if (IS_ERR(dis))
+ return PTR_ERR(dis);
if (!sbi->devs->flatdev && !dif->path) {
if (!dis->tag[0]) {
@@ -943,26 +941,14 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
struct erofs_mount_opts *opt = &sbi->opt;
-#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(opt, XATTR_USER))
- seq_puts(seq, ",user_xattr");
- else
- seq_puts(seq, ",nouser_xattr");
-#endif
-#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(opt, POSIX_ACL))
- seq_puts(seq, ",acl");
- else
- seq_puts(seq, ",noacl");
-#endif
-#ifdef CONFIG_EROFS_FS_ZIP
- if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
- seq_puts(seq, ",cache_strategy=disabled");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
- seq_puts(seq, ",cache_strategy=readahead");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
- seq_puts(seq, ",cache_strategy=readaround");
-#endif
+ if (IS_ENABLED(CONFIG_EROFS_FS_XATTR))
+ seq_puts(seq, test_opt(opt, XATTR_USER) ?
+ ",user_xattr" : ",nouser_xattr");
+ if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL))
+ seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl");
+ if (IS_ENABLED(CONFIG_EROFS_FS_ZIP))
+ seq_printf(seq, ",cache_strategy=%s",
+ erofs_param_cache_strategy[opt->cache_strategy].name);
if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
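
Note: replacing the #ifdef blocks with IS_ENABLED() keeps every branch visible to the compiler in all configurations, so disabled options still get type-checked while dead-code elimination drops them from the object code; the cache-strategy names also appear to move into a shared erofs_param_cache_strategy table rather than a hand-written if/else chain. The same compile-time-constant trick in standalone form:

    #include <stdio.h>

    #define CONFIG_DEMO_XATTR 1     /* in the kernel, IS_ENABLED(CONFIG_...)
                                       expands Kconfig symbols to 0/1 */

    static void show_options(int user_xattr)
    {
        if (CONFIG_DEMO_XATTR)      /* constant: branch folds away when 0 */
            fputs(user_xattr ? ",user_xattr" : ",nouser_xattr", stdout);
    }

    int main(void)
    {
        show_options(1);
        putchar('\n');
        return 0;
    }
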
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index b58316b49a43..a90d7d649739 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -81,13 +81,13 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- ih = it.kaddr + erofs_blkoff(sb, it.pos);
+ ih = it.kaddr;
vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
@@ -102,16 +102,14 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos),
- EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)
- (it.kaddr + erofs_blkoff(sb, it.pos)));
+ vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr);
it.pos += sizeof(__le32);
}
erofs_put_metabuf(&it.buf);
@@ -185,12 +183,11 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
void *src;
for (processed = 0; processed < len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- src = it->kaddr + erofs_blkoff(sb, it->pos);
+ src = it->kaddr;
slice = min_t(unsigned int, sb->s_blocksize -
erofs_blkoff(sb, it->pos), len - processed);
memcpy(it->buffer + it->buffer_ofs, src, slice);
@@ -208,8 +205,7 @@ static int erofs_listxattr_foreach(struct erofs_xattr_iter *it)
int err;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(it->sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
base_index = entry.e_name_index;
@@ -259,8 +255,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
unsigned int slice, processed, value_sz;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
value_sz = le16_to_cpu(entry.e_value_size);
@@ -291,8 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
/* 2. handle xattr name */
for (processed = 0; processed < entry.e_name_len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -300,7 +294,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
sb->s_blocksize - erofs_blkoff(sb, it->pos),
entry.e_name_len - processed);
if (memcmp(it->name.name + it->infix_len + processed,
- it->kaddr + erofs_blkoff(sb, it->pos), slice))
+ it->kaddr, slice))
return -ENOATTR;
it->pos += slice;
}
@@ -336,13 +330,11 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
while (remaining) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- entry_sz = erofs_xattr_entry_size(it->kaddr +
- erofs_blkoff(it->sb, it->pos));
+ entry_sz = erofs_xattr_entry_size(it->kaddr);
/* xattr on-disk corruption: xattr entry beyond xattr_isize */
if (remaining < entry_sz) {
DBG_BUGON(1);
@@ -375,8 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
for (i = 0; i < vi->xattr_shared_count; ++i) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
vi->xattr_shared_xattrs[i] * sizeof(__le32);
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -492,7 +483,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
return -ENOMEM;
if (sbi->packed_inode)
- buf.inode = sbi->packed_inode;
+ buf.mapping = sbi->packed_inode->i_mapping;
else
erofs_init_metabuf(&buf, sb);
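
Note: all of the xattr loops above collapse to the same shape: read at an arbitrary byte position, consume at most up to the next block boundary, advance, repeat. The `sb->s_blocksize - erofs_blkoff(sb, it->pos)` bound is what keeps each erofs_bread() mapping within one block. A sketch of just that arithmetic:

    #include <stdio.h>

    #define BLKSZ 4096U

    /* Copy 'len' bytes starting at byte 'pos', one block-bounded slice
     * at a time; returns how many reads (block mappings) were needed. */
    static unsigned int copy_slices(unsigned long long pos, unsigned int len)
    {
        unsigned int processed, slice, nreads = 0;

        for (processed = 0; processed < len; processed += slice) {
            slice = BLKSZ - (unsigned int)(pos & (BLKSZ - 1));
            if (slice > len - processed)
                slice = len - processed;
            /* kaddr = erofs_bread(buf, pos); memcpy(dst, kaddr, slice); */
            pos += slice;
            nreads++;
        }
        return nreads;
    }

    int main(void)
    {
        printf("%u reads\n", copy_slices(4000, 200));  /* crosses a block: 2 */
        return 0;
    }
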
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 3216b920d369..d6fe002a4a71 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -868,7 +868,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
} else {
void *mptr;
- mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
+ mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
if (IS_ERR(mptr)) {
ret = PTR_ERR(mptr);
erofs_err(sb, "failed to get inline data %d", ret);
@@ -936,16 +936,16 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
if (!packed_inode)
return -EFSCORRUPTED;
- buf.inode = packed_inode;
+ buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
cnt = min_t(unsigned int, end - cur,
sb->s_blocksize - erofs_blkoff(sb, pos));
- src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
+ src = erofs_bread(&buf, pos, EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
- memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
+ memcpy_to_page(page, cur, src, cnt);
}
erofs_put_metabuf(&buf);
return 0;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 0a2454d8bcc1..9b248ee5fef2 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -34,13 +34,13 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
unsigned int advise;
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
+ pos, EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
m->lcn = lcn;
- di = m->kaddr + erofs_blkoff(inode->i_sb, pos);
+ di = m->kaddr;
advise = le16_to_cpu(di->di_advise);
m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
@@ -109,7 +109,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
{
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs;
+ unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
int i;
u8 *in, type;
bool big_pcluster;
@@ -127,11 +127,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
- eofs = erofs_blkoff(m->inode->i_sb, pos);
- base = round_down(eofs, vcnt << amortizedshift);
- in = m->kaddr + base;
+ bytes = pos & ((vcnt << amortizedshift) - 1);
- i = (eofs - base) >> amortizedshift;
+ in = m->kaddr - bytes;
+
+ i = bytes >> amortizedshift;
lo = decode_compactedbits(lobits, in, encodebits * i, &type);
m->type = type;
@@ -256,7 +256,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
out:
pos += lcn * (1 << amortizedshift);
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
+ pos, EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
@@ -570,7 +570,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
int err, headnr;
erofs_off_t pos;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
struct z_erofs_map_header *h;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
@@ -590,13 +589,12 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
+ h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
+ if (IS_ERR(h)) {
+ err = PTR_ERR(h);
goto out_unlock;
}
- h = kaddr + erofs_blkoff(sb, pos);
/*
* if the highest bit of the 8-byte map header is set, the whole file
* is stored in the packed inode. The rest bits keeps z_fragmentoff.
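
Note: the unpack_compacted_index() rewrite leans on kaddr now pointing at byte `pos` rather than at a block base: each amortized pack is `vcnt << amortizedshift` bytes, and assuming the index area keeps packs aligned to that size (which the old round_down() over the block offset also relied on), masking `pos` with the pack size recovers both the distance back to the pack start and the slot index, with no erofs_blkoff() detour. The arithmetic in isolation (values are arbitrary):

    #include <assert.h>

    int main(void)
    {
        unsigned int amortizedshift = 2, vcnt = 16;
        unsigned long long pos = 12345;                /* arbitrary position */
        unsigned int packsz = vcnt << amortizedshift;  /* 64-byte pack */
        unsigned int bytes = pos & (packsz - 1);       /* offset into pack */
        unsigned int i = bytes >> amortizedshift;      /* slot within pack */

        assert(pos - bytes == pos / packsz * packsz);  /* pack base */
        assert(i < vcnt);
        return 0;
    }
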
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index d6cfb1849580..d5bce83ad905 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -3,7 +3,6 @@ config EXT2_FS
tristate "Second extended fs support (DEPRECATED)"
select BUFFER_HEAD
select FS_IOMAP
- select LEGACY_DIRECT_IO
help
Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4fb155b5a958..087457061c6e 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -175,7 +175,6 @@ Eend:
(unsigned long) le32_to_cpu(p->inode));
}
fail:
- folio_set_error(folio);
return false;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 4ddc36f4dbd4..10b061ac5bc0 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -302,6 +302,12 @@ static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return generic_file_write_iter(iocb, from);
}
+static int ext2_file_open(struct inode *inode, struct file *filp)
+{
+ filp->f_mode |= FMODE_CAN_ODIRECT;
+ return dquot_file_open(inode, filp);
+}
+
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
.read_iter = ext2_file_read_iter,
@@ -311,7 +317,7 @@ const struct file_operations ext2_file_operations = {
.compat_ioctl = ext2_compat_ioctl,
#endif
.mmap = ext2_file_mmap,
- .open = dquot_file_open,
+ .open = ext2_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
.get_unmapped_area = thp_get_unmapped_area,
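
Note: with ext2's I/O paths fully on iomap (see the noop_direct_IO removals in the inode.c hunk just below, and the LEGACY_DIRECT_IO deselect above), O_DIRECT support is advertised per open file via FMODE_CAN_ODIRECT rather than via the presence of an aops->direct_IO method. A stand-alone sketch of that capability flag; the struct and the VFS-side check are stand-ins for the <linux/fs.h> originals, not copies of them:

    #include <stdbool.h>
    #include <stdio.h>

    #define FMODE_CAN_ODIRECT (1u << 0)          /* stand-in flag bit */
    struct file { unsigned int f_mode; };

    static void demo_open(struct file *filp)
    {
        filp->f_mode |= FMODE_CAN_ODIRECT;       /* opt this file in */
    }

    /* Roughly how the VFS capability check is assumed to look. */
    static bool may_odirect(const struct file *filp)
    {
        return filp->f_mode & FMODE_CAN_ODIRECT;
    }

    int main(void)
    {
        struct file f = { 0 };

        demo_open(&f);
        printf("O_DIRECT allowed: %d\n", may_odirect(&f));
        return 0;
    }
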
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index f3d570a9302b..0caa1650cee8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -965,7 +965,6 @@ const struct address_space_operations ext2_aops = {
.write_begin = ext2_write_begin,
.write_end = ext2_write_end,
.bmap = ext2_bmap,
- .direct_IO = noop_direct_IO,
.writepages = ext2_writepages,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -974,7 +973,6 @@ const struct address_space_operations ext2_aops = {
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
- .direct_IO = noop_direct_IO,
.dirty_folio = noop_dirty_folio,
};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3985f8c33f95..ff4514e4626b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -192,7 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
(PAGE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
- sb->s_bdev->bd_inode->i_mapping,
+ sb->s_bdev->bd_mapping,
&file->f_ra, file,
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 5d8055161acd..da4a82456383 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -206,7 +206,7 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
static void ext4_check_bdev_write_error(struct super_block *sb)
{
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *mapping = sb->s_bdev->bd_mapping;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 893ab80dafba..c682fb927b64 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -244,7 +244,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
blk_opf_t op_flags)
{
- gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
~__GFP_FS) | __GFP_MOVABLE;
return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
@@ -253,7 +253,7 @@ struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block)
{
- gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
~__GFP_FS);
return __ext4_sb_bread_gfp(sb, block, 0, gfp);
@@ -492,22 +492,6 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
-/*
- * The del_gendisk() function uninitializes the disk-specific data
- * structures, including the bdi structure, without telling anyone
- * else. Once this happens, any attempt to call mark_buffer_dirty()
- * (for example, by ext4_commit_super), will cause a kernel OOPS.
- * This is a kludge to prevent these oops until we can put in a proper
- * hook in del_gendisk() to inform the VFS and file system layers.
- */
-static int block_device_ejected(struct super_block *sb)
-{
- struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
-
- return bdi->dev == NULL;
-}
-
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
@@ -5563,7 +5547,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* used to detect the metadata async write error.
*/
spin_lock_init(&sbi->s_bdev_wb_lock);
- errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
&sbi->s_bdev_wb_err);
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
@@ -5866,7 +5850,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
- set_blocksize(bdev, blocksize);
+ set_blocksize(bdev_file, blocksize);
bh = __bread(bdev, sb_block, blocksize);
if (!bh) {
ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
@@ -6164,8 +6148,6 @@ static int ext4_commit_super(struct super_block *sb)
if (!sbh)
return -EINVAL;
- if (block_device_ejected(sb))
- return -ENODEV;
ext4_update_super(sb);
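
Note: the ext4 hunks are mostly mechanical: every `sb->s_bdev->bd_inode->i_mapping` becomes `sb->s_bdev->bd_mapping`, a cached pointer to the same page cache, taken as a step toward hiding the bdev inode from filesystem code; the now-redundant block_device_ejected() kludge goes with it. The shape of the change as a compilable sketch (demo types, not the kernel's):

    #include <assert.h>

    struct address_space { int dummy; };
    struct inode_demo { struct address_space *i_mapping; };
    struct block_device_demo {
        struct inode_demo *bd_inode;       /* being phased out of fs code */
        struct address_space *bd_mapping;  /* cached == bd_inode->i_mapping */
    };

    int main(void)
    {
        struct address_space pages = { 0 };
        struct inode_demo bd_inode = { &pages };
        struct block_device_demo bdev = { &bd_inode, bd_inode.i_mapping };

        /* old: bdev.bd_inode->i_mapping;  new: bdev.bd_mapping */
        assert(bdev.bd_inode->i_mapping == bdev.bd_mapping);
        return 0;
    }
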
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index eac698b8dd38..55d444bec5c0 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -179,22 +179,22 @@ static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
break;
case META_SIT:
if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
- goto err;
+ goto check_only;
break;
case META_SSA:
if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
blkaddr < SM_I(sbi)->ssa_blkaddr))
- goto err;
+ goto check_only;
break;
case META_CP:
if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
blkaddr < __start_cp_addr(sbi)))
- goto err;
+ goto check_only;
break;
case META_POR:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi)))
- goto err;
+ goto check_only;
break;
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
@@ -228,6 +228,7 @@ static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
return true;
err:
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+check_only:
return false;
}
@@ -345,7 +346,7 @@ static int __f2fs_write_meta_page(struct page *page,
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- trace_f2fs_writepage(page, META);
+ trace_f2fs_writepage(page_folio(page), META);
if (unlikely(f2fs_cp_error(sbi))) {
if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
@@ -492,7 +493,7 @@ stop:
static bool f2fs_dirty_meta_folio(struct address_space *mapping,
struct folio *folio)
{
- trace_f2fs_set_page_dirty(&folio->page, META);
+ trace_f2fs_set_page_dirty(folio, META);
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
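
Note: the err/check_only label split above separates "this address fails a range probe" from "this address proves on-disk corruption": only the latter should trip f2fs_handle_error() and mark the superblock. In miniature:

    #include <stdbool.h>

    /* type 0: caller is only probing a range (META_SIT/SSA/CP/POR above);
     * other:  a miss means genuine corruption and must be recorded.      */
    static bool valid_blkaddr(int type, unsigned int blkaddr,
                              unsigned int lo, unsigned int hi,
                              bool *fs_corrupted)
    {
        if (blkaddr >= lo && blkaddr < hi)
            return true;
        if (type == 0)
            goto check_only;              /* no error bookkeeping */
        *fs_corrupted = true;             /* f2fs_handle_error(...) */
    check_only:
        return false;
    }

    int main(void)
    {
        bool corrupted = false;

        valid_blkaddr(0, 9999, 0, 100, &corrupted);  /* probe: stays clean */
        return corrupted;                            /* 0 */
    }
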
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 8892c8262141..1ef82a546391 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -198,8 +198,8 @@ static int lzo_compress_pages(struct compress_ctx *cc)
ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
&cc->clen, cc->private);
if (ret != LZO_E_OK) {
- printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "lzo compress failed, ret:%d", ret);
return -EIO;
}
return 0;
@@ -212,17 +212,15 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic)
ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen,
dic->rbuf, &dic->rlen);
if (ret != LZO_E_OK) {
- printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lzo decompress failed, ret:%d", ret);
return -EIO;
}
if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, "
- "expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
- dic->rlen,
- PAGE_SIZE << dic->log_cluster_size);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lzo invalid rlen:%zu, expected:%lu",
+ dic->rlen, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
return 0;
@@ -294,16 +292,15 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf,
dic->clen, dic->rlen);
if (ret < 0) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lz4 decompress failed, ret:%d", ret);
return -EIO;
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
- "expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id, ret,
- PAGE_SIZE << dic->log_cluster_size);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "lz4 invalid ret:%d, expected:%lu",
+ ret, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
return 0;
@@ -350,9 +347,8 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
stream = zstd_init_cstream(&params, 0, workspace, workspace_size);
if (!stream) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_cstream failed\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
- __func__);
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "%s zstd_init_cstream failed", __func__);
kvfree(workspace);
return -EIO;
}
@@ -390,16 +386,16 @@ static int zstd_compress_pages(struct compress_ctx *cc)
ret = zstd_compress_stream(stream, &outbuf, &inbuf);
if (zstd_is_error(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_compress_stream failed, ret: %d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "%s zstd_compress_stream failed, ret: %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
ret = zstd_end_stream(stream, &outbuf);
if (zstd_is_error(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_end_stream returned %d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "%s zstd_end_stream returned %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
@@ -432,9 +428,8 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
stream = zstd_init_dstream(max_window_size, workspace, workspace_size);
if (!stream) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_dstream failed\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
- __func__);
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "%s zstd_init_dstream failed", __func__);
kvfree(workspace);
return -EIO;
}
@@ -469,16 +464,15 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic)
ret = zstd_decompress_stream(stream, &outbuf, &inbuf);
if (zstd_is_error(ret)) {
- printk_ratelimited("%sF2FS-fs (%s): %s zstd_decompress_stream failed, ret: %d\n",
- KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "%s zstd_decompress_stream failed, ret: %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
if (dic->rlen != outbuf.pos) {
- printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, "
- "expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
+ f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ "%s ZSTD invalid rlen:%zu, expected:%lu",
__func__, dic->rlen,
PAGE_SIZE << dic->log_cluster_size);
return -EIO;
@@ -1031,6 +1025,31 @@ static void set_cluster_writeback(struct compress_ctx *cc)
}
}
+static void cancel_cluster_writeback(struct compress_ctx *cc,
+ struct compress_io_ctx *cic, int submitted)
+{
+ int i;
+
+ /* Wait for submitted IOs. */
+ if (submitted > 1) {
+ f2fs_submit_merged_write(F2FS_I_SB(cc->inode), DATA);
+ while (atomic_read(&cic->pending_pages) !=
+ (cc->valid_nr_cpages - submitted + 1))
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ }
+
+ /* Cancel writeback and stay locked. */
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (i < submitted) {
+ inode_inc_dirty_pages(cc->inode);
+ lock_page(cc->rpages[i]);
+ }
+ clear_page_private_gcing(cc->rpages[i]);
+ if (folio_test_writeback(page_folio(cc->rpages[i])))
+ end_page_writeback(cc->rpages[i]);
+ }
+}
+
static void set_cluster_dirty(struct compress_ctx *cc)
{
int i;
@@ -1232,7 +1251,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.page = NULL,
.encrypted_page = NULL,
.compressed_page = NULL,
- .submitted = 0,
.io_type = io_type,
.io_wbc = wbc,
.encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ?
@@ -1358,7 +1376,16 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
fio.compressed_page = cc->cpages[i - 1];
cc->cpages[i - 1] = NULL;
+ fio.submitted = 0;
f2fs_outplace_write_data(&dn, &fio);
+ if (unlikely(!fio.submitted)) {
+ cancel_cluster_writeback(cc, cic, i);
+
+ /* To call fscrypt_finalize_bounce_page */
+ i = cc->valid_nr_cpages;
+ *submitted = 0;
+ goto out_destroy_crypt;
+ }
(*submitted)++;
unlock_continue:
inode_dec_dirty_pages(cc->inode);
@@ -1392,8 +1419,11 @@ unlock_continue:
out_destroy_crypt:
page_array_free(cc->inode, cic->rpages, cc->cluster_size);
- for (--i; i >= 0; i--)
+ for (--i; i >= 0; i--) {
+ if (!cc->cpages[i])
+ continue;
fscrypt_finalize_bounce_page(&cc->cpages[i]);
+ }
out_put_cic:
kmem_cache_free(cic_entry_slab, cic);
out_put_dnode:
@@ -1484,7 +1514,7 @@ continue_unlock:
if (!PageDirty(cc->rpages[i]))
goto continue_unlock;
- if (PageWriteback(cc->rpages[i])) {
+ if (folio_test_writeback(page_folio(cc->rpages[i]))) {
if (wbc->sync_mode == WB_SYNC_NONE)
goto continue_unlock;
f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true);
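
Note: cancel_cluster_writeback() handles the new partial-failure case in f2fs_write_compressed_pages(): if the i-th out-of-place write is never submitted, the cluster cannot complete, so the code waits until the pending-page counter accounts for every bio that did go out, then re-locks and clears writeback on the whole cluster. A userspace model of that counting, assuming the counter starts at one reference per compressed page (C11 atomics standing in for atomic_t; names illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void)
    {
        int valid = 8;                 /* compressed pages in the cluster */
        int submitted = 3;             /* submission of page 3 failed */
        atomic_int pending;

        atomic_init(&pending, valid);  /* assumed: one ref per page up front */

        /* completions arrive for the 'submitted - 1' bios in flight... */
        for (int done = 0; done < submitted - 1; done++)
            atomic_fetch_sub(&pending, 1);

        /* ...and the cancel path waits for exactly that state. */
        while (atomic_load(&pending) != valid - submitted + 1)
            ;                          /* f2fs_io_schedule_timeout() here */

        printf("in-flight IO drained; roll back %d pages\n", valid);
        return 0;
    }
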
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 961e6ff77c72..b9b0debc6b3d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -465,6 +465,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
} else {
bio->bi_end_io = f2fs_write_end_io;
bio->bi_private = sbi;
+ bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
+ fio->type, fio->temp);
}
iostat_alloc_and_bind_ctx(sbi, bio, NULL);
@@ -593,17 +595,20 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi)
return -ENOMEM;
for (j = HOT; j < n; j++) {
- init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
- sbi->write_io[i][j].sbi = sbi;
- sbi->write_io[i][j].bio = NULL;
- spin_lock_init(&sbi->write_io[i][j].io_lock);
- INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
- INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ struct f2fs_bio_info *io = &sbi->write_io[i][j];
+
+ init_f2fs_rwsem(&io->io_rwsem);
+ io->sbi = sbi;
+ io->bio = NULL;
+ io->last_block_in_bio = 0;
+ spin_lock_init(&io->io_lock);
+ INIT_LIST_HEAD(&io->io_list);
+ INIT_LIST_HEAD(&io->bio_list);
+ init_f2fs_rwsem(&io->bio_list_lock);
#ifdef CONFIG_BLK_DEV_ZONED
- init_completion(&sbi->write_io[i][j].zone_wait);
- sbi->write_io[i][j].zone_pending_bio = NULL;
- sbi->write_io[i][j].bi_private = NULL;
+ init_completion(&io->zone_wait);
+ io->zone_pending_bio = NULL;
+ io->bi_private = NULL;
#endif
}
}
@@ -1507,6 +1512,25 @@ static bool f2fs_map_blocks_cached(struct inode *inode,
return true;
}
+static bool map_is_mergeable(struct f2fs_sb_info *sbi,
+ struct f2fs_map_blocks *map,
+ block_t blkaddr, int flag, int bidx,
+ int ofs)
+{
+ if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev)
+ return false;
+ if (map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs))
+ return true;
+ if (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR)
+ return true;
+ if (flag == F2FS_GET_BLOCK_PRE_DIO)
+ return true;
+ if (flag == F2FS_GET_BLOCK_DIO &&
+ map->m_pblk == NULL_ADDR && blkaddr == NULL_ADDR)
+ return true;
+ return false;
+}
+
/*
* f2fs_map_blocks() tries to find or build mapping relationship which
* maps continuous logical blocks to physical blocks, and return such
@@ -1574,8 +1598,9 @@ next_block:
}
/* use out-place-update for direct IO under LFS mode */
- if (map->m_may_create &&
- (is_hole || (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO))) {
+ if (map->m_may_create && (is_hole ||
+ (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
+ !f2fs_is_pinned_file(inode)))) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto sync_out;
@@ -1628,6 +1653,10 @@ next_block:
goto sync_out;
}
break;
+ case F2FS_GET_BLOCK_DIO:
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = pgofs + 1;
+ break;
default:
/* for defragment case */
if (map->m_next_pgofs)
@@ -1646,19 +1675,15 @@ next_block:
/* reserved delalloc block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
map->m_flags |= F2FS_MAP_DELALLOC;
- map->m_flags |= F2FS_MAP_MAPPED;
+ if (flag != F2FS_GET_BLOCK_DIO || !is_hole)
+ map->m_flags |= F2FS_MAP_MAPPED;
map->m_pblk = blkaddr;
map->m_len = 1;
if (map->m_multidev_dio)
map->m_bdev = FDEV(bidx).bdev;
- } else if ((map->m_pblk != NEW_ADDR &&
- blkaddr == (map->m_pblk + ofs)) ||
- (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
- flag == F2FS_GET_BLOCK_PRE_DIO) {
- if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev)
- goto sync_out;
+ } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) {
ofs++;
map->m_len++;
} else {
@@ -2042,7 +2067,7 @@ static inline loff_t f2fs_readpage_limit(struct inode *inode)
return i_size_read(inode);
}
-static int f2fs_read_single_page(struct inode *inode, struct page *page,
+static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
unsigned nr_pages,
struct f2fs_map_blocks *map,
struct bio **bio_ret,
@@ -2055,9 +2080,10 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
sector_t last_block;
sector_t last_block_in_file;
sector_t block_nr;
+ pgoff_t index = folio_index(folio);
int ret = 0;
- block_in_file = (sector_t)page_index(page);
+ block_in_file = (sector_t)index;
last_block = block_in_file + nr_pages;
last_block_in_file = bytes_to_blks(inode,
f2fs_readpage_limit(inode) + blocksize - 1);
@@ -2088,7 +2114,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
got_it:
if ((map->m_flags & F2FS_MAP_MAPPED)) {
block_nr = map->m_pblk + block_in_file - map->m_lblk;
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
DATA_GENERIC_ENHANCE_READ)) {
@@ -2097,15 +2123,15 @@ got_it:
}
} else {
zero_out:
- zero_user_segment(page, 0, PAGE_SIZE);
- if (f2fs_need_verity(inode, page->index) &&
- !fsverity_verify_page(page)) {
+ folio_zero_segment(folio, 0, folio_size(folio));
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_folio(folio)) {
ret = -EIO;
goto out;
}
- if (!PageUptodate(page))
- SetPageUptodate(page);
- unlock_page(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
goto out;
}
@@ -2115,14 +2141,14 @@ zero_out:
*/
if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
*last_block_in_bio, block_nr) ||
- !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
submit_and_realloc:
f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
- is_readahead ? REQ_RAHEAD : 0, page->index,
+ is_readahead ? REQ_RAHEAD : 0, index,
false);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
@@ -2137,7 +2163,7 @@ submit_and_realloc:
*/
f2fs_wait_on_block_writeback(inode, block_nr);
- if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+ if (!bio_add_folio(bio, folio, blocksize, 0))
goto submit_and_realloc;
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
@@ -2324,7 +2350,7 @@ out:
* Major change was from block_size == page_size in f2fs by default.
*/
static int f2fs_mpage_readpages(struct inode *inode,
- struct readahead_control *rac, struct page *page)
+ struct readahead_control *rac, struct folio *folio)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
@@ -2344,6 +2370,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
+ pgoff_t index;
int ret = 0;
map.m_pblk = 0;
@@ -2357,64 +2384,63 @@ static int f2fs_mpage_readpages(struct inode *inode,
for (; nr_pages; nr_pages--) {
if (rac) {
- page = readahead_page(rac);
- prefetchw(&page->flags);
+ folio = readahead_folio(rac);
+ prefetchw(&folio->flags);
}
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (f2fs_compressed_file(inode)) {
- /* there are remained compressed pages, submit them */
- if (!f2fs_cluster_can_merge_page(&cc, page->index)) {
- ret = f2fs_read_multi_pages(&cc, &bio,
- max_nr_pages,
- &last_block_in_bio,
- rac != NULL, false);
- f2fs_destroy_compress_ctx(&cc, false);
- if (ret)
- goto set_error_page;
- }
- if (cc.cluster_idx == NULL_CLUSTER) {
- if (nc_cluster_idx ==
- page->index >> cc.log_cluster_size) {
- goto read_single_page;
- }
-
- ret = f2fs_is_compressed_cluster(inode, page->index);
- if (ret < 0)
- goto set_error_page;
- else if (!ret) {
- nc_cluster_idx =
- page->index >> cc.log_cluster_size;
- goto read_single_page;
- }
+ index = folio_index(folio);
- nc_cluster_idx = NULL_CLUSTER;
- }
- ret = f2fs_init_compress_ctx(&cc);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (!f2fs_compressed_file(inode))
+ goto read_single_page;
+
+ /* there are remained compressed pages, submit them */
+ if (!f2fs_cluster_can_merge_page(&cc, index)) {
+ ret = f2fs_read_multi_pages(&cc, &bio,
+ max_nr_pages,
+ &last_block_in_bio,
+ rac != NULL, false);
+ f2fs_destroy_compress_ctx(&cc, false);
if (ret)
goto set_error_page;
+ }
+ if (cc.cluster_idx == NULL_CLUSTER) {
+ if (nc_cluster_idx == index >> cc.log_cluster_size)
+ goto read_single_page;
- f2fs_compress_ctx_add_page(&cc, page);
+ ret = f2fs_is_compressed_cluster(inode, index);
+ if (ret < 0)
+ goto set_error_page;
+ else if (!ret) {
+ nc_cluster_idx =
+ index >> cc.log_cluster_size;
+ goto read_single_page;
+ }
- goto next_page;
+ nc_cluster_idx = NULL_CLUSTER;
}
+ ret = f2fs_init_compress_ctx(&cc);
+ if (ret)
+ goto set_error_page;
+
+ f2fs_compress_ctx_add_page(&cc, &folio->page);
+
+ goto next_page;
read_single_page:
#endif
- ret = f2fs_read_single_page(inode, page, max_nr_pages, &map,
+ ret = f2fs_read_single_page(inode, folio, max_nr_pages, &map,
&bio, &last_block_in_bio, rac);
if (ret) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
#endif
- zero_user_segment(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_unlock(folio);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
next_page:
#endif
- if (rac)
- put_page(page);
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
@@ -2436,22 +2462,21 @@ next_page:
static int f2fs_read_data_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
int ret = -EAGAIN;
- trace_f2fs_readpage(page, DATA);
+ trace_f2fs_readpage(folio, DATA);
if (!f2fs_is_compress_backend_ready(inode)) {
- unlock_page(page);
+ folio_unlock(folio);
return -EOPNOTSUPP;
}
/* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
- ret = f2fs_read_inline_data(inode, page);
+ ret = f2fs_read_inline_data(inode, folio);
if (ret == -EAGAIN)
- ret = f2fs_mpage_readpages(inode, NULL, page);
+ ret = f2fs_mpage_readpages(inode, NULL, folio);
return ret;
}
@@ -2685,12 +2710,11 @@ got_it:
if (err) {
if (fscrypt_inode_uses_fs_layer_crypto(inode))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
- if (PageWriteback(page))
- end_page_writeback(page);
+ end_page_writeback(page);
} else {
set_inode_flag(inode, FI_UPDATE_WRITE);
}
- trace_f2fs_do_write_data_page(fio->page, IPU);
+ trace_f2fs_do_write_data_page(page_folio(page), IPU);
return err;
}
@@ -2719,7 +2743,7 @@ got_it:
/* LFS mode write path */
f2fs_outplace_write_data(&dn, fio);
- trace_f2fs_do_write_data_page(page, OPU);
+ trace_f2fs_do_write_data_page(page_folio(page), OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
out_writepage:
f2fs_put_dnode(&dn);
@@ -2766,7 +2790,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.last_block = last_block,
};
- trace_f2fs_writepage(page, DATA);
+ trace_f2fs_writepage(page_folio(page), DATA);
/* we should bypass data pages to proceed the kworker jobs */
if (unlikely(f2fs_cp_error(sbi))) {
@@ -3379,7 +3403,7 @@ restart:
if (f2fs_has_inline_data(inode)) {
if (pos + len <= MAX_INLINE_DATA(inode)) {
- f2fs_do_read_inline_data(page, ipage);
+ f2fs_do_read_inline_data(page_folio(page), ipage);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
set_page_private_inline(ipage);
@@ -3740,7 +3764,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
{
struct inode *inode = mapping->host;
- trace_f2fs_set_page_dirty(&folio->page, DATA);
+ trace_f2fs_set_page_dirty(folio, DATA);
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
@@ -3896,15 +3920,14 @@ static int check_swap_activate(struct swap_info_struct *sis,
struct address_space *mapping = swap_file->f_mapping;
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- sector_t cur_lblock;
- sector_t last_lblock;
- sector_t pblock;
- sector_t lowest_pblock = -1;
- sector_t highest_pblock = 0;
+ block_t cur_lblock;
+ block_t last_lblock;
+ block_t pblock;
+ block_t lowest_pblock = -1;
+ block_t highest_pblock = 0;
int nr_extents = 0;
- unsigned long nr_pblocks;
+ unsigned int nr_pblocks;
unsigned int blks_per_sec = BLKS_PER_SEC(sbi);
- unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1;
unsigned int not_aligned = 0;
int ret = 0;
@@ -3942,8 +3965,8 @@ retry:
pblock = map.m_pblk;
nr_pblocks = map.m_len;
- if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask ||
- nr_pblocks & sec_blks_mask ||
+ if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec ||
+ nr_pblocks % blks_per_sec ||
!f2fs_valid_pinned_area(sbi, pblock)) {
bool last_extent = false;
@@ -4160,7 +4183,8 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = bytes_to_blks(inode, offset);
map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1;
map.m_next_pgofs = &next_pgofs;
- map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
+ inode->i_write_hint);
if (flags & IOMAP_WRITE)
map.m_may_create = true;
@@ -4181,12 +4205,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* We should never see delalloc or compressed extents here based on
* prior flushing and checks.
*/
- if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR))
- return -EINVAL;
if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR))
return -EINVAL;
- if (map.m_pblk != NULL_ADDR) {
+ if (map.m_flags & F2FS_MAP_MAPPED) {
+ if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR))
+ return -EINVAL;
+
iomap->length = blks_to_bytes(inode, map.m_len);
iomap->type = IOMAP_MAPPED;
iomap->flags |= IOMAP_F_MERGED;
@@ -4195,9 +4220,17 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
} else {
if (flags & IOMAP_WRITE)
return -ENOTBLK;
- iomap->length = blks_to_bytes(inode, next_pgofs) -
- iomap->offset;
- iomap->type = IOMAP_HOLE;
+
+ if (map.m_pblk == NULL_ADDR) {
+ iomap->length = blks_to_bytes(inode, next_pgofs) -
+ iomap->offset;
+ iomap->type = IOMAP_HOLE;
+ } else if (map.m_pblk == NEW_ADDR) {
+ iomap->length = blks_to_bytes(inode, map.m_len);
+ iomap->type = IOMAP_UNWRITTEN;
+ } else {
+ f2fs_bug_on(F2FS_I_SB(inode), 1);
+ }
iomap->addr = IOMAP_NULL_ADDR;
}
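
Note: the iomap_begin rework makes F2FS_MAP_MAPPED the authoritative "on disk" signal and, for unmapped ranges, distinguishes a hole (NULL_ADDR) from a reserved-but-unwritten block (NEW_ADDR), so the latter reports IOMAP_UNWRITTEN instead of tripping the old WARN_ON. The decision table as a sketch:

    #include <stdio.h>

    enum { DEMO_HOLE, DEMO_UNWRITTEN, DEMO_MAPPED, DEMO_BUG = -1 };

    static int classify(int mapped, int is_null, int is_new)
    {
        if (mapped)
            return is_new ? DEMO_BUG : DEMO_MAPPED; /* NEW_ADDR can't be mapped */
        if (is_null)
            return DEMO_HOLE;           /* nothing allocated for the range */
        if (is_new)
            return DEMO_UNWRITTEN;      /* reserved/delalloc, not yet written */
        return DEMO_BUG;                /* f2fs_bug_on() territory */
    }

    int main(void)
    {
        printf("%d %d %d\n", classify(1, 0, 0),    /* MAPPED */
               classify(0, 1, 0),                  /* HOLE */
               classify(0, 0, 1));                 /* UNWRITTEN */
        return 0;
    }
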
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index fced2b7652f4..1974b6aff397 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -72,7 +72,7 @@ enum {
struct f2fs_fault_info {
atomic_t inject_ops;
- unsigned int inject_rate;
+ int inject_rate;
unsigned int inject_type;
};
@@ -765,11 +765,6 @@ enum {
#define DEF_DIR_LEVEL 0
-enum {
- GC_FAILURE_PIN,
- MAX_GC_FAILURE
-};
-
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
@@ -816,9 +811,10 @@ struct f2fs_inode_info {
unsigned long i_flags; /* keep an inode flags for ioctl */
unsigned char i_advise; /* use to give file attribute hints */
unsigned char i_dir_level; /* use for dentry level for large dir */
- unsigned int i_current_depth; /* only for directory depth */
- /* for gc failure statistic */
- unsigned int i_gc_failures[MAX_GC_FAILURE];
+ union {
+ unsigned int i_current_depth; /* only for directory depth */
+ unsigned short i_gc_failures; /* for gc failure statistic */
+ };
unsigned int i_pino; /* parent inode number */
umode_t i_acl_mode; /* keep file acl mode temporarily */
@@ -1557,6 +1553,7 @@ struct f2fs_sb_info {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int max_open_zones; /* max open zone resources of the zoned device */
#endif
/* for node-related operations */
@@ -1676,7 +1673,7 @@ struct f2fs_sb_info {
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
- u64 gc_pin_file_threshold;
+ unsigned short gc_pin_file_threshold;
struct f2fs_rwsem pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
@@ -2309,7 +2306,7 @@ static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool);
static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
struct inode *inode, blkcnt_t *count, bool partial)
{
- blkcnt_t diff = 0, release = 0;
+ long long diff = 0, release = 0;
block_t avail_user_block_count;
int ret;
@@ -2329,26 +2326,27 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
percpu_counter_add(&sbi->alloc_valid_block_count, (*count));
spin_lock(&sbi->stat_lock);
- sbi->total_valid_block_count += (block_t)(*count);
- avail_user_block_count = get_available_block_count(sbi, inode, true);
- if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
+ avail_user_block_count = get_available_block_count(sbi, inode, true);
+ diff = (long long)sbi->total_valid_block_count + *count -
+ avail_user_block_count;
+ if (unlikely(diff > 0)) {
if (!partial) {
spin_unlock(&sbi->stat_lock);
+ release = *count;
goto enospc;
}
-
- diff = sbi->total_valid_block_count - avail_user_block_count;
if (diff > *count)
diff = *count;
*count -= diff;
release = diff;
- sbi->total_valid_block_count -= diff;
if (!*count) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
}
+ sbi->total_valid_block_count += (block_t)(*count);
+
spin_unlock(&sbi->stat_lock);
if (unlikely(release)) {
@@ -3132,7 +3130,7 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
static inline void f2fs_i_gc_failures_write(struct inode *inode,
unsigned int count)
{
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count;
+ F2FS_I(inode)->i_gc_failures = count;
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -3497,6 +3495,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
+int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
+ bool readonly);
int f2fs_precache_extents(struct inode *inode);
int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int f2fs_fileattr_set(struct mnt_idmap *idmap,
@@ -3719,6 +3719,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
unsigned char version, bool recover_curseg,
bool recover_newaddr);
+int f2fs_get_segment_temp(int seg_type);
int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
@@ -3741,7 +3742,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
void f2fs_destroy_segment_manager_caches(void);
-int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
+int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint);
+enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
+ enum page_type type, enum temp_type temp);
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
unsigned int segno);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
@@ -4148,10 +4151,10 @@ extern struct kmem_cache *f2fs_inode_entry_slab;
bool f2fs_may_inline_data(struct inode *inode);
bool f2fs_sanity_check_inline_data(struct inode *inode);
bool f2fs_may_inline_dentry(struct inode *inode);
-void f2fs_do_read_inline_data(struct page *page, struct page *ipage);
+void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage);
void f2fs_truncate_inline_inode(struct inode *inode,
struct page *ipage, u64 from);
-int f2fs_read_inline_data(struct inode *inode, struct page *page);
+int f2fs_read_inline_data(struct inode *inode, struct folio *folio);
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
int f2fs_convert_inline_inode(struct inode *inode);
int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry);
@@ -4596,10 +4599,14 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
}
#ifdef CONFIG_F2FS_FAULT_INJECTION
-extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
- unsigned int type);
+extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
+ unsigned long type);
#else
-#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0)
+static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
+ unsigned long rate, unsigned long type)
+{
+ return 0;
+}
#endif
static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
@@ -4657,7 +4664,7 @@ static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi,
page = find_get_page(META_MAPPING(sbi), blkaddr + i);
if (page) {
- if (PageWriteback(page))
+ if (folio_test_writeback(page_folio(page)))
need_submit = true;
f2fs_put_page(page, 0);
}
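
Note: the f2fs.h union works because the two counters are mutually exclusive by inode type: only directories track i_current_depth and only regular files accumulate pin-GC failures, so overlapping them (with the failure counter and gc_pin_file_threshold both narrowed to unsigned short) costs nothing. Demonstrating the saving:

    #include <stdio.h>

    struct before { unsigned int i_current_depth, i_gc_failures; };
    struct after  {
        union {                              /* C11 anonymous union */
            unsigned int i_current_depth;    /* directories only */
            unsigned short i_gc_failures;    /* regular files only */
        };
    };

    int main(void)
    {
        printf("%zu -> %zu bytes\n", sizeof(struct before),
               sizeof(struct after));        /* 8 -> 4 */
        return 0;
    }
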
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 2b65e09822d4..5c0b281a70f3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -58,7 +58,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- bool need_alloc = true;
+ bool need_alloc = !f2fs_is_pinned_file(inode);
int err = 0;
vm_fault_t ret;
@@ -115,19 +115,18 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
goto out_sem;
}
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_alloc) {
/* block allocation */
- set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_block_locked(&dn, page->index);
- }
-
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (!need_alloc) {
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ } else {
err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
f2fs_put_dnode(&dn);
+ if (f2fs_is_pinned_file(inode) &&
+ !__is_valid_data_blkaddr(dn.data_blkaddr))
+ err = -EIO;
}
-#endif
+
if (err) {
unlock_page(page);
goto out_sem;
@@ -834,7 +833,8 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
* for blkzoned device, fallback direct IO to buffered IO, so
* all IOs can be serialized by log-structured write.
*/
- if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE))
+ if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE) &&
+ !f2fs_is_pinned_file(inode))
return true;
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
return true;
@@ -952,9 +952,14 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
ATTR_GID | ATTR_TIMES_SET))))
return -EPERM;
- if ((attr->ia_valid & ATTR_SIZE) &&
- !f2fs_is_compress_backend_ready(inode))
- return -EOPNOTSUPP;
+ if ((attr->ia_valid & ATTR_SIZE)) {
+ if (!f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) &&
+ !IS_ALIGNED(attr->ia_size,
+ F2FS_BLK_TO_BYTES(F2FS_I(inode)->i_cluster_size)))
+ return -EINVAL;
+ }
err = setattr_prepare(idmap, dentry, attr);
if (err)
@@ -1325,6 +1330,9 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
f2fs_put_page(psrc, 1);
return PTR_ERR(pdst);
}
+
+ f2fs_wait_on_page_writeback(pdst, DATA, true, true);
+
memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE);
set_page_dirty(pdst);
set_page_private_gcing(pdst);
@@ -1817,15 +1825,6 @@ static long f2fs_fallocate(struct file *file, int mode,
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
- /*
- * Pinned file should not support partial truncation since the block
- * can be used by applications.
- */
- if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) &&
- (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
- FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)))
- return -EOPNOTSUPP;
-
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_INSERT_RANGE))
@@ -1833,6 +1832,17 @@ static long f2fs_fallocate(struct file *file, int mode,
inode_lock(inode);
+ /*
+ * Pinned file should not support partial truncation since the block
+ * can be used by applications.
+ */
+ if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
ret = file_modified(file);
if (ret)
goto out;
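
Note: moving the pinned/compressed-file rejection below inode_lock in f2fs_fallocate() closes a race: both flags can change on a live inode, so testing them in the unlocked prologue could let a hole-punch through just as the file becomes pinned. The locking discipline in miniature (pthread stands in for inode_lock; names illustrative):

    #include <pthread.h>

    static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
    static int pinned;                   /* may be toggled by other threads */

    static int demo_fallocate(int punches_hole)
    {
        int ret = 0;

        pthread_mutex_lock(&inode_lock);
        if (pinned && punches_hole) {    /* only meaningful under the lock */
            ret = -95;                   /* -EOPNOTSUPP */
            goto out;
        }
        /* ... modify the file ... */
    out:
        pthread_mutex_unlock(&inode_lock);
        return ret;
    }

    int main(void)
    {
        return demo_fallocate(1);        /* 0: file isn't pinned */
    }
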
@@ -2224,34 +2234,13 @@ static int f2fs_ioc_abort_atomic_write(struct file *filp)
return ret;
}
-static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
+ bool readonly)
{
- struct inode *inode = file_inode(filp);
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct super_block *sb = sbi->sb;
- __u32 in;
int ret = 0;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(in, (__u32 __user *)arg))
- return -EFAULT;
-
- if (in != F2FS_GOING_DOWN_FULLSYNC) {
- ret = mnt_want_write_file(filp);
- if (ret) {
- if (ret == -EROFS) {
- ret = 0;
- f2fs_stop_checkpoint(sbi, false,
- STOP_CP_REASON_SHUTDOWN);
- trace_f2fs_shutdown(sbi, in, ret);
- }
- return ret;
- }
- }
-
- switch (in) {
+ switch (flag) {
case F2FS_GOING_DOWN_FULLSYNC:
ret = bdev_freeze(sb->s_bdev);
if (ret)
@@ -2290,6 +2279,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
goto out;
}
+ if (readonly)
+ goto out;
+
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
@@ -2298,10 +2290,44 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
f2fs_update_time(sbi, REQ_TIME);
out:
- if (in != F2FS_GOING_DOWN_FULLSYNC)
- mnt_drop_write_file(filp);
- trace_f2fs_shutdown(sbi, in, ret);
+ trace_f2fs_shutdown(sbi, flag, ret);
+
+ return ret;
+}
+
+static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ __u32 in;
+ int ret;
+ bool need_drop = false, readonly = false;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(in, (__u32 __user *)arg))
+ return -EFAULT;
+
+ if (in != F2FS_GOING_DOWN_FULLSYNC) {
+ ret = mnt_want_write_file(filp);
+ if (ret) {
+ if (ret != -EROFS)
+ return ret;
+
+ /* fallback to nosync shutdown for readonly fs */
+ in = F2FS_GOING_DOWN_NOSYNC;
+ readonly = true;
+ } else {
+ need_drop = true;
+ }
+ }
+
+ ret = f2fs_do_shutdown(sbi, in, readonly);
+
+ if (need_drop)
+ mnt_drop_write_file(filp);
return ret;
}
@@ -2354,13 +2380,14 @@ static bool uuid_is_nonzero(__u8 u[16])
static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+ int ret;
if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode)))
return -EOPNOTSUPP;
+ ret = fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-
- return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
+ return ret;
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
@@ -2607,12 +2634,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
bool fragmented = false;
int err;
- pg_start = range->start >> PAGE_SHIFT;
- pg_end = (range->start + range->len) >> PAGE_SHIFT;
-
f2fs_balance_fs(sbi, true);
inode_lock(inode);
+ pg_start = range->start >> PAGE_SHIFT;
+ pg_end = min_t(pgoff_t,
+ (range->start + range->len) >> PAGE_SHIFT,
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
err = -EINVAL;
@@ -2627,8 +2655,9 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
}
/* writeback all dirty pages in the range */
- err = filemap_write_and_wait_range(inode->i_mapping, range->start,
- range->start + range->len - 1);
+ err = filemap_write_and_wait_range(inode->i_mapping,
+ pg_start << PAGE_SHIFT,
+ (pg_end << PAGE_SHIFT) - 1);
if (err)
goto out;
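
Note: f2fs_defragment_range() now computes the page range under inode_lock and clamps pg_end to EOF, so a caller-supplied length past i_size neither walks nonexistent pages nor writes back beyond the file. The clamp by itself:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned long long start = 0, len = 1ULL << 30;  /* caller asks 1GiB */
        unsigned long long i_size = 5000;                /* file is ~2 pages */
        unsigned long long pg_end = (start + len) >> PAGE_SHIFT;
        unsigned long long cap = DIV_ROUND_UP(i_size, PAGE_SIZE);

        if (pg_end > cap)                 /* min_t(pgoff_t, ...) above */
            pg_end = cap;
        printf("pg_end=%llu\n", pg_end);  /* 2, not 262144 */
        return 0;
    }
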
@@ -2786,7 +2815,8 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
err = f2fs_defragment_range(sbi, filp, &range);
mnt_drop_write_file(filp);
- f2fs_update_time(sbi, REQ_TIME);
+ if (range.len)
+ f2fs_update_time(sbi, REQ_TIME);
if (err < 0)
return err;
@@ -2837,7 +2867,8 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
goto out;
}
- if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) {
+ if (f2fs_compressed_file(src) || f2fs_compressed_file(dst) ||
+ f2fs_is_pinned_file(src) || f2fs_is_pinned_file(dst)) {
ret = -EOPNOTSUPP;
goto out_unlock;
}
@@ -3189,18 +3220,17 @@ int f2fs_pin_file_control(struct inode *inode, bool inc)
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- /* Use i_gc_failures for normal file as a risk signal. */
- if (inc)
- f2fs_i_gc_failures_write(inode,
- fi->i_gc_failures[GC_FAILURE_PIN] + 1);
-
- if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) {
+ if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) {
f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials",
- __func__, inode->i_ino,
- fi->i_gc_failures[GC_FAILURE_PIN]);
+ __func__, inode->i_ino, fi->i_gc_failures);
clear_inode_flag(inode, FI_PIN_FILE);
return -EAGAIN;
}
+
+ /* Use i_gc_failures for normal file as a risk signal. */
+ if (inc)
+ f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1);
+
return 0;
}
@@ -3234,7 +3264,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
goto done;
}
- if (f2fs_sb_has_blkzoned(sbi) && F2FS_HAS_BLOCKS(inode)) {
+ if (F2FS_HAS_BLOCKS(inode)) {
ret = -EFBIG;
goto out;
}
@@ -3261,7 +3291,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
}
set_inode_flag(inode, FI_PIN_FILE);
- ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
+ ret = F2FS_I(inode)->i_gc_failures;
done:
f2fs_update_time(sbi, REQ_TIME);
out:
@@ -3276,7 +3306,7 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg)
__u32 pin = 0;
if (is_inode_flag_set(inode, FI_PIN_FILE))
- pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
+ pin = F2FS_I(inode)->i_gc_failures;
return put_user(pin, (u32 __user *)arg);
}
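
With i_gc_failures now a flat counter, both pin-file ioctls report it directly. A hedged userspace sketch (the ioctls are f2fs uapi; the header path is an assumption):

    /* Pin a file against GC migration, then read back the kernel's
     * GC-failure count for it (nonzero only while pinned). */
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>
    #include <linux/f2fs.h>	/* assumed header for F2FS_IOC_{SET,GET}_PIN_FILE */

    int pin_and_query(int fd)
    {
    	__u32 set = 1, pin = 0;

    	if (ioctl(fd, F2FS_IOC_SET_PIN_FILE, &set) < 0)
    		return -1;
    	if (ioctl(fd, F2FS_IOC_GET_PIN_FILE, &pin) < 0)
    		return -1;
    	printf("pinned, gc failures so far: %u\n", pin);
    	return 0;
    }
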
@@ -3522,9 +3552,6 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (!f2fs_sb_has_compression(sbi))
return -EOPNOTSUPP;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
if (f2fs_readonly(sbi->sb))
return -EROFS;
@@ -3543,7 +3570,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
goto out;
}
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto out;
}
@@ -3570,9 +3598,12 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
struct dnode_of_data dn;
pgoff_t end_offset, count;
+ f2fs_lock_op(sbi);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
if (ret) {
+ f2fs_unlock_op(sbi);
if (ret == -ENOENT) {
page_idx = f2fs_get_next_page_offset(&dn,
page_idx);
@@ -3590,6 +3621,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
if (ret < 0)
break;
@@ -3600,6 +3633,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
out:
+ if (released_blocks)
+ f2fs_update_time(sbi, REQ_TIME);
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -3641,7 +3676,8 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
while (count) {
int compr_blocks = 0;
- blkcnt_t reserved;
+ blkcnt_t reserved = 0;
+ blkcnt_t to_reserved;
int ret;
for (i = 0; i < cluster_size; i++) {
@@ -3661,20 +3697,26 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
* fails in release_compress_blocks(), so NEW_ADDR
* is a possible case.
*/
- if (blkaddr == NEW_ADDR ||
- __is_valid_data_blkaddr(blkaddr)) {
+ if (blkaddr == NEW_ADDR) {
+ reserved++;
+ continue;
+ }
+ if (__is_valid_data_blkaddr(blkaddr)) {
compr_blocks++;
continue;
}
}
- reserved = cluster_size - compr_blocks;
+ to_reserved = cluster_size - compr_blocks - reserved;
/* for the case all blocks in cluster were reserved */
- if (reserved == 1)
+ if (to_reserved == 1) {
+ dn->ofs_in_node += cluster_size;
goto next;
+ }
- ret = inc_valid_block_count(sbi, dn->inode, &reserved, false);
+ ret = inc_valid_block_count(sbi, dn->inode,
+ &to_reserved, false);
if (unlikely(ret))
return ret;
@@ -3685,7 +3727,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true);
- *reserved_blocks += reserved;
+ *reserved_blocks += to_reserved;
next:
count -= cluster_size;
}
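
A worked instance of the new accounting may help. Assume an illustrative cluster_size of 4 where one slot holds a valid compressed block and one slot is already NEW_ADDR from an earlier partially-failed release:

    compr_blocks = 1                                /* valid data blkaddr    */
    reserved     = 1                                /* pre-existing NEW_ADDR */
    to_reserved  = cluster_size - compr_blocks - reserved
                 = 4 - 1 - 1 = 2                    /* slots still missing   */

The old code folded NEW_ADDR slots into compr_blocks, so f2fs_i_compr_blocks_update() was fed an inflated count; splitting them out means inc_valid_block_count() is asked only for the slots that still need backing, and to_reserved == 1 now identifies a fully-reserved cluster (only the marker slot remains), which is also why ofs_in_node must be advanced before skipping.
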
@@ -3704,9 +3746,6 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
if (!f2fs_sb_has_compression(sbi))
return -EOPNOTSUPP;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
if (f2fs_readonly(sbi->sb))
return -EROFS;
@@ -3718,7 +3757,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
inode_lock(inode);
- if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto unlock_inode;
}
@@ -3735,9 +3775,12 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
struct dnode_of_data dn;
pgoff_t end_offset, count;
+ f2fs_lock_op(sbi);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
if (ret) {
+ f2fs_unlock_op(sbi);
if (ret == -ENOENT) {
page_idx = f2fs_get_next_page_offset(&dn,
page_idx);
@@ -3755,6 +3798,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
if (ret < 0)
break;
@@ -3770,6 +3815,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
f2fs_mark_inode_dirty_sync(inode, true);
}
unlock_inode:
+ if (reserved_blocks)
+ f2fs_update_time(sbi, REQ_TIME);
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -3778,7 +3825,7 @@ unlock_inode:
} else if (reserved_blocks &&
atomic_read(&F2FS_I(inode)->i_compr_blocks)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx "
+ f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%lx "
"iblocks=%llu, reserved=%u, compr_blocks=%u, "
"run fsck to fix.",
__func__, inode->i_ino, inode->i_blocks,
@@ -3966,6 +4013,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
if (len)
ret = f2fs_secure_erase(prev_bdev, inode, prev_index,
prev_block, len, range.flags);
+ f2fs_update_time(sbi, REQ_TIME);
out:
filemap_invalidate_unlock(mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -4119,9 +4167,6 @@ static int f2fs_ioc_decompress_file(struct file *filp)
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
f2fs_balance_fs(sbi, true);
file_start_write(filp);
@@ -4132,7 +4177,8 @@ static int f2fs_ioc_decompress_file(struct file *filp)
goto out;
}
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto out;
}
@@ -4175,6 +4221,7 @@ static int f2fs_ioc_decompress_file(struct file *filp)
if (ret)
f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.",
__func__, ret);
+ f2fs_update_time(sbi, REQ_TIME);
out:
inode_unlock(inode);
file_end_write(filp);
@@ -4197,9 +4244,6 @@ static int f2fs_ioc_compress_file(struct file *filp)
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- if (!f2fs_compressed_file(inode))
- return -EINVAL;
-
f2fs_balance_fs(sbi, true);
file_start_write(filp);
@@ -4210,7 +4254,8 @@ static int f2fs_ioc_compress_file(struct file *filp)
goto out;
}
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (!f2fs_compressed_file(inode) ||
+ is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto out;
}
@@ -4254,6 +4299,7 @@ static int f2fs_ioc_compress_file(struct file *filp)
if (ret)
f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). Please delete the file.",
__func__, ret);
+ f2fs_update_time(sbi, REQ_TIME);
out:
inode_unlock(inode);
file_end_write(filp);
@@ -4612,7 +4658,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
map.m_may_create = true;
if (dio) {
- map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi,
+ inode->i_write_hint);
flag = F2FS_GET_BLOCK_PRE_DIO;
} else {
map.m_seg_type = NO_CHECK_TYPE;
@@ -4660,8 +4707,21 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
return 0;
}
+static void f2fs_dio_write_submit_io(const struct iomap_iter *iter,
+ struct bio *bio, loff_t file_offset)
+{
+ struct inode *inode = iter->inode;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint);
+ enum temp_type temp = f2fs_get_segment_temp(seg_type);
+
+ bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp);
+ submit_bio(bio);
+}
+
static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = {
- .end_io = f2fs_dio_write_end_io,
+ .end_io = f2fs_dio_write_end_io,
+ .submit_io = f2fs_dio_write_submit_io,
};
static void f2fs_flush_buffered_write(struct address_space *mapping,
@@ -4798,6 +4858,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
bool dio;
bool may_need_sync = true;
int preallocated;
+ const loff_t pos = iocb->ki_pos;
+ const ssize_t count = iov_iter_count(from);
ssize_t ret;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
@@ -4819,6 +4881,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode_lock(inode);
}
+ if (f2fs_is_pinned_file(inode) &&
+ !f2fs_overwrite_io(inode, pos, count)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
ret = f2fs_write_checks(iocb, from);
if (ret <= 0)
goto out_unlock;
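
Since a pinned file must keep its on-disk block addresses, the new check rejects any write that is not a pure overwrite before f2fs_write_checks() runs. A hedged userspace illustration of the visible behavior (header path assumed):

    #include <errno.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>
    #include <linux/f2fs.h>	/* assumed header for F2FS_IOC_SET_PIN_FILE */

    /* Overwriting allocated blocks of a pinned file still works; a
     * write that would allocate (extend, or fill a hole) now fails
     * with EIO instead of silently relocating pinned blocks. */
    static int demo_pinned_write(int fd, const char *buf, size_t len, off_t i_size)
    {
    	__u32 pin = 1;

    	if (ioctl(fd, F2FS_IOC_SET_PIN_FILE, &pin) < 0)
    		return -errno;
    	if (pwrite(fd, buf, len, 0) < 0)	/* in-place: allowed */
    		return -errno;
    	if (pwrite(fd, buf, len, i_size) < 0 && errno == EIO)
    		return 0;			/* append: expected EIO */
    	return -1;
    }
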
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 8852814dab7f..6066c6eecf41 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1434,7 +1434,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
if (gc_type == BG_GC) {
- if (PageWriteback(page)) {
+ if (folio_test_writeback(page_folio(page))) {
err = -EAGAIN;
goto out;
}
@@ -1554,10 +1554,15 @@ next_step:
int err;
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode) || is_bad_inode(inode) ||
- special_file(inode->i_mode))
+ if (IS_ERR(inode))
continue;
+ if (is_bad_inode(inode) ||
+ special_file(inode->i_mode)) {
+ iput(inode);
+ continue;
+ }
+
err = f2fs_gc_pinned_control(inode, gc_type, segno);
if (err == -EAGAIN) {
iput(inode);
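
The split fixes an inode reference leak: once f2fs_iget() has succeeded, every early-continue path in the victim scan owns a reference and must drop it. A minimal sketch of the pattern the fix restores (should_skip() is a hypothetical stand-in for the bad-inode/special-file test):

    inode = f2fs_iget(sb, dni.ino);
    if (IS_ERR(inode))
    	continue;		/* lookup failed: no reference taken */
    if (should_skip(inode)) {	/* hypothetical predicate */
    	iput(inode);		/* reference held: must be released */
    	continue;
    }
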
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 9c0d06c4d19a..a8ea3301b815 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -26,6 +26,7 @@
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
#define DEF_GC_FAILED_PINNED_FILES 2048
+#define MAX_GC_FAILED_PINNED_FILES USHRT_MAX
/* Search max. number of dirty segments to select a victim segment */
#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index ac00423f117b..7638d0d7b7ee 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -61,22 +61,22 @@ bool f2fs_may_inline_dentry(struct inode *inode)
return true;
}
-void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
+void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return;
- f2fs_bug_on(F2FS_P_SB(page), page->index);
+ f2fs_bug_on(F2FS_I_SB(inode), folio_index(folio));
- zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE);
+ folio_zero_segment(folio, MAX_INLINE_DATA(inode), folio_size(folio));
/* Copy the whole inline data block */
- memcpy_to_page(page, 0, inline_data_addr(inode, ipage),
+ memcpy_to_folio(folio, 0, inline_data_addr(inode, ipage),
MAX_INLINE_DATA(inode));
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
}
void f2fs_truncate_inline_inode(struct inode *inode,
@@ -97,13 +97,13 @@ void f2fs_truncate_inline_inode(struct inode *inode,
clear_inode_flag(inode, FI_DATA_EXIST);
}
-int f2fs_read_inline_data(struct inode *inode, struct page *page)
+int f2fs_read_inline_data(struct inode *inode, struct folio *folio)
{
struct page *ipage;
ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
- unlock_page(page);
+ folio_unlock(folio);
return PTR_ERR(ipage);
}
@@ -112,15 +112,15 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
return -EAGAIN;
}
- if (page->index)
- zero_user_segment(page, 0, PAGE_SIZE);
+ if (folio_index(folio))
+ folio_zero_segment(folio, 0, folio_size(folio));
else
- f2fs_do_read_inline_data(page, ipage);
+ f2fs_do_read_inline_data(folio, ipage);
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
f2fs_put_page(ipage, 1);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
}
@@ -164,9 +164,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
return -EFSCORRUPTED;
}
- f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
+ f2fs_bug_on(F2FS_P_SB(page), folio_test_writeback(page_folio(page)));
- f2fs_do_read_inline_data(page, dn->inode_page);
+ f2fs_do_read_inline_data(page_folio(page), dn->inode_page);
set_page_dirty(page);
/* clear dirty state */
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index c26effdce9aa..005dde72aff3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -161,7 +161,8 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
if (!f2fs_enable_inode_chksum(sbi, page))
#else
if (!f2fs_enable_inode_chksum(sbi, page) ||
- PageDirty(page) || PageWriteback(page))
+ PageDirty(page) ||
+ folio_test_writeback(page_folio(page)))
#endif
return true;
@@ -361,6 +362,12 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (fi->i_xattr_nid && f2fs_check_nid_range(sbi, fi->i_xattr_nid)) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_xattr_nid: %u, run fsck to fix.",
+ __func__, inode->i_ino, fi->i_xattr_nid);
+ return false;
+ }
+
return true;
}
@@ -408,8 +415,7 @@ static int do_read_inode(struct inode *inode)
if (S_ISDIR(inode->i_mode))
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
else if (S_ISREG(inode->i_mode))
- fi->i_gc_failures[GC_FAILURE_PIN] =
- le16_to_cpu(ri->i_gc_failures);
+ fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures);
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
fi->i_flags = le32_to_cpu(ri->i_flags);
if (S_ISREG(inode->i_mode))
@@ -679,8 +685,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
ri->i_current_depth =
cpu_to_le32(F2FS_I(inode)->i_current_depth);
else if (S_ISREG(inode->i_mode))
- ri->i_gc_failures =
- cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]);
+ ri->i_gc_failures = cpu_to_le16(F2FS_I(inode)->i_gc_failures);
ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
@@ -804,6 +809,7 @@ void f2fs_evict_inode(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
nid_t xnid = fi->i_xattr_nid;
int err = 0;
+ bool freeze_protected = false;
f2fs_abort_atomic_write(inode, true);
@@ -843,8 +849,10 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
- if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) {
sb_start_intwrite(inode->i_sb);
+ freeze_protected = true;
+ }
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
@@ -887,7 +895,7 @@ retry:
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
}
- if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ if (freeze_protected)
sb_end_intwrite(inode->i_sb);
no_delete:
dquot_drop(inode);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b3de6d6cdb02..b72ef96f7e33 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1187,7 +1187,17 @@ skip_partial:
default:
BUG();
}
- if (err < 0 && err != -ENOENT)
+ if (err == -ENOENT) {
+ set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ f2fs_err_ratelimited(sbi,
+ "truncate node fail, ino:%lu, nid:%u, "
+ "offset[0]:%d, offset[1]:%d, nofs:%d",
+ inode->i_ino, dn.nid, offset[0],
+ offset[1], nofs);
+ err = 0;
+ }
+ if (err < 0)
goto fail;
if (offset[1] == 0 &&
ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
@@ -1319,6 +1329,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
}
if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
err = -EFSCORRUPTED;
+ dec_valid_node_count(sbi, dn->inode, !ofs);
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
@@ -1345,7 +1356,6 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
if (ofs == 0)
inc_valid_inode_count(sbi);
return page;
-
fail:
clear_node_page_dirty(page);
f2fs_put_page(page, 1);
@@ -1614,7 +1624,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
};
unsigned int seq;
- trace_f2fs_writepage(page, NODE);
+ trace_f2fs_writepage(page_folio(page), NODE);
if (unlikely(f2fs_cp_error(sbi))) {
/* keep node pages in remount-ro mode */
@@ -1733,7 +1743,7 @@ int f2fs_move_node_page(struct page *node_page, int gc_type)
goto release_page;
} else {
/* set page dirty and write it */
- if (!PageWriteback(node_page))
+ if (!folio_test_writeback(page_folio(node_page)))
set_page_dirty(node_page);
}
out_page:
@@ -2161,7 +2171,7 @@ skip_write:
static bool f2fs_dirty_node_folio(struct address_space *mapping,
struct folio *folio)
{
- trace_f2fs_set_page_dirty(&folio->page, NODE);
+ trace_f2fs_set_page_dirty(folio, NODE);
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index e7bf15b8240a..496aee53c38a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -330,8 +330,7 @@ static int recover_inode(struct inode *inode, struct page *page)
F2FS_I(inode)->i_advise = raw->i_advise;
F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
f2fs_set_inode_flags(inode);
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] =
- le16_to_cpu(raw->i_gc_failures);
+ F2FS_I(inode)->i_gc_failures = le16_to_cpu(raw->i_gc_failures);
recover_inline_flags(inode, raw);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4fd76e867e0a..a0ce3d080f80 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -771,8 +771,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
block_t valid_blocks =
get_valid_blocks(sbi, segno, true);
- f2fs_bug_on(sbi, unlikely(!valid_blocks ||
- valid_blocks == CAP_BLKS_PER_SEC(sbi)));
+ f2fs_bug_on(sbi,
+ (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ !valid_blocks) ||
+ valid_blocks == CAP_BLKS_PER_SEC(sbi));
if (!IS_CURSEC(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
@@ -1109,9 +1111,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
dc->error = 0;
if (dc->error)
- printk_ratelimited(
- "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d",
- KERN_INFO, sbi->sb->s_id,
+ f2fs_info_ratelimited(sbi,
+ "Issue discard(%u, %u, %u) failed, ret: %d",
dc->di.lstart, dc->di.start, dc->di.len, dc->error);
__detach_discard_cmd(dcc, dc);
}
@@ -2645,7 +2646,7 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
}
static int is_next_segment_free(struct f2fs_sb_info *sbi,
- struct curseg_info *curseg, int type)
+ struct curseg_info *curseg)
{
unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
@@ -3073,8 +3074,7 @@ static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
curseg->seg_type == CURSEG_WARM_NODE)
return true;
- if (curseg->alloc_type == LFS &&
- is_next_segment_free(sbi, curseg, type) &&
+ if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg) &&
likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return true;
if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0))
@@ -3352,8 +3352,14 @@ out:
return err;
}
-int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
+int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint)
{
+ if (F2FS_OPTION(sbi).active_logs == 2)
+ return CURSEG_HOT_DATA;
+ else if (F2FS_OPTION(sbi).active_logs == 4)
+ return CURSEG_COLD_DATA;
+
+	/* active_logs == 6 */
switch (hint) {
case WRITE_LIFE_SHORT:
return CURSEG_HOT_DATA;
@@ -3364,6 +3370,65 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
}
}
+/*
+ * This returns the write hint for each segment type. These hints are
+ * passed down to the block layer as below by default.
+ *
+ * User F2FS Block
+ * ---- ---- -----
+ * META WRITE_LIFE_NONE|REQ_META
+ * HOT_NODE WRITE_LIFE_NONE
+ * WARM_NODE WRITE_LIFE_MEDIUM
+ * COLD_NODE WRITE_LIFE_LONG
+ * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
+ * extension list " "
+ *
+ * -- buffered io
+ * COLD_DATA WRITE_LIFE_EXTREME
+ * HOT_DATA WRITE_LIFE_SHORT
+ * WARM_DATA WRITE_LIFE_NOT_SET
+ *
+ * -- direct io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " WRITE_LIFE_NONE
+ * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
+ * WRITE_LIFE_LONG " WRITE_LIFE_LONG
+ */
+enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
+ enum page_type type, enum temp_type temp)
+{
+ switch (type) {
+ case DATA:
+ switch (temp) {
+ case WARM:
+ return WRITE_LIFE_NOT_SET;
+ case HOT:
+ return WRITE_LIFE_SHORT;
+ case COLD:
+ return WRITE_LIFE_EXTREME;
+ default:
+ return WRITE_LIFE_NONE;
+ }
+ case NODE:
+ switch (temp) {
+ case WARM:
+ return WRITE_LIFE_MEDIUM;
+ case HOT:
+ return WRITE_LIFE_NONE;
+ case COLD:
+ return WRITE_LIFE_LONG;
+ default:
+ return WRITE_LIFE_NONE;
+ }
+ case META:
+ return WRITE_LIFE_NONE;
+ default:
+ return WRITE_LIFE_NONE;
+ }
+}
+
static int __get_segment_type_2(struct f2fs_io_info *fio)
{
if (fio->type == DATA)
@@ -3434,7 +3499,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
is_inode_flag_set(inode, FI_HOT_DATA) ||
f2fs_is_cow_file(inode))
return CURSEG_HOT_DATA;
- return f2fs_rw_hint_to_seg_type(inode->i_write_hint);
+ return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
+ inode->i_write_hint);
} else {
if (IS_DNODE(fio->page))
return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
@@ -3443,6 +3509,15 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
}
}
+int f2fs_get_segment_temp(int seg_type)
+{
+ if (IS_HOT(seg_type))
+ return HOT;
+ else if (IS_WARM(seg_type))
+ return WARM;
+ return COLD;
+}
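
Taken together, the three helpers form the hint pipeline that the new DIO submit hook (f2fs_dio_write_submit_io() in the file.c hunk earlier in this series) runs per bio. A sketch of the composition, using only names from this series:

    /* user hint -> log segment -> temperature -> on-wire bio hint */
    int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint);
    enum temp_type temp = f2fs_get_segment_temp(seg_type);

    bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp);

    /* e.g. with 6 active logs, WRITE_LIFE_SHORT -> CURSEG_HOT_DATA ->
     * HOT -> WRITE_LIFE_SHORT (the user hint survives); with 2 active
     * logs everything lands in HOT_DATA, so the bio always carries
     * WRITE_LIFE_SHORT regardless of the user hint. */
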
+
static int __get_segment_type(struct f2fs_io_info *fio)
{
int type = 0;
@@ -3461,12 +3536,8 @@ static int __get_segment_type(struct f2fs_io_info *fio)
f2fs_bug_on(fio->sbi, true);
}
- if (IS_HOT(type))
- fio->temp = HOT;
- else if (IS_WARM(type))
- fio->temp = WARM;
- else
- fio->temp = COLD;
+ fio->temp = f2fs_get_segment_temp(type);
+
return type;
}
@@ -3559,6 +3630,8 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (segment_full) {
if (type == CURSEG_COLD_DATA_PINNED &&
!((curseg->segno + 1) % sbi->segs_per_sec)) {
+ write_sum_page(sbi, curseg->sum_blk,
+ GET_SUM_BLOCK(sbi, curseg->segno));
reset_curseg_fields(curseg);
goto skip_new_segment;
}
@@ -3612,13 +3685,13 @@ skip_new_segment:
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
return 0;
+
out_err:
*new_blkaddr = NULL_ADDR;
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
return ret;
-
}
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3660,8 +3733,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
&fio->new_blkaddr, sum, type, fio)) {
if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
- if (PageWriteback(fio->page))
- end_page_writeback(fio->page);
+ end_page_writeback(fio->page);
if (f2fs_in_warm_node_list(fio->sbi, fio->page))
f2fs_del_fsync_node_entry(fio->sbi, fio->page);
goto out;
@@ -3904,7 +3976,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered, bool locked)
{
- if (PageWriteback(page)) {
+ if (folio_test_writeback(page_folio(page))) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
/* submit cached LFS IO */
@@ -3913,7 +3985,8 @@ void f2fs_wait_on_page_writeback(struct page *page,
f2fs_submit_merged_ipu_write(sbi, NULL, page);
if (ordered) {
wait_on_page_writeback(page);
- f2fs_bug_on(sbi, locked && PageWriteback(page));
+ f2fs_bug_on(sbi, locked &&
+ folio_test_writeback(page_folio(page)));
} else {
wait_for_stable_page(page);
}
@@ -4959,17 +5032,6 @@ out:
}
#ifdef CONFIG_BLK_DEV_ZONED
-static const char *f2fs_zone_status[BLK_ZONE_COND_OFFLINE + 1] = {
- [BLK_ZONE_COND_NOT_WP] = "NOT_WP",
- [BLK_ZONE_COND_EMPTY] = "EMPTY",
- [BLK_ZONE_COND_IMP_OPEN] = "IMPLICIT_OPEN",
- [BLK_ZONE_COND_EXP_OPEN] = "EXPLICIT_OPEN",
- [BLK_ZONE_COND_CLOSED] = "CLOSED",
- [BLK_ZONE_COND_READONLY] = "READONLY",
- [BLK_ZONE_COND_FULL] = "FULL",
- [BLK_ZONE_COND_OFFLINE] = "OFFLINE",
-};
-
static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
struct f2fs_dev_info *fdev,
struct blk_zone *zone)
@@ -5000,7 +5062,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]",
zone_segno, valid_block_cnt,
- f2fs_zone_status[zone->cond]);
+ blk_zone_cond_str(zone->cond));
return 0;
}
@@ -5011,7 +5073,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
if (!valid_block_cnt) {
f2fs_notice(sbi, "Zone without valid block has non-zero write "
"pointer. Reset the write pointer: cond[%s]",
- f2fs_zone_status[zone->cond]);
+ blk_zone_cond_str(zone->cond));
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
@@ -5029,7 +5091,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
*/
f2fs_notice(sbi, "Valid blocks are not aligned with write "
"pointer: valid block[0x%x,0x%x] cond[%s]",
- zone_segno, valid_block_cnt, f2fs_zone_status[zone->cond]);
+ zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond));
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a4bc26dfdb1a..1f1b3647a998 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -66,21 +66,31 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_NO_SEGMENT] = "no free segment",
};
-void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
- unsigned int type)
+int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
+ unsigned long type)
{
struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
if (rate) {
+ if (rate > INT_MAX)
+ return -EINVAL;
atomic_set(&ffi->inject_ops, 0);
- ffi->inject_rate = rate;
+ ffi->inject_rate = (int)rate;
}
- if (type)
- ffi->inject_type = type;
+ if (type) {
+ if (type >= BIT(FAULT_MAX))
+ return -EINVAL;
+ ffi->inject_type = (unsigned int)type;
+ }
if (!rate && !type)
memset(ffi, 0, sizeof(struct f2fs_fault_info));
+ else
+ f2fs_info(sbi,
+ "build fault injection attr: rate: %lu, type: 0x%lx",
+ rate, type);
+ return 0;
}
#endif
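
The validator now bounds both knobs before touching the live fault_info, so the mount-option and sysfs callers below can simply propagate its result. The accepted ranges, as the code reads:

    rate: 1..INT_MAX               (0 leaves the current rate untouched)
    type: 1..BIT(FAULT_MAX) - 1    (a bitmask of FAULT_* indices)
    rate == 0 && type == 0         resets fault injection entirely
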
@@ -886,14 +896,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_fault_injection:
if (args->from && match_int(args, &arg))
return -EINVAL;
- f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE);
+ if (f2fs_build_fault_attr(sbi, arg,
+ F2FS_ALL_FAULT_TYPE))
+ return -EINVAL;
set_opt(sbi, FAULT_INJECTION);
break;
case Opt_fault_type:
if (args->from && match_int(args, &arg))
return -EINVAL;
- f2fs_build_fault_attr(sbi, 0, arg);
+ if (f2fs_build_fault_attr(sbi, 0, arg))
+ return -EINVAL;
set_opt(sbi, FAULT_INJECTION);
break;
#else
@@ -2132,8 +2145,6 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL;
F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE;
- sbi->sb->s_flags &= ~SB_INLINECRYPT;
-
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
@@ -2326,6 +2337,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto restore_opts;
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_has_blkzoned(sbi) &&
+ sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
+ f2fs_err(sbi,
+ "zoned: max open zones %u is too small, need at least %u open zones",
+ sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
+ err = -EINVAL;
+ goto restore_opts;
+ }
+#endif
+
/* flush outstanding errors before changing fs state */
flush_work(&sbi->s_error_work);
@@ -2547,6 +2569,11 @@ restore_opts:
return err;
}
+static void f2fs_shutdown(struct super_block *sb)
+{
+ f2fs_do_shutdown(F2FS_SB(sb), F2FS_GOING_DOWN_NOSYNC, false);
+}
+
#ifdef CONFIG_QUOTA
static bool f2fs_need_recovery(struct f2fs_sb_info *sbi)
{
@@ -3146,6 +3173,7 @@ static const struct super_operations f2fs_sops = {
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
.remount_fs = f2fs_remount,
+ .shutdown = f2fs_shutdown,
};
#ifdef CONFIG_FS_ENCRYPTION
@@ -3441,7 +3469,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
}
}
- /* Currently, support only 4KB block size */
+ /* only support block_size equals to PAGE_SIZE */
if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) {
f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u",
le32_to_cpu(raw_super->log_blocksize),
@@ -3862,11 +3890,24 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
sector_t nr_sectors = bdev_nr_sectors(bdev);
struct f2fs_report_zones_args rep_zone_arg;
u64 zone_sectors;
+ unsigned int max_open_zones;
int ret;
if (!f2fs_sb_has_blkzoned(sbi))
return 0;
+ if (bdev_is_zoned(FDEV(devi).bdev)) {
+ max_open_zones = bdev_max_open_zones(bdev);
+ if (max_open_zones && (max_open_zones < sbi->max_open_zones))
+ sbi->max_open_zones = max_open_zones;
+ if (sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
+ f2fs_err(sbi,
+ "zoned: max open zones %u is too small, need at least %u open zones",
+ sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
+ return -EINVAL;
+ }
+ }
+
zone_sectors = bdev_zone_sectors(bdev);
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
SECTOR_TO_BLOCK(zone_sectors))
@@ -4131,9 +4172,15 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason,
if (shutdown)
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
- /* continue filesystem operators if errors=continue */
- if (continue_fs || f2fs_readonly(sb))
+ /*
+	 * Continue filesystem operations if errors=continue. Do not set
+	 * RO on shutdown, since RO bypasses thaw_super, which can hang
+	 * the system.
+ */
+ if (continue_fs || f2fs_readonly(sb) || shutdown) {
+ f2fs_warn(sbi, "Stopped filesystem due to reason: %d", reason);
return;
+ }
f2fs_warn(sbi, "Remounting filesystem read-only");
/*
@@ -4180,6 +4227,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev);
sbi->aligned_blksize = true;
+#ifdef CONFIG_BLK_DEV_ZONED
+ sbi->max_open_zones = UINT_MAX;
+#endif
for (i = 0; i < max_devices; i++) {
if (i == 0)
@@ -4894,12 +4944,6 @@ static int __init init_f2fs_fs(void)
{
int err;
- if (PAGE_SIZE != F2FS_BLKSIZE) {
- printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n",
- PAGE_SIZE, F2FS_BLKSIZE);
- return -EINVAL;
- }
-
err = init_inodecache();
if (err)
goto fail;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index a568ce96cf56..09d3ecfaa4f1 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -484,10 +484,16 @@ out:
if (ret < 0)
return ret;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX))
- return -EINVAL;
- if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
- return -EINVAL;
+ if (a->struct_type == FAULT_INFO_TYPE) {
+ if (f2fs_build_fault_attr(sbi, 0, t))
+ return -EINVAL;
+ return count;
+ }
+ if (a->struct_type == FAULT_INFO_RATE) {
+ if (f2fs_build_fault_attr(sbi, t, 0))
+ return -EINVAL;
+ return count;
+ }
#endif
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
@@ -675,6 +681,13 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "gc_pin_file_threshold")) {
+ if (t > MAX_GC_FAILED_PINNED_FILES)
+ return -EINVAL;
+ sbi->gc_pin_file_threshold = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
if (t != 0)
return -EINVAL;
diff --git a/fs/file.c b/fs/file.c
index 3b683b9101d8..8076aef9c210 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -271,6 +271,11 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
+static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
+{
+ return test_bit(fd, fdt->open_fds);
+}
+
static unsigned int count_open_files(struct fdtable *fdt)
{
unsigned int size = fdt->max_fds;
@@ -915,13 +920,8 @@ struct file *get_file_rcu(struct file __rcu **f)
struct file __rcu *file;
file = __get_file_rcu(f);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(IS_ERR(file)))
- continue;
-
- return file;
+ if (!IS_ERR(file))
+ return file;
}
}
EXPORT_SYMBOL_GPL(get_file_rcu);
@@ -1219,12 +1219,9 @@ void set_close_on_exec(unsigned int fd, int flag)
bool get_close_on_exec(unsigned int fd)
{
- struct files_struct *files = current->files;
- struct fdtable *fdt;
bool res;
rcu_read_lock();
- fdt = files_fdtable(files);
- res = close_on_exec(fd, fdt);
+ res = close_on_exec(fd, current->files);
rcu_read_unlock();
return res;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 3ec8bb5e68ff..9eb191b5c4de 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1813,7 +1813,8 @@ static void fuse_resend(struct fuse_conn *fc)
spin_unlock(&fc->lock);
list_for_each_entry_safe(req, next, &to_queue, list) {
- __set_bit(FR_PENDING, &req->flags);
+ set_bit(FR_PENDING, &req->flags);
+ clear_bit(FR_SENT, &req->flags);
/* mark the request as resend request */
req->in.h.unique |= FUSE_UNIQUE_RESEND;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b57ce4157640..f39456c65ed7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -935,14 +935,10 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
}
for (i = 0; i < ap->num_pages; i++) {
- struct page *page = ap->pages[i];
+ struct folio *folio = page_folio(ap->pages[i]);
- if (!err)
- SetPageUptodate(page);
- else
- SetPageError(page);
- unlock_page(page);
- put_page(page);
+ folio_end_read(folio, !err);
+ folio_put(folio);
}
if (ia->ff)
fuse_file_put(ia->ff, false);
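
folio_end_read() collapses the old per-page uptodate/error/unlock sequence into one call, and per-page error state is no longer tracked. A rough sketch of the equivalence (the folio helper performs the marking and the unlock as a single flag update):

    if (!err)
    	folio_mark_uptodate(folio);
    folio_unlock(folio);
    folio_put(folio);
    /* ...is what folio_end_read(folio, !err) + folio_put(folio)
     * replace, minus the obsolete SetPageError(). */
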
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 726640fa439e..572ce8a82ceb 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -8,6 +8,7 @@
#include <linux/uio.h>
#include <linux/compat.h>
#include <linux/fileattr.h>
+#include <linux/fsverity.h>
static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_ioctl_out *outarg)
@@ -117,6 +118,53 @@ static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
return 0;
}
+/* For fs-verity, determine iov lengths from input */
+static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov)
+{
+ __u16 digest_size;
+ struct fsverity_digest __user *uarg = (void __user *)arg;
+
+ if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size)))
+ return -EFAULT;
+
+ if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest))
+ return -EINVAL;
+
+ iov->iov_len = sizeof(struct fsverity_digest) + digest_size;
+
+ return 0;
+}
+
+static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov,
+ unsigned int *in_iovs)
+{
+ struct fsverity_enable_arg enable;
+ struct fsverity_enable_arg __user *uarg = (void __user *)arg;
+ const __u32 max_buffer_len = FUSE_MAX_MAX_PAGES * PAGE_SIZE;
+
+ if (copy_from_user(&enable, uarg, sizeof(enable)))
+ return -EFAULT;
+
+ if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len)
+ return -ENOMEM;
+
+ if (enable.salt_size > 0) {
+ iov++;
+ (*in_iovs)++;
+
+ iov->iov_base = u64_to_user_ptr(enable.salt_ptr);
+ iov->iov_len = enable.salt_size;
+ }
+
+ if (enable.sig_size > 0) {
+ iov++;
+ (*in_iovs)++;
+
+ iov->iov_base = u64_to_user_ptr(enable.sig_ptr);
+ iov->iov_len = enable.sig_size;
+ }
+ return 0;
+}
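
The extra iovecs sized here correspond to the variable-length salt and signature buffers of the fs-verity uapi. A hedged userspace sketch of the call this code forwards (the struct and ioctl come from the standard linux/fsverity.h uapi; a FUSE server must still implement the request):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fsverity.h>

    int enable_verity(int fd)
    {
    	struct fsverity_enable_arg arg;

    	memset(&arg, 0, sizeof(arg));
    	arg.version = 1;
    	arg.hash_algorithm = FS_VERITY_HASH_ALG_SHA256;
    	arg.block_size = 4096;
    	/* salt_size/sig_size left 0: no extra iovecs appended above */
    	return ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);
    }
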
/*
* For ioctls, there is no generic way to determine how much memory
@@ -227,6 +275,18 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
out_iov = iov;
out_iovs = 1;
}
+
+ err = 0;
+ switch (cmd) {
+ case FS_IOC_MEASURE_VERITY:
+ err = fuse_setup_measure_verity(arg, iov);
+ break;
+ case FS_IOC_ENABLE_VERITY:
+ err = fuse_setup_enable_verity(arg, iov, &in_iovs);
+ break;
+ }
+ if (err)
+ goto out;
}
retry:
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index bb3e941b9503..1a52a51b6b07 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -7,6 +7,8 @@
#include <linux/fs.h>
#include <linux/dax.h>
#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/group_cpus.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/module.h>
@@ -67,6 +69,8 @@ struct virtio_fs {
unsigned int num_request_queues; /* number of request queues */
struct dax_device *dax_dev;
+ unsigned int *mq_map; /* index = cpu id, value = request vq id */
+
/* DAX memory window where file contents are mapped */
void *window_kaddr;
phys_addr_t window_phys_addr;
@@ -185,6 +189,7 @@ static void virtio_fs_ktype_release(struct kobject *kobj)
{
struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
+ kfree(vfs->mq_map);
kfree(vfs->vqs);
kfree(vfs);
}
@@ -706,6 +711,44 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
}
}
+static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ const struct cpumask *mask, *masks;
+ unsigned int q, cpu;
+
+ /* First attempt to map using existing transport layer affinities
+ * e.g. PCIe MSI-X
+ */
+ if (!vdev->config->get_vq_affinity)
+ goto fallback;
+
+ for (q = 0; q < fs->num_request_queues; q++) {
+ mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q);
+ if (!mask)
+ goto fallback;
+
+ for_each_cpu(cpu, mask)
+ fs->mq_map[cpu] = q;
+ }
+
+ return;
+fallback:
+ /* Attempt to map evenly in groups over the CPUs */
+ masks = group_cpus_evenly(fs->num_request_queues);
+	/* If even this fails, fall back to all CPUs using queue zero */
+ if (!masks) {
+ for_each_possible_cpu(cpu)
+ fs->mq_map[cpu] = 0;
+ return;
+ }
+
+ for (q = 0; q < fs->num_request_queues; q++) {
+ for_each_cpu(cpu, &masks[q])
+ fs->mq_map[cpu] = q;
+ }
+ kfree(masks);
+}
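
The per-CPU map built here is consumed at request-submission time; the hunk in virtio_fs_wake_pending_and_unlock() further down reduces queue selection to a single lookup:

    queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()];
    fsvq = &fs->vqs[queue_id];
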
+
/* Virtqueue interrupt handler */
static void virtio_fs_vq_done(struct virtqueue *vq)
{
@@ -742,6 +785,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
{
struct virtqueue **vqs;
vq_callback_t **callbacks;
+ /* Specify pre_vectors to ensure that the queues before the
+ * request queues (e.g. hiprio) don't claim any of the CPUs in
+ * the multi-queue mapping and interrupt affinities
+ */
+ struct irq_affinity desc = { .pre_vectors = VQ_REQUEST };
const char **names;
unsigned int i;
int ret = 0;
@@ -751,6 +799,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
if (fs->num_request_queues == 0)
return -EINVAL;
+	/* Truncate the number of request queues to nr_cpu_ids */
+ fs->num_request_queues = min_t(unsigned int, fs->num_request_queues,
+ nr_cpu_ids);
fs->nvqs = VQ_REQUEST + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
@@ -760,7 +811,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
GFP_KERNEL);
names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
- if (!vqs || !callbacks || !names) {
+ fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL,
+ dev_to_node(&vdev->dev));
+ if (!vqs || !callbacks || !names || !fs->mq_map) {
ret = -ENOMEM;
goto out;
}
@@ -780,7 +833,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
names[i] = fs->vqs[i].name;
}
- ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, &desc);
if (ret < 0)
goto out;
@@ -792,8 +845,10 @@ out:
kfree(names);
kfree(callbacks);
kfree(vqs);
- if (ret)
+ if (ret) {
kfree(fs->vqs);
+ kfree(fs->mq_map);
+ }
return ret;
}
@@ -939,7 +994,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
if (ret < 0)
goto out;
- /* TODO vq affinity */
+ virtio_fs_map_queues(vdev, fs);
ret = virtio_fs_setup_dax(vdev, fs);
if (ret < 0)
@@ -1023,7 +1078,6 @@ static const unsigned int feature_table[] = {};
static struct virtio_driver virtio_fs_driver = {
.driver.name = KBUILD_MODNAME,
- .driver.owner = THIS_MODULE,
.id_table = id_table,
.feature_table = feature_table,
.feature_table_size = ARRAY_SIZE(feature_table),
@@ -1288,7 +1342,7 @@ out:
static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
__releases(fiq->lock)
{
- unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+ unsigned int queue_id;
struct virtio_fs *fs;
struct fuse_req *req;
struct virtio_fs_vq *fsvq;
@@ -1302,11 +1356,13 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
fs = fiq->priv;
+ queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()];
- pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
- __func__, req->in.h.opcode, req->in.h.unique,
+ pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n",
+ __func__, req->in.h.opcode, req->in.h.unique,
req->in.h.nodeid, req->in.h.len,
- fuse_len_args(req->args->out_numargs, req->args->out_args));
+ fuse_len_args(req->args->out_numargs, req->args->out_args),
+ queue_id);
fsvq = &fs->vqs[queue_id];
ret = virtio_fs_enqueue_req(fsvq, req, false);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9f11fc1e79eb..4ea6c8bfb4e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1267,7 +1267,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping = gfs2_glock2aspace(gl);
if (mapping) {
mapping->a_ops = &gfs2_meta_aops;
- mapping->host = s->s_bdev->bd_inode;
+ mapping->host = s->s_bdev->bd_mapping->host;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->i_private_data = NULL;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 227edbaddfbc..05975ec76d35 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -114,7 +114,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
address_space_init_once(mapping);
mapping->a_ops = &gfs2_rgrp_aops;
- mapping->host = sb->s_bdev->bd_inode;
+ mapping->host = sb->s_bdev->bd_mapping->host;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->i_private_data = NULL;
diff --git a/fs/internal.h b/fs/internal.h
index 7ca738904e34..ab2225136f60 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -62,6 +62,9 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old, int newdfd,
struct filename *new, int flags);
+int vfs_tmpfile(struct mnt_idmap *idmap,
+ const struct path *parentpath,
+ struct file *file, umode_t mode);
/*
* namespace.c
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 41c8f0c68ef5..c5802a459334 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -898,11 +898,11 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
loff_t length = iomap_length(iter);
- size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
loff_t pos = iter->pos;
ssize_t total_written = 0;
long status = 0;
struct address_space *mapping = iter->inode->i_mapping;
+ size_t chunk = mapping_max_folio_size(mapping);
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do {
diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile
index 6498fd2b0f60..b25bc542a22b 100644
--- a/fs/isofs/Makefile
+++ b/fs/isofs/Makefile
@@ -5,7 +5,6 @@
obj-$(CONFIG_ISO9660_FS) += isofs.o
-isofs-objs-y := namei.o inode.o dir.o util.o rock.o export.o
-isofs-objs-$(CONFIG_JOLIET) += joliet.o
-isofs-objs-$(CONFIG_ZISOFS) += compress.o
-isofs-objs := $(isofs-objs-y)
+isofs-y := namei.o inode.o dir.o util.o rock.o export.o
+isofs-$(CONFIG_JOLIET) += joliet.o
+isofs-$(CONFIG_ZISOFS) += compress.o
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index c4da3f634b92..34d5baa5d88a 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -346,8 +346,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
for (i = 0; i < pcount; i++, index++) {
if (i != full_page)
pages[i] = grab_cache_page_nowait(mapping, index);
- if (pages[i])
- ClearPageError(pages[i]);
}
err = zisofs_fill_pages(inode, full_page, pcount, pages);
@@ -356,8 +354,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
for (i = 0; i < pcount; i++) {
if (pages[i]) {
flush_dcache_page(pages[i]);
- if (i == full_page && err)
- SetPageError(pages[i]);
unlock_page(pages[i]);
if (i != full_page)
put_page(pages[i]);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 2a616a9f289d..93b1077a380a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -21,11 +21,12 @@
#include <linux/ctype.h>
#include <linux/statfs.h>
#include <linux/cdrom.h>
-#include <linux/parser.h>
#include <linux/mpage.h>
#include <linux/user_namespace.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "isofs.h"
#include "zisofs.h"
@@ -110,10 +111,10 @@ static void destroy_inodecache(void)
kmem_cache_destroy(isofs_inode_cachep);
}
-static int isofs_remount(struct super_block *sb, int *flags, char *data)
+static int isofs_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- if (!(*flags & SB_RDONLY))
+ sync_filesystem(fc->root->d_sb);
+ if (!(fc->sb_flags & SB_RDONLY))
return -EROFS;
return 0;
}
@@ -123,7 +124,6 @@ static const struct super_operations isofs_sops = {
.free_inode = isofs_free_inode,
.put_super = isofs_put_super,
.statfs = isofs_statfs,
- .remount_fs = isofs_remount,
.show_options = isofs_show_options,
};
@@ -145,7 +145,7 @@ static const struct dentry_operations isofs_dentry_ops[] = {
#endif
};
-struct iso9660_options{
+struct isofs_options {
unsigned int rock:1;
unsigned int joliet:1;
unsigned int cruft:1;
@@ -289,197 +289,161 @@ isofs_dentry_cmpi_ms(const struct dentry *dentry,
#endif
enum {
- Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
- Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
- Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
- Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
+ Opt_block, Opt_check, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset,
+ Opt_map, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session,
+ Opt_uid, Opt_unhide, Opt_utf8, Opt_err, Opt_nocompress, Opt_hide,
+ Opt_showassoc, Opt_dmode, Opt_overriderockperm,
};
-static const match_table_t tokens = {
- {Opt_norock, "norock"},
- {Opt_nojoliet, "nojoliet"},
- {Opt_unhide, "unhide"},
- {Opt_hide, "hide"},
- {Opt_showassoc, "showassoc"},
- {Opt_cruft, "cruft"},
- {Opt_utf8, "utf8"},
- {Opt_iocharset, "iocharset=%s"},
- {Opt_map_a, "map=acorn"},
- {Opt_map_a, "map=a"},
- {Opt_map_n, "map=normal"},
- {Opt_map_n, "map=n"},
- {Opt_map_o, "map=off"},
- {Opt_map_o, "map=o"},
- {Opt_session, "session=%u"},
- {Opt_sb, "sbsector=%u"},
- {Opt_check_r, "check=relaxed"},
- {Opt_check_r, "check=r"},
- {Opt_check_s, "check=strict"},
- {Opt_check_s, "check=s"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_mode, "mode=%u"},
- {Opt_dmode, "dmode=%u"},
- {Opt_overriderockperm, "overriderockperm"},
- {Opt_block, "block=%u"},
- {Opt_ignore, "conv=binary"},
- {Opt_ignore, "conv=b"},
- {Opt_ignore, "conv=text"},
- {Opt_ignore, "conv=t"},
- {Opt_ignore, "conv=mtext"},
- {Opt_ignore, "conv=m"},
- {Opt_ignore, "conv=auto"},
- {Opt_ignore, "conv=a"},
- {Opt_nocompress, "nocompress"},
- {Opt_err, NULL}
+static const struct constant_table isofs_param_map[] = {
+ {"acorn", 'a'},
+ {"a", 'a'},
+ {"normal", 'n'},
+ {"n", 'n'},
+ {"off", 'o'},
+ {"o", 'o'},
+ {}
};
-static int parse_options(char *options, struct iso9660_options *popt)
-{
- char *p;
- int option;
- unsigned int uv;
-
- popt->map = 'n';
- popt->rock = 1;
- popt->joliet = 1;
- popt->cruft = 0;
- popt->hide = 0;
- popt->showassoc = 0;
- popt->check = 'u'; /* unset */
- popt->nocompress = 0;
- popt->blocksize = 1024;
- popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
- popt->uid_set = 0;
- popt->gid_set = 0;
- popt->gid = GLOBAL_ROOT_GID;
- popt->uid = GLOBAL_ROOT_UID;
- popt->iocharset = NULL;
- popt->overriderockperm = 0;
- popt->session=-1;
- popt->sbsector=-1;
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- substring_t args[MAX_OPT_ARGS];
- unsigned n;
-
- if (!*p)
- continue;
+static const struct constant_table isofs_param_check[] = {
+ {"relaxed", 'r'},
+ {"r", 'r'},
+ {"strict", 's'},
+ {"s", 's'},
+ {}
+};
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_norock:
- popt->rock = 0;
- break;
- case Opt_nojoliet:
- popt->joliet = 0;
- break;
- case Opt_hide:
- popt->hide = 1;
- break;
- case Opt_unhide:
- case Opt_showassoc:
- popt->showassoc = 1;
- break;
- case Opt_cruft:
- popt->cruft = 1;
- break;
+static const struct fs_parameter_spec isofs_param_spec[] = {
+ fsparam_flag ("norock", Opt_norock),
+ fsparam_flag ("nojoliet", Opt_nojoliet),
+ fsparam_flag ("unhide", Opt_unhide),
+ fsparam_flag ("hide", Opt_hide),
+ fsparam_flag ("showassoc", Opt_showassoc),
+ fsparam_flag ("cruft", Opt_cruft),
+ fsparam_flag ("utf8", Opt_utf8),
+ fsparam_string ("iocharset", Opt_iocharset),
+ fsparam_enum ("map", Opt_map, isofs_param_map),
+ fsparam_u32 ("session", Opt_session),
+ fsparam_u32 ("sbsector", Opt_sb),
+ fsparam_enum ("check", Opt_check, isofs_param_check),
+ fsparam_u32 ("uid", Opt_uid),
+ fsparam_u32 ("gid", Opt_gid),
+	/* Note: mode/dmode historically accepted %u, not strictly %o */
+ fsparam_u32 ("mode", Opt_mode),
+ fsparam_u32 ("dmode", Opt_dmode),
+ fsparam_flag ("overriderockperm", Opt_overriderockperm),
+ fsparam_u32 ("block", Opt_block),
+ fsparam_string ("conv", Opt_ignore),
+ fsparam_flag ("nocompress", Opt_nocompress),
+ {}
+};
+
+static int isofs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct isofs_options *popt = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+ kuid_t uid;
+ kgid_t gid;
+ unsigned int n;
+
+ /* There are no remountable options */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
+
+ opt = fs_parse(fc, isofs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_norock:
+ popt->rock = 0;
+ break;
+ case Opt_nojoliet:
+ popt->joliet = 0;
+ break;
+ case Opt_hide:
+ popt->hide = 1;
+ break;
+ case Opt_unhide:
+ case Opt_showassoc:
+ popt->showassoc = 1;
+ break;
+ case Opt_cruft:
+ popt->cruft = 1;
+ break;
#ifdef CONFIG_JOLIET
- case Opt_utf8:
- kfree(popt->iocharset);
- popt->iocharset = kstrdup("utf8", GFP_KERNEL);
- if (!popt->iocharset)
- return 0;
- break;
- case Opt_iocharset:
- kfree(popt->iocharset);
- popt->iocharset = match_strdup(&args[0]);
- if (!popt->iocharset)
- return 0;
- break;
+ case Opt_utf8:
+ kfree(popt->iocharset);
+ popt->iocharset = kstrdup("utf8", GFP_KERNEL);
+ if (!popt->iocharset)
+ return -ENOMEM;
+ break;
+ case Opt_iocharset:
+ kfree(popt->iocharset);
+ popt->iocharset = kstrdup(param->string, GFP_KERNEL);
+ if (!popt->iocharset)
+ return -ENOMEM;
+ break;
#endif
- case Opt_map_a:
- popt->map = 'a';
- break;
- case Opt_map_o:
- popt->map = 'o';
- break;
- case Opt_map_n:
- popt->map = 'n';
- break;
- case Opt_session:
- if (match_int(&args[0], &option))
- return 0;
- n = option;
- /*
- * Track numbers are supposed to be in range 1-99, the
- * mount option starts indexing at 0.
- */
- if (n >= 99)
- return 0;
- popt->session = n + 1;
- break;
- case Opt_sb:
- if (match_int(&args[0], &option))
- return 0;
- popt->sbsector = option;
- break;
- case Opt_check_r:
- popt->check = 'r';
- break;
- case Opt_check_s:
- popt->check = 's';
- break;
- case Opt_ignore:
- break;
- case Opt_uid:
- if (match_uint(&args[0], &uv))
- return 0;
- popt->uid = make_kuid(current_user_ns(), uv);
- if (!uid_valid(popt->uid))
- return 0;
- popt->uid_set = 1;
- break;
- case Opt_gid:
- if (match_uint(&args[0], &uv))
- return 0;
- popt->gid = make_kgid(current_user_ns(), uv);
- if (!gid_valid(popt->gid))
- return 0;
- popt->gid_set = 1;
- break;
- case Opt_mode:
- if (match_int(&args[0], &option))
- return 0;
- popt->fmode = option;
- break;
- case Opt_dmode:
- if (match_int(&args[0], &option))
- return 0;
- popt->dmode = option;
- break;
- case Opt_overriderockperm:
- popt->overriderockperm = 1;
- break;
- case Opt_block:
- if (match_int(&args[0], &option))
- return 0;
- n = option;
- if (n != 512 && n != 1024 && n != 2048)
- return 0;
- popt->blocksize = n;
- break;
- case Opt_nocompress:
- popt->nocompress = 1;
- break;
- default:
- return 0;
- }
+ case Opt_map:
+ popt->map = result.uint_32;
+ break;
+ case Opt_session:
+ n = result.uint_32;
+ /*
+		 * Track numbers are supposed to be in the range 1-99; the
+		 * mount option starts indexing at 0.
+ */
+ if (n >= 99)
+ return -EINVAL;
+ popt->session = n + 1;
+ break;
+ case Opt_sb:
+ popt->sbsector = result.uint_32;
+ break;
+ case Opt_check:
+ popt->check = result.uint_32;
+ break;
+ case Opt_ignore:
+ break;
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ popt->uid = uid;
+ popt->uid_set = 1;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ popt->gid = gid;
+ popt->gid_set = 1;
+ break;
+ case Opt_mode:
+ popt->fmode = result.uint_32;
+ break;
+ case Opt_dmode:
+ popt->dmode = result.uint_32;
+ break;
+ case Opt_overriderockperm:
+ popt->overriderockperm = 1;
+ break;
+ case Opt_block:
+ n = result.uint_32;
+ if (n != 512 && n != 1024 && n != 2048)
+ return -EINVAL;
+ popt->blocksize = n;
+ break;
+ case Opt_nocompress:
+ popt->nocompress = 1;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
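
This hunk shows only the parser and the reconfigure hook; the rest of the mount-API conversion presumably registers them via fs_context_operations. A hedged sketch of that wiring; isofs_get_tree and isofs_free_fc are assumed names for parts of the patch outside this hunk:

    static const struct fs_context_operations isofs_context_ops = {
    	.parse_param	= isofs_parse_param,
    	.get_tree	= isofs_get_tree,	/* assumed: get_tree_bdev + isofs_fill_super */
    	.reconfigure	= isofs_reconfigure,
    	.free		= isofs_free_fc,	/* assumed: frees isofs_options/iocharset */
    };
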
/*
@@ -615,7 +579,7 @@ static bool rootdir_empty(struct super_block *sb, unsigned long block)
/*
* Initialize the superblock and read the root inode.
*/
-static int isofs_fill_super(struct super_block *s, void *data, int silent)
+static int isofs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh = NULL, *pri_bh = NULL;
struct hs_primary_descriptor *h_pri = NULL;
@@ -623,7 +587,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
struct iso_supplementary_descriptor *sec = NULL;
struct iso_directory_record *rootp;
struct inode *inode;
- struct iso9660_options opt;
+ struct isofs_options *opt = fc->fs_private;
struct isofs_sb_info *sbi;
unsigned long first_data_zone;
int joliet_level = 0;
@@ -631,15 +595,13 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
int orig_zonesize;
int table, error = -EINVAL;
unsigned int vol_desc_start;
+ int silent = fc->sb_flags & SB_SILENT;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
s->s_fs_info = sbi;
- if (!parse_options((char *)data, &opt))
- goto out_freesbi;
-
/*
* First of all, get the hardware blocksize for this device.
* If we don't know what it is, or the hardware blocksize is
@@ -655,14 +617,14 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
bdev_logical_block_size(s->s_bdev));
goto out_freesbi;
}
- opt.blocksize = sb_min_blocksize(s, opt.blocksize);
+ opt->blocksize = sb_min_blocksize(s, opt->blocksize);
sbi->s_high_sierra = 0; /* default is iso9660 */
- sbi->s_session = opt.session;
- sbi->s_sbsector = opt.sbsector;
+ sbi->s_session = opt->session;
+ sbi->s_sbsector = opt->sbsector;
- vol_desc_start = (opt.sbsector != -1) ?
- opt.sbsector : isofs_get_last_session(s,opt.session);
+ vol_desc_start = (opt->sbsector != -1) ?
+ opt->sbsector : isofs_get_last_session(s, opt->session);
for (iso_blknum = vol_desc_start+16;
iso_blknum < vol_desc_start+100; iso_blknum++) {
@@ -696,7 +658,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
sec = (struct iso_supplementary_descriptor *)vdp;
if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
- if (opt.joliet) {
+ if (opt->joliet) {
if (sec->escape[2] == 0x40)
joliet_level = 1;
else if (sec->escape[2] == 0x43)
@@ -721,7 +683,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
goto out_freebh;
sbi->s_high_sierra = 1;
- opt.rock = 0;
+ opt->rock = 0;
h_pri = (struct hs_primary_descriptor *)vdp;
goto root_found;
}
@@ -749,7 +711,7 @@ root_found:
goto out_freebh;
}
- if (joliet_level && (!pri || !opt.rock)) {
+ if (joliet_level && (!pri || !opt->rock)) {
/* This is the case of Joliet with the norock mount flag.
* A disc with both Joliet and Rock Ridge is handled later
*/
@@ -780,7 +742,7 @@ root_found:
* blocks that were 512 bytes (which should only very rarely
* happen.)
*/
- if (orig_zonesize < opt.blocksize)
+ if (orig_zonesize < opt->blocksize)
goto out_bad_size;
/* RDE: convert log zone size to bit shift */
@@ -865,10 +827,10 @@ root_found:
#ifdef CONFIG_JOLIET
if (joliet_level) {
- char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
+ char *p = opt->iocharset ? opt->iocharset : CONFIG_NLS_DEFAULT;
if (strcmp(p, "utf8") != 0) {
- sbi->s_nls_iocharset = opt.iocharset ?
- load_nls(opt.iocharset) : load_nls_default();
+ sbi->s_nls_iocharset = opt->iocharset ?
+ load_nls(opt->iocharset) : load_nls_default();
if (!sbi->s_nls_iocharset)
goto out_freesbi;
}
@@ -876,29 +838,29 @@ root_found:
#endif
s->s_op = &isofs_sops;
s->s_export_op = &isofs_export_ops;
- sbi->s_mapping = opt.map;
- sbi->s_rock = (opt.rock ? 2 : 0);
+ sbi->s_mapping = opt->map;
+ sbi->s_rock = (opt->rock ? 2 : 0);
sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
- sbi->s_cruft = opt.cruft;
- sbi->s_hide = opt.hide;
- sbi->s_showassoc = opt.showassoc;
- sbi->s_uid = opt.uid;
- sbi->s_gid = opt.gid;
- sbi->s_uid_set = opt.uid_set;
- sbi->s_gid_set = opt.gid_set;
- sbi->s_nocompress = opt.nocompress;
- sbi->s_overriderockperm = opt.overriderockperm;
+ sbi->s_cruft = opt->cruft;
+ sbi->s_hide = opt->hide;
+ sbi->s_showassoc = opt->showassoc;
+ sbi->s_uid = opt->uid;
+ sbi->s_gid = opt->gid;
+ sbi->s_uid_set = opt->uid_set;
+ sbi->s_gid_set = opt->gid_set;
+ sbi->s_nocompress = opt->nocompress;
+ sbi->s_overriderockperm = opt->overriderockperm;
/*
* It would be incredibly stupid to allow people to mark every file
* on the disk as suid, so we merely allow them to set the default
* permissions.
*/
- if (opt.fmode != ISOFS_INVALID_MODE)
- sbi->s_fmode = opt.fmode & 0777;
+ if (opt->fmode != ISOFS_INVALID_MODE)
+ sbi->s_fmode = opt->fmode & 0777;
else
sbi->s_fmode = ISOFS_INVALID_MODE;
- if (opt.dmode != ISOFS_INVALID_MODE)
- sbi->s_dmode = opt.dmode & 0777;
+ if (opt->dmode != ISOFS_INVALID_MODE)
+ sbi->s_dmode = opt->dmode & 0777;
else
sbi->s_dmode = ISOFS_INVALID_MODE;
@@ -960,12 +922,12 @@ root_found:
}
}
- if (opt.check == 'u') {
+ if (opt->check == 'u') {
/* Only Joliet is case insensitive by default */
if (joliet_level)
- opt.check = 'r';
+ opt->check = 'r';
else
- opt.check = 's';
+ opt->check = 's';
}
sbi->s_joliet_level = joliet_level;
@@ -980,9 +942,9 @@ root_found:
table = 0;
if (joliet_level)
table += 2;
- if (opt.check == 'r')
+ if (opt->check == 'r')
table++;
- sbi->s_check = opt.check;
+ sbi->s_check = opt->check;
if (table)
s->s_d_op = &isofs_dentry_ops[table - 1];
@@ -994,7 +956,7 @@ root_found:
goto out_no_inode;
}
- kfree(opt.iocharset);
+ kfree(opt->iocharset);
return 0;
@@ -1023,7 +985,7 @@ out_bad_zone_size:
goto out_freebh;
out_bad_size:
printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n",
- orig_zonesize, opt.blocksize);
+ orig_zonesize, opt->blocksize);
goto out_freebh;
out_unknown_format:
if (!silent)
@@ -1033,7 +995,7 @@ out_freebh:
brelse(bh);
brelse(pri_bh);
out_freesbi:
- kfree(opt.iocharset);
+ kfree(opt->iocharset);
kfree(sbi);
s->s_fs_info = NULL;
return error;
@@ -1567,18 +1529,63 @@ struct inode *__isofs_iget(struct super_block *sb,
return inode;
}
-static struct dentry *isofs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int isofs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+ return get_tree_bdev(fc, isofs_fill_super);
+}
+
+static void isofs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations isofs_context_ops = {
+ .parse_param = isofs_parse_param,
+ .get_tree = isofs_get_tree,
+ .reconfigure = isofs_reconfigure,
+ .free = isofs_free_fc,
+};
+
+static int isofs_init_fs_context(struct fs_context *fc)
+{
+ struct isofs_options *opt;
+
+ opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+ if (!opt)
+ return -ENOMEM;
+
+ opt->map = 'n';
+ opt->rock = 1;
+ opt->joliet = 1;
+ opt->cruft = 0;
+ opt->hide = 0;
+ opt->showassoc = 0;
+ opt->check = 'u'; /* unset */
+ opt->nocompress = 0;
+ opt->blocksize = 1024;
+ opt->fmode = opt->dmode = ISOFS_INVALID_MODE;
+ opt->uid_set = 0;
+ opt->gid_set = 0;
+ opt->gid = GLOBAL_ROOT_GID;
+ opt->uid = GLOBAL_ROOT_UID;
+ opt->iocharset = NULL;
+ opt->overriderockperm = 0;
+ opt->session = -1;
+ opt->sbsector = -1;
+
+ fc->fs_private = opt;
+ fc->ops = &isofs_context_ops;
+
+ return 0;
}
static struct file_system_type iso9660_fs_type = {
.owner = THIS_MODULE,
.name = "iso9660",
- .mount = isofs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = isofs_init_fs_context,
+ .parameters = isofs_param_spec,
};
MODULE_ALIAS_FS("iso9660");
MODULE_ALIAS("iso9660");
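/*
 * Editor's note: illustrative sketch only, not part of the patch. It shows
 * the general shape of the new-mount-API conversion applied to isofs above:
 * per-mount options live in fc->fs_private, each parameter is parsed
 * individually, and get_tree_bdev() replaces mount_bdev(). All "examplefs"
 * names are hypothetical; fs_context/fs_parser are the real kernel APIs.
 */
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

struct examplefs_options {
	unsigned int blocksize;
};

enum { Opt_ex_blocksize };

static const struct fs_parameter_spec examplefs_param_spec[] = {
	fsparam_u32("bs", Opt_ex_blocksize),
	{}
};

static int examplefs_parse_param(struct fs_context *fc,
				 struct fs_parameter *param)
{
	struct examplefs_options *opt = fc->fs_private;
	struct fs_parse_result result;
	int token;

	token = fs_parse(fc, examplefs_param_spec, param, &result);
	if (token < 0)
		return token;

	switch (token) {
	case Opt_ex_blocksize:
		opt->blocksize = result.uint_32;
		break;
	}
	return 0;	/* note: 0 on success, where the old parser returned 1 */
}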
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b6c114c11b97..03c4b9214f56 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2009,7 +2009,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
byte_count = (block_stop - block_start + 1) *
journal->j_blocksize;
- truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+ truncate_inode_pages_range(journal->j_dev->bd_mapping,
byte_start, byte_stop);
if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 6da92ecaf66d..bb0ee1a59e71 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -44,8 +44,8 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index);
if (IS_ERR(tsk)) {
- pr_warn("fork failed for JFFS2 garbage collect thread: %ld\n",
- -PTR_ERR(tsk));
+ pr_warn("fork failed for JFFS2 garbage collect thread: %pe\n",
+ tsk);
complete(&c->gc_thread_exit);
ret = PTR_ERR(tsk);
} else {
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index ce1189793288..411de8b361b2 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -33,27 +33,19 @@ static struct kmem_cache *xattr_ref_cache;
int __init jffs2_create_slab_caches(void)
{
- full_dnode_slab = kmem_cache_create("jffs2_full_dnode",
- sizeof(struct jffs2_full_dnode),
- 0, 0, NULL);
+ full_dnode_slab = KMEM_CACHE(jffs2_full_dnode, 0);
if (!full_dnode_slab)
goto err;
- raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent",
- sizeof(struct jffs2_raw_dirent),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ raw_dirent_slab = KMEM_CACHE(jffs2_raw_dirent, SLAB_HWCACHE_ALIGN);
if (!raw_dirent_slab)
goto err;
- raw_inode_slab = kmem_cache_create("jffs2_raw_inode",
- sizeof(struct jffs2_raw_inode),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ raw_inode_slab = KMEM_CACHE(jffs2_raw_inode, SLAB_HWCACHE_ALIGN);
if (!raw_inode_slab)
goto err;
- tmp_dnode_info_slab = kmem_cache_create("jffs2_tmp_dnode",
- sizeof(struct jffs2_tmp_dnode_info),
- 0, 0, NULL);
+ tmp_dnode_info_slab = KMEM_CACHE(jffs2_tmp_dnode_info, 0);
if (!tmp_dnode_info_slab)
goto err;
@@ -63,28 +55,20 @@ int __init jffs2_create_slab_caches(void)
if (!raw_node_ref_slab)
goto err;
- node_frag_slab = kmem_cache_create("jffs2_node_frag",
- sizeof(struct jffs2_node_frag),
- 0, 0, NULL);
+ node_frag_slab = KMEM_CACHE(jffs2_node_frag, 0);
if (!node_frag_slab)
goto err;
- inode_cache_slab = kmem_cache_create("jffs2_inode_cache",
- sizeof(struct jffs2_inode_cache),
- 0, 0, NULL);
+ inode_cache_slab = KMEM_CACHE(jffs2_inode_cache, 0);
if (!inode_cache_slab)
goto err;
#ifdef CONFIG_JFFS2_FS_XATTR
- xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum",
- sizeof(struct jffs2_xattr_datum),
- 0, 0, NULL);
+ xattr_datum_cache = KMEM_CACHE(jffs2_xattr_datum, 0);
if (!xattr_datum_cache)
goto err;
- xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref",
- sizeof(struct jffs2_xattr_ref),
- 0, 0, NULL);
+ xattr_ref_cache = KMEM_CACHE(jffs2_xattr_ref, 0);
if (!xattr_ref_cache)
goto err;
#endif
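/*
 * Editor's note: illustration, not part of the patch. KMEM_CACHE() (from
 * <linux/slab.h>) derives both the cache name and the object size from the
 * struct type, so the two calls below create equivalent caches; the macro
 * form cannot let the name and the sizeof() argument drift apart. The
 * example struct is hypothetical.
 */
#include <linux/slab.h>

struct example_node {
	int payload;
};

static struct kmem_cache *cache_open_coded, *cache_macro;

static int __init example_create_caches(void)
{
	cache_open_coded = kmem_cache_create("example_node",
					     sizeof(struct example_node),
					     0, 0, NULL);
	cache_macro = KMEM_CACHE(example_node, 0);

	return (cache_open_coded && cache_macro) ? 0 : -ENOMEM;
}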
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index a7bbe879cfc3..bbab2bdc71b6 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -49,28 +49,31 @@ static int jffs2_rp_can_write(struct jffs2_sb_info *c)
return 0;
}
+static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
+ uint32_t *len, uint32_t sumsize);
+
/**
* jffs2_reserve_space - request physical space to write nodes to flash
* @c: superblock info
* @minsize: Minimum acceptable size of allocation
* @len: Returned value of allocation length
* @prio: Allocation type - ALLOC_{NORMAL,DELETION}
+ * @sumsize: summary size requested or JFFS2_SUMMARY_NOSUM_SIZE for no summary
+ *
+ * Requests a block of physical space on the flash.
*
- * Requests a block of physical space on the flash. Returns zero for success
- * and puts 'len' into the appropriate place, or returns -ENOSPC or other
- * error if appropriate. Doesn't return len since that's
+ * Returns: %0 for success, with the length actually granted stored in
+ * @len, or -ENOSPC or another error as appropriate.
*
- * If it returns zero, jffs2_reserve_space() also downs the per-filesystem
+ * If it returns %0, jffs2_reserve_space() also downs the per-filesystem
* allocation semaphore, to prevent more than one allocation from being
- * active at any time. The semaphore is later released by jffs2_commit_allocation()
+ * active at any time. The semaphore is later released by jffs2_commit_allocation().
*
* jffs2_reserve_space() may trigger garbage collection in order to make room
* for the requested allocation.
*/
-static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
- uint32_t *len, uint32_t sumsize);
-
int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
uint32_t *len, int prio, uint32_t sumsize)
{
@@ -488,13 +491,16 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
/**
* jffs2_add_physical_node_ref - add a physical node reference to the list
* @c: superblock info
- * @new: new node reference to add
+ * @ofs: offset in the block
* @len: length of this physical node
+ * @ic: inode cache pointer
*
* Should only be used to report nodes for which space has been allocated
* by jffs2_reserve_space.
*
* Must be called with the alloc_sem held.
+ *
+ * Returns: pointer to the new node on success, or an ERR_PTR-encoded errno on failure
*/
struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
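/*
 * Editor's note: hedged usage sketch, not part of the patch. It follows the
 * contract documented in the kernel-doc above: a successful
 * jffs2_reserve_space() returns 0, stores the granted length in *len, and
 * holds the per-filesystem allocation semaphore until the reservation is
 * completed. The caller function is hypothetical.
 */
static int example_write_node(struct jffs2_sb_info *c, uint32_t minsize)
{
	uint32_t len;
	int ret;

	ret = jffs2_reserve_space(c, minsize, &len, ALLOC_NORMAL,
				  JFFS2_SUMMARY_NOSUM_SIZE);
	if (ret)
		return ret;	/* e.g. -ENOSPC; alloc_sem is not held */

	/* ... write up to len bytes and register the node here ... */

	jffs2_complete_reservation(c);	/* drops the allocation semaphore */
	return 0;
}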
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index aede1be4dc0c..4545f885c41e 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -58,6 +58,7 @@ static void jffs2_i_init_once(void *foo)
struct jffs2_inode_info *f = foo;
mutex_init(&f->sem);
+ f->target = NULL;
inode_init_once(&f->vfs_inode);
}
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e29f4edf9572..1358c21837f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -206,7 +206,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
struct super_block *sb)
{
struct dentry *dentry;
- struct kernfs_node *knparent = NULL;
+ struct kernfs_node *knparent;
BUG_ON(sb->s_op != &kernfs_sops);
diff --git a/fs/namei.c b/fs/namei.c
index cb5dde0e309f..37fb0a8aa09a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3676,9 +3676,9 @@ static int do_open(struct nameidata *nd,
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply pass @nop_mnt_idmap.
*/
-static int vfs_tmpfile(struct mnt_idmap *idmap,
- const struct path *parentpath,
- struct file *file, umode_t mode)
+int vfs_tmpfile(struct mnt_idmap *idmap,
+ const struct path *parentpath,
+ struct file *file, umode_t mode)
{
struct dentry *child;
struct inode *dir = d_inode(parentpath->dentry);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 1121601536d1..07bc1fd43530 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -181,7 +181,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
struct folio *folio, *writethrough = NULL;
enum netfs_how_to_modify howto;
enum netfs_folio_trace trace;
- unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
ssize_t written = 0, ret, ret2;
loff_t i_size, pos = iocb->ki_pos, from, to;
size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 608ba6416919..e14cd53ac9fd 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -12,7 +12,7 @@
static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
{
struct inode *inode = wreq->inode;
- unsigned long long end = wreq->start + wreq->len;
+ unsigned long long end = wreq->start + wreq->transferred;
if (!wreq->error &&
i_size_read(inode) < end) {
@@ -27,7 +27,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
* Perform an unbuffered write where we may have to do an RMW operation on an
* encrypted file. This can also be used for direct I/O writes.
*/
-static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
struct netfs_group *netfs_group)
{
struct netfs_io_request *wreq;
@@ -117,6 +117,7 @@ out:
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
return ret;
}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
/**
* netfs_unbuffered_write_iter - Unbuffered write to a file
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index c90d482b1650..f4a642727479 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ atomic_inc(&ctx->io_count);
trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
@@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
@@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
+
+ if (atomic_dec_and_test(&ictx->io_count))
+ wake_up_var(&ictx->io_count);
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
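/*
 * Editor's note: illustrative sketch of the lifetime tracking added above,
 * not part of the patch. Each request increments the inode's io_count (a
 * field introduced by these hunks) at allocation, and the final put
 * decrements it and wakes any waiter; code that must drain all outstanding
 * I/O on an inode can then block like this (hypothetical caller):
 */
static void example_wait_for_requests(struct netfs_inode *ictx)
{
	/* Sleeps until the last netfs_free_request() drops io_count to 0. */
	wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0);
}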
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 60112e4b2c5e..426cf87aaf2e 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -510,7 +510,7 @@ reassess_streams:
* stream has a gap that can be jumped.
*/
if (notes & SOME_EMPTY) {
- unsigned long long jump_to = wreq->start + wreq->len;
+ unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted);
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
@@ -690,10 +690,11 @@ void netfs_write_collection_worker(struct work_struct *work)
wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
if (wreq->iocb) {
- wreq->iocb->ki_pos += wreq->transferred;
+ size_t written = min(wreq->transferred, wreq->len);
+ wreq->iocb->ki_pos += written;
if (wreq->iocb->ki_complete)
wreq->iocb->ki_complete(
- wreq->iocb, wreq->error ? wreq->error : wreq->transferred);
+ wreq->iocb, wreq->error ? wreq->error : written);
wreq->iocb = VFS_PTR_POISON;
}
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index e190043bc0da..3aa86e268f40 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -254,7 +254,7 @@ static void netfs_issue_write(struct netfs_io_request *wreq,
stream->construct = NULL;
if (subreq->start + subreq->len > wreq->start + wreq->submitted)
- wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+ WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start);
netfs_do_issue_write(stream, subreq);
}
@@ -636,7 +636,12 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
mutex_unlock(&ictx->wb_lock);
- ret = wreq->error;
+ if (wreq->iocb) {
+ ret = -EIOCBQUEUED;
+ } else {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
return ret;
}
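/*
 * Editor's note: hedged sketch tying together the two netfs hunks above,
 * not part of the patch. On collection, the bytes reported to the caller
 * are clamped so a retried or expanded write never advances ki_pos past
 * what was requested; on issue, an iocb with a completion handler gets
 * -EIOCBQUEUED instead of a synchronous wait. The helper is hypothetical.
 */
static void example_complete(struct kiocb *iocb, size_t requested,
			     size_t transferred, int error)
{
	size_t written = min(transferred, requested);
	long result = error ? error : (long)written;

	iocb->ki_pos += written;
	if (iocb->ki_complete)
		iocb->ki_complete(iocb, result);
}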
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e32d76e34d..57249f040dfc 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -33,12 +33,12 @@ config NFS_FS
config NFS_V2
tristate "NFS client support for NFS version 2"
depends on NFS_FS
- default y
+ default n
help
This option enables support for version 2 of the NFS protocol
(RFC 1094) in the kernel's NFS client.
- If unsure, say Y.
+ If unsure, say N.
config NFS_V3
tristate "NFS client support for NFS version 3"
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ac505671efbd..342930996226 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -56,6 +56,8 @@ static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
static void nfs_readdir_clear_array(struct folio *);
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, int open_flags);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -2243,6 +2245,41 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
#endif /* CONFIG_NFSV4 */
+int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned int open_flags,
+ umode_t mode)
+{
+
+ /* Same as lookup+open from lookup_open(), but with different O_TRUNC
+ * handling.
+ */
+ int error = 0;
+
+ if (open_flags & O_CREAT) {
+ file->f_mode |= FMODE_CREATED;
+ error = nfs_do_create(dir, dentry, mode, open_flags);
+ if (error)
+ return error;
+ return finish_open(file, dentry, NULL);
+ } else if (d_in_lookup(dentry)) {
+ /* The only flags nfs_lookup considers are
+ * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
+ * we want those to be zero so the lookup isn't skipped.
+ */
+ struct dentry *res = nfs_lookup(dir, dentry, 0);
+
+ d_lookup_done(dentry);
+ if (unlikely(res)) {
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+ return finish_no_open(file, res);
+ }
+ }
+ return finish_no_open(file, NULL);
+
+}
+EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
+
struct dentry *
nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
struct nfs_fattr *fattr)
@@ -2303,18 +2340,23 @@ EXPORT_SYMBOL_GPL(nfs_instantiate);
* that the operation succeeded on the server, but an error in the
* reply path made it appear to have failed.
*/
-int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, int open_flags)
{
struct iattr attr;
- int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
int error;
+ open_flags |= O_CREAT;
+
dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
+ if (open_flags & O_TRUNC) {
+ attr.ia_size = 0;
+ attr.ia_valid |= ATTR_SIZE;
+ }
trace_nfs_create_enter(dir, dentry, open_flags);
error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
@@ -2326,6 +2368,12 @@ out_err:
d_drop(dentry);
return error;
}
+
+int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0);
+}
EXPORT_SYMBOL_GPL(nfs_create);
/*
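/*
 * Editor's note: minimal sketch of the ->atomic_open() contract that
 * nfs_atomic_open_v23() above implements, not part of the patch. On
 * O_CREAT the dentry is instantiated and finish_open() opens the file;
 * without O_CREAT the method can decline via finish_no_open(), and the
 * VFS falls back to an ordinary lookup+open. examplefs_create() is a
 * hypothetical creation helper.
 */
static int examplefs_atomic_open(struct inode *dir, struct dentry *dentry,
				 struct file *file, unsigned int open_flags,
				 umode_t mode)
{
	if (open_flags & O_CREAT) {
		int err = examplefs_create(dir, dentry, mode);	/* hypothetical */

		if (err)
			return err;
		file->f_mode |= FMODE_CREATED;
		return finish_open(file, dentry, NULL);
	}
	/* No O_CREAT: decline the atomic open; the VFS opens after lookup. */
	return finish_no_open(file, NULL);
}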
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index ce8f8934bca5..29d84dc66ca3 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -605,14 +605,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
- /* FIXME: remove this check when layout segment support is added */
- if (lgr->range.offset != 0 ||
- lgr->range.length != NFS4_MAX_UINT64) {
- dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
- __func__);
- goto out;
- }
-
if (fl->pattern_offset > lgr->range.offset) {
dprintk("%s pattern_offset %lld too large\n",
__func__, fl->pattern_offset);
@@ -875,15 +867,15 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_READ,
false,
- GFP_KERNEL);
+ nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -899,15 +891,15 @@ static void
filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_RW,
false,
- GFP_NOFS);
+ nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 3e724cb7ef01..24188af56d5b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -823,14 +823,6 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
static void
-ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *req)
-{
- pnfs_generic_pg_check_layout(pgio);
- pnfs_generic_pg_check_range(pgio, req);
-}
-
-static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -840,7 +832,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
u32 ds_idx;
retry:
- ff_layout_pg_check_layout(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
/* Use full layout for now */
if (!pgio->pg_lseg) {
ff_layout_pg_get_read(pgio, req, false);
@@ -895,7 +887,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
u32 i;
retry:
- ff_layout_pg_check_layout(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg =
pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index d0a0956f8a13..6c9f3f6645dd 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -600,9 +600,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_lock:
if (result.negated) {
+ ctx->lock_status = NFS_LOCK_NOLOCK;
ctx->flags |= NFS_MOUNT_NONLM;
ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
} else {
+ ctx->lock_status = NFS_LOCK_LOCK;
ctx->flags &= ~NFS_MOUNT_NONLM;
ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
}
@@ -1112,9 +1114,12 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
ctx->acdirmax = data->acdirmax;
ctx->need_mount = false;
- memcpy(sap, &data->addr, sizeof(data->addr));
- ctx->nfs_server.addrlen = sizeof(data->addr);
- ctx->nfs_server.port = ntohs(data->addr.sin_port);
+ if (!is_remount_fc(fc)) {
+ memcpy(sap, &data->addr, sizeof(data->addr));
+ ctx->nfs_server.addrlen = sizeof(data->addr);
+ ctx->nfs_server.port = ntohs(data->addr.sin_port);
+ }
+
if (sap->ss_family != AF_INET ||
!nfs_verify_server_address(sap))
goto out_no_address;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 06253695fe53..9f0f4534744b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -112,6 +112,7 @@ struct nfs_fs_context {
unsigned short protofamily;
unsigned short mountfamily;
bool has_sec_mnt_opts;
+ int lock_status;
struct {
union {
@@ -153,6 +154,12 @@ struct nfs_fs_context {
} clone_data;
};
+enum nfs_lock_status {
+ NFS_LOCK_NOT_SET = 0,
+ NFS_LOCK_LOCK = 1,
+ NFS_LOCK_NOLOCK = 2,
+};
+
#define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \
errorf(fc, fmt, ## __VA_ARGS__) : \
({ dprintk(fmt "\n", ## __VA_ARGS__); }))
@@ -710,9 +717,9 @@ unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
if ((bsize & (bsize - 1)) || nrbitsp) {
unsigned char nrbits;
- for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
+ for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--)
;
- bsize = 1 << nrbits;
+ bsize = 1UL << nrbits;
if (nrbitsp)
*nrbitsp = nrbits;
}
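/*
 * Editor's note: why the 1UL above matters (illustration, not part of the
 * patch). With nrbits == 31, (1 << 31) shifts into the sign bit of a 32-bit
 * int, which is undefined behaviour in C; 1UL keeps the whole computation
 * in unsigned long. The helper below is a hypothetical standalone form of
 * the same rounding loop.
 */
static unsigned long example_round_down_pow2(unsigned long bsize)
{
	unsigned char nrbits;

	for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--)
		;
	return 1UL << nrbits;	/* (1 << 31) on a 32-bit int would be UB */
}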
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cbbe3f0193b8..74bda639a7cf 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -986,6 +986,7 @@ static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
static const struct inode_operations nfs3_dir_inode_operations = {
.create = nfs_create,
+ .atomic_open = nfs_atomic_open_v23,
.lookup = nfs_lookup,
.link = nfs_link,
.unlink = nfs_unlink,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ea390db94b62..c93c12063b3a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5456,7 +5456,7 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
struct rpc_message *msg = &task->tk_msg;
if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
- server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
+ task->tk_status == -ENOTSUPP) {
server->caps &= ~NFS_CAP_READ_PLUS;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
rpc_restart_call_prepare(task);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 662e86ea3a2d..5b452411e8fd 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2116,6 +2116,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_fs_locations *locations = NULL;
+ struct nfs_fattr *fattr;
struct inode *inode;
struct page *page;
int status, result;
@@ -2125,19 +2126,16 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
(unsigned long long)server->fsid.minor,
clp->cl_hostname);
- result = 0;
page = alloc_page(GFP_KERNEL);
locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
- if (page == NULL || locations == NULL) {
- dprintk("<-- %s: no memory\n", __func__);
- goto out;
- }
- locations->fattr = nfs_alloc_fattr();
- if (locations->fattr == NULL) {
+ fattr = nfs_alloc_fattr();
+ if (page == NULL || locations == NULL || fattr == NULL) {
dprintk("<-- %s: no memory\n", __func__);
+ result = 0;
goto out;
}
+ locations->fattr = fattr;
inode = d_inode(server->super->s_root);
result = nfs4_proc_get_locations(server, NFS_FH(inode), locations,
page, cred);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 10985a4b8259..4de8780a7c48 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -47,7 +47,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event,
TP_fast_assign(
__entry->error = error < 0 ? -error : 0;
- __assign_str(dstaddr, clp->cl_hostname);
+ __assign_str(dstaddr);
),
TP_printk(
@@ -94,8 +94,8 @@ TRACE_EVENT(nfs4_trunked_exchange_id,
TP_fast_assign(
__entry->error = error < 0 ? -error : 0;
- __assign_str(main_addr, clp->cl_hostname);
- __assign_str(trunk_addr, addr);
+ __assign_str(main_addr);
+ __assign_str(trunk_addr);
),
TP_printk(
@@ -365,7 +365,7 @@ TRACE_EVENT(nfs4_state_mgr,
TP_fast_assign(
__entry->state = clp->cl_state;
- __assign_str(hostname, clp->cl_hostname);
+ __assign_str(hostname);
),
TP_printk(
@@ -393,8 +393,8 @@ TRACE_EVENT(nfs4_state_mgr_failed,
TP_fast_assign(
__entry->error = status < 0 ? -status : 0;
__entry->state = clp->cl_state;
- __assign_str(hostname, clp->cl_hostname);
- __assign_str(section, section);
+ __assign_str(hostname);
+ __assign_str(section);
),
TP_printk(
@@ -578,7 +578,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->fhandle = 0;
}
__entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent));
- __assign_str(name, ctx->dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -1072,7 +1072,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->error = -error;
- __assign_str(name, name->name);
+ __assign_str(name);
),
TP_printk(
@@ -1156,8 +1156,8 @@ TRACE_EVENT(nfs4_rename,
__entry->olddir = NFS_FILEID(olddir);
__entry->newdir = NFS_FILEID(newdir);
__entry->error = error < 0 ? -error : 0;
- __assign_str(oldname, oldname->name);
- __assign_str(newname, newname->name);
+ __assign_str(oldname);
+ __assign_str(newname);
),
TP_printk(
@@ -1359,7 +1359,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+ __assign_str(dstaddr);
),
TP_printk(
@@ -1416,7 +1416,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+ __assign_str(dstaddr);
__entry->stateid_seq =
be32_to_cpu(stateid->seqid);
__entry->stateid_hash =
@@ -1960,7 +1960,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_event,
),
TP_fast_assign(
- __assign_str(dstaddr, clp->cl_hostname);
+ __assign_str(dstaddr);
memcpy(__entry->deviceid, deviceid->data,
NFS4_DEVICEID4_SIZE);
),
@@ -1998,7 +1998,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status,
TP_fast_assign(
__entry->dev = server->s_dev;
__entry->status = status;
- __assign_str(dstaddr, server->nfs_client->cl_hostname);
+ __assign_str(dstaddr);
memcpy(__entry->deviceid, deviceid->data,
NFS4_DEVICEID4_SIZE);
),
@@ -2036,8 +2036,8 @@ TRACE_EVENT(fl_getdevinfo,
),
TP_fast_assign(
- __assign_str(mds_addr, server->nfs_client->cl_hostname);
- __assign_str(ds_ips, ds_remotestr);
+ __assign_str(mds_addr);
+ __assign_str(ds_ips);
memcpy(__entry->deviceid, deviceid->data,
NFS4_DEVICEID4_SIZE);
),
@@ -2083,9 +2083,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
be32_to_cpu(hdr->args.stateid.seqid);
__entry->stateid_hash =
nfs_stateid_hash(&hdr->args.stateid);
- __assign_str(dstaddr, hdr->ds_clp ?
- rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown");
+ __assign_str(dstaddr);
),
TP_printk(
@@ -2139,9 +2137,7 @@ TRACE_EVENT(ff_layout_commit_error,
__entry->dev = inode->i_sb->s_dev;
__entry->offset = data->args.offset;
__entry->count = data->args.count;
- __assign_str(dstaddr, data->ds_clp ?
- rpc_peeraddr2str(data->ds_clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown");
+ __assign_str(dstaddr);
),
TP_printk(
@@ -2579,7 +2575,7 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk(
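/*
 * Editor's note: background for the repeated one-argument __assign_str()
 * conversions above (illustration, not part of the patch). The tracepoint
 * macros now reuse the source expression recorded by __string(), so the
 * assignment no longer repeats it. A minimal event inside a trace header
 * looks like this (event name hypothetical):
 */
TRACE_EVENT(example_event,
	TP_PROTO(const char *name),
	TP_ARGS(name),
	TP_STRUCT__entry(
		__string(name, name)	/* source expression recorded here */
	),
	TP_fast_assign(
		__assign_str(name);	/* ... so it is not repeated here */
	),
	TP_printk("name=%s", __get_str(name))
);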
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index afedb449b54f..1e710654af11 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -457,7 +457,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
__entry->error = error < 0 ? -error : 0;
__entry->flags = flags;
__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -512,7 +512,7 @@ TRACE_EVENT(nfs_atomic_open_enter,
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
__entry->fmode = (__force unsigned long)ctx->mode;
- __assign_str(name, ctx->dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -551,7 +551,7 @@ TRACE_EVENT(nfs_atomic_open_exit,
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
__entry->fmode = (__force unsigned long)ctx->mode;
- __assign_str(name, ctx->dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -587,7 +587,7 @@ TRACE_EVENT(nfs_create_enter,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -623,7 +623,7 @@ TRACE_EVENT(nfs_create_exit,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -654,7 +654,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event,
TP_fast_assign(
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -693,7 +693,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->error = error < 0 ? -error : 0;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -747,7 +747,7 @@ TRACE_EVENT(nfs_link_enter,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->dir = NFS_FILEID(dir);
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -783,7 +783,7 @@ TRACE_EVENT(nfs_link_exit,
__entry->fileid = NFS_FILEID(inode);
__entry->dir = NFS_FILEID(dir);
__entry->error = error < 0 ? -error : 0;
- __assign_str(name, dentry->d_name.name);
+ __assign_str(name);
),
TP_printk(
@@ -819,8 +819,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event,
__entry->dev = old_dir->i_sb->s_dev;
__entry->old_dir = NFS_FILEID(old_dir);
__entry->new_dir = NFS_FILEID(new_dir);
- __assign_str(old_name, old_dentry->d_name.name);
- __assign_str(new_name, new_dentry->d_name.name);
+ __assign_str(old_name);
+ __assign_str(new_name);
),
TP_printk(
@@ -868,8 +868,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done,
__entry->error = -error;
__entry->old_dir = NFS_FILEID(old_dir);
__entry->new_dir = NFS_FILEID(new_dir);
- __assign_str(old_name, old_dentry->d_name.name);
- __assign_str(new_name, new_dentry->d_name.name);
+ __assign_str(old_name);
+ __assign_str(new_name);
),
TP_printk(
@@ -1636,8 +1636,8 @@ TRACE_EVENT(nfs_mount_assign,
),
TP_fast_assign(
- __assign_str(option, option);
- __assign_str(value, value);
+ __assign_str(option);
+ __assign_str(value);
),
TP_printk("option %s=%s",
@@ -1657,7 +1657,7 @@ TRACE_EVENT(nfs_mount_option,
),
TP_fast_assign(
- __assign_str(option, param->key);
+ __assign_str(option);
),
TP_printk("option %s", __get_str(option))
@@ -1675,7 +1675,7 @@ TRACE_EVENT(nfs_mount_path,
),
TP_fast_assign(
- __assign_str(path, path);
+ __assign_str(path);
),
TP_printk("path='%s'", __get_str(path))
@@ -1710,9 +1710,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event,
__entry->xid = be32_to_cpu(rqstp->rq_xid);
__entry->version = task->tk_client->cl_vers;
__entry->error = error;
- __assign_str(program,
- task->tk_client->cl_program->name);
- __assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+ __assign_str(program);
+ __assign_str(procedure);
),
TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a5cc6199127f..b5834728f31b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2705,38 +2705,28 @@ pnfs_layout_return_unused_byclid(struct nfs_client *clp,
&range);
}
+/* Check that we have a valid layout; if there is no intersection between
+ * the request and pgio->pg_lseg, put this pgio->pg_lseg away.
+ */
void
-pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
+pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
{
if (pgio->pg_lseg == NULL ||
- test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
+ (test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags) &&
+ pnfs_lseg_request_intersecting(pgio->pg_lseg, req)))
return;
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
-/*
- * Check for any intersection between the request and the pgio->pg_lseg,
- * and if none, put this pgio->pg_lseg away.
- */
-void
-pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
-
void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 rd_size;
- pnfs_generic_pg_check_layout(pgio);
- pnfs_generic_pg_check_range(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
if (pgio->pg_lseg == NULL) {
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
@@ -2766,8 +2756,7 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- pnfs_generic_pg_check_layout(pgio);
- pnfs_generic_pg_check_range(pgio, req);
+ pnfs_generic_pg_check_layout(pgio, req);
if (pgio->pg_lseg == NULL) {
pgio->pg_lseg =
pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index db57a85500ee..fa5beeaaf5da 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -257,8 +257,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
-void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
-void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
+void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ad3a321ae997..d105e5b2659d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -695,6 +695,7 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags)
static const struct inode_operations nfs_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
+ .atomic_open = nfs_atomic_open_v23,
.link = nfs_link,
.unlink = nfs_unlink,
.symlink = nfs_symlink,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index dc03f98f7616..cbbd4866b0b7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -901,6 +901,16 @@ static struct nfs_server *nfs_try_mount_request(struct fs_context *fc)
rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
unsigned int authlist_len = ARRAY_SIZE(authlist);
+ /* make sure 'nolock'/'lock' override the 'local_lock' mount option */
+ if (ctx->lock_status) {
+ if (ctx->lock_status == NFS_LOCK_NOLOCK) {
+ ctx->flags |= NFS_MOUNT_NONLM;
+ ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ } else {
+ ctx->flags &= ~NFS_MOUNT_NONLM;
+ ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ }
+ }
status = nfs_request_mount(fc, ctx->mntfh, authlist, &authlist_len);
if (status)
return ERR_PTR(status);
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ddd3e0d9cfa6..ad9083ca144b 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -159,8 +159,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
do {
fsnotify_group_lock(nfsd_file_fsnotify_group);
- mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
- nfsd_file_fsnotify_group);
+ mark = fsnotify_find_inode_mark(inode,
+ nfsd_file_fsnotify_group);
if (mark) {
nfm = nfsd_file_mark_get(container_of(mark,
struct nfsd_file_mark,
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index b5e48d504062..77bbd23aa150 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound,
TP_fast_assign(
__entry->xid = be32_to_cpu(rqst->rq_xid);
__entry->opcnt = opcnt;
- __assign_str(tag, tag);
+ __assign_str(tag);
),
TP_printk("xid=0x%08x opcnt=%u tag=%s",
__entry->xid, __entry->opcnt, __get_str(tag)
@@ -127,7 +127,7 @@ TRACE_EVENT(nfsd_compound_status,
__entry->args_opcnt = args_opcnt;
__entry->resp_opcnt = resp_opcnt;
__entry->status = be32_to_cpu(status);
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("op=%u/%u %s status=%d",
__entry->resp_opcnt, __entry->args_opcnt,
@@ -318,7 +318,7 @@ TRACE_EVENT(nfsd_exp_find_key,
TP_fast_assign(
__entry->fsidtype = key->ek_fsidtype;
memcpy(__entry->fsid, key->ek_fsid, 4*6);
- __assign_str(auth_domain, key->ek_client->name);
+ __assign_str(auth_domain);
__entry->status = status;
),
TP_printk("fsid=%x::%s domain=%s status=%d",
@@ -342,8 +342,8 @@ TRACE_EVENT(nfsd_expkey_update,
TP_fast_assign(
__entry->fsidtype = key->ek_fsidtype;
memcpy(__entry->fsid, key->ek_fsid, 4*6);
- __assign_str(auth_domain, key->ek_client->name);
- __assign_str(path, exp_path);
+ __assign_str(auth_domain);
+ __assign_str(path);
__entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
),
TP_printk("fsid=%x::%s domain=%s path=%s cache=%s",
@@ -365,8 +365,8 @@ TRACE_EVENT(nfsd_exp_get_by_name,
__field(int, status)
),
TP_fast_assign(
- __assign_str(path, key->ex_path.dentry->d_name.name);
- __assign_str(auth_domain, key->ex_client->name);
+ __assign_str(path);
+ __assign_str(auth_domain);
__entry->status = status;
),
TP_printk("path=%s domain=%s status=%d",
@@ -385,8 +385,8 @@ TRACE_EVENT(nfsd_export_update,
__field(bool, cache)
),
TP_fast_assign(
- __assign_str(path, key->ex_path.dentry->d_name.name);
- __assign_str(auth_domain, key->ex_client->name);
+ __assign_str(path);
+ __assign_str(auth_domain);
__entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
),
TP_printk("path=%s domain=%s cache=%s",
@@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent,
TP_fast_assign(
__entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
__entry->ino = ino;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("fh_hash=0x%08x ino=%llu name=%s",
__entry->fh_hash, __entry->ino, __get_str(name)
@@ -1000,7 +1000,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__entry->flavor = clp->cl_cred.cr_flavor;
memcpy(__entry->verifier, (void *)&clp->cl_verifier,
NFS4_VERIFIER_SIZE);
- __assign_str(name, clp->cl_name.data);
+ __assign_str(name);
),
TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
__entry->addr, __get_str(name),
@@ -1519,7 +1519,7 @@ TRACE_EVENT(nfsd_cb_setup,
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- __assign_str(netid, netid);
+ __assign_str(netid);
__entry->authflavor = authflavor;
__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen)
@@ -1864,7 +1864,7 @@ TRACE_EVENT(nfsd_ctl_unlock_ip,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(address, address);
+ __assign_str(address);
),
TP_printk("address=%s",
__get_str(address)
@@ -1883,7 +1883,7 @@ TRACE_EVENT(nfsd_ctl_unlock_fs,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(path, path);
+ __assign_str(path);
),
TP_printk("path=%s",
__get_str(path)
@@ -1907,8 +1907,8 @@ TRACE_EVENT(nfsd_ctl_filehandle,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->maxsize = maxsize;
- __assign_str(domain, domain);
- __assign_str(path, path);
+ __assign_str(domain);
+ __assign_str(path);
),
TP_printk("domain=%s path=%s maxsize=%d",
__get_str(domain), __get_str(path), __entry->maxsize
@@ -1968,7 +1968,7 @@ TRACE_EVENT(nfsd_ctl_version,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(mesg, mesg);
+ __assign_str(mesg);
),
TP_printk("%s",
__get_str(mesg)
@@ -2009,7 +2009,7 @@ TRACE_EVENT(nfsd_ctl_ports_addxprt,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->port = port;
- __assign_str(transport, transport);
+ __assign_str(transport);
),
TP_printk("transport=%s port=%d",
__get_str(transport), __entry->port
@@ -2070,7 +2070,7 @@ TRACE_EVENT(nfsd_ctl_time,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->time = time;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("file=%s time=%d",
__get_str(name), __entry->time
@@ -2089,7 +2089,7 @@ TRACE_EVENT(nfsd_ctl_recoverydir,
),
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
- __assign_str(recdir, recdir);
+ __assign_str(recdir);
),
TP_printk("recdir=%s",
__get_str(recdir)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 020f304c600e..b638dc06df2f 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -702,8 +702,12 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
if (WARN_ON(!bh))
return; /* should never happen */
+ lock_buffer(bh);
memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
set_buffer_dirty(bh);
+ unlock_buffer(bh);
+
err = sync_dirty_buffer(bh);
if (unlikely(err))
nilfs_warn(nilfs->ns_sb,
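/*
 * Editor's note: hedged sketch of the buffer-head pattern applied above,
 * not part of the patch. Modifying b_data requires the buffer lock, and a
 * fully overwritten buffer must be marked uptodate before dirtying so a
 * later read-back does not re-fetch stale contents from disk. The helper
 * is hypothetical.
 */
static int example_zero_block(struct buffer_head *bh)
{
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);	/* contents fully rewritten in memory */
	set_buffer_dirty(bh);
	unlock_buffer(bh);

	return sync_dirty_buffer(bh);	/* write it out, return the result */
}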
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 8654ab8ad534..60d4f59f7665 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2118,8 +2118,10 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
{
spin_lock(&sci->sc_state_lock);
if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
- sci->sc_timer.expires = jiffies + sci->sc_interval;
- add_timer(&sci->sc_timer);
+ if (sci->sc_task) {
+ sci->sc_timer.expires = jiffies + sci->sc_interval;
+ add_timer(&sci->sc_timer);
+ }
sci->sc_state |= NILFS_SEGCTOR_COMMIT;
}
spin_unlock(&sci->sc_state_lock);
@@ -2166,19 +2168,36 @@ static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
struct nilfs_segctor_wait_request wait_req;
int err = 0;
- spin_lock(&sci->sc_state_lock);
init_wait(&wait_req.wq);
wait_req.err = 0;
atomic_set(&wait_req.done, 0);
+ init_waitqueue_entry(&wait_req.wq, current);
+
+ /*
+ * To avoid missing completion notifications from the log writer
+ * thread, increment the request sequence count "sc_seq_request" and
+ * insert a wait queue entry carrying that sequence number into the
+ * "sc_wait_request" queue within the same "sc_state_lock" critical
+ * section.
+ */
+ spin_lock(&sci->sc_state_lock);
wait_req.seq = ++sci->sc_seq_request;
+ add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
spin_unlock(&sci->sc_state_lock);
- init_waitqueue_entry(&wait_req.wq, current);
- add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
- set_current_state(TASK_INTERRUPTIBLE);
wake_up(&sci->sc_wait_daemon);
for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * Synchronize only while the log writer thread is alive; once it
+ * has exited, leave the remaining flush work to the cleanup path
+ * in nilfs_segctor_destroy().
+ */
+ if (!sci->sc_task)
+ break;
+
if (atomic_read(&wait_req.done)) {
err = wait_req.err;
break;
@@ -2194,7 +2213,7 @@ static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
return err;
}
-static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
+static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err, bool force)
{
struct nilfs_segctor_wait_request *wrq, *n;
unsigned long flags;
@@ -2202,7 +2221,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) {
if (!atomic_read(&wrq->done) &&
- nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
+ (force || nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq))) {
wrq->err = err;
atomic_set(&wrq->done, 1);
}
@@ -2320,10 +2339,21 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
*/
static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
{
+ bool thread_is_alive;
+
spin_lock(&sci->sc_state_lock);
sci->sc_seq_accepted = sci->sc_seq_request;
+ thread_is_alive = (bool)sci->sc_task;
spin_unlock(&sci->sc_state_lock);
- del_timer_sync(&sci->sc_timer);
+
+ /*
+ * This function does not race with the log writer thread's
+ * termination. Therefore, deleting sc_timer, which should not be
+ * done after the log writer thread exits, can be done safely outside
+ * the area protected by sc_state_lock.
+ */
+ if (thread_is_alive)
+ del_timer_sync(&sci->sc_timer);
}
/**
@@ -2340,7 +2370,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
if (mode == SC_LSEG_SR) {
sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
sci->sc_seq_done = sci->sc_seq_accepted;
- nilfs_segctor_wakeup(sci, err);
+ nilfs_segctor_wakeup(sci, err, false);
sci->sc_flush_request = 0;
} else {
if (mode == SC_FLUSH_FILE)
@@ -2349,7 +2379,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
sci->sc_flush_request &= ~FLUSH_DAT_BIT;
/* re-enable timer if checkpoint creation was not done */
- if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+ if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && sci->sc_task &&
time_before(jiffies, sci->sc_timer.expires))
add_timer(&sci->sc_timer);
}
@@ -2539,6 +2569,7 @@ static int nilfs_segctor_thread(void *arg)
int timeout = 0;
sci->sc_timer_task = current;
+ timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0);
/* start sync. */
sci->sc_task = current;
@@ -2606,6 +2637,7 @@ static int nilfs_segctor_thread(void *arg)
end_thread:
/* end sync. */
sci->sc_task = NULL;
+ timer_shutdown_sync(&sci->sc_timer);
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
spin_unlock(&sci->sc_state_lock);
return 0;
@@ -2669,7 +2701,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
INIT_LIST_HEAD(&sci->sc_gc_inodes);
INIT_LIST_HEAD(&sci->sc_iput_queue);
INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
- timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2723,6 +2754,13 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
|| sci->sc_seq_request != sci->sc_seq_done);
spin_unlock(&sci->sc_state_lock);
+ /*
+ * Forcibly wake up tasks waiting in nilfs_segctor_sync(), which can
+ * be called from delayed iput() via nilfs_evict_inode() and can race
+ * with the above log writer thread termination.
+ */
+ nilfs_segctor_wakeup(sci, 0, true);
+
if (flush_work(&sci->sc_iput_work))
flag = true;
@@ -2748,7 +2786,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
down_write(&nilfs->ns_segctor_sem);
- timer_shutdown_sync(&sci->sc_timer);
kfree(sci);
}
@@ -2784,7 +2821,7 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (!nilfs->ns_writer)
return -ENOMEM;
- inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
+ inode_attach_wb(nilfs->ns_bdev->bd_mapping->host, NULL);
err = nilfs_segctor_start_thread(nilfs->ns_writer);
if (unlikely(err))
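/*
 * Editor's note: generic form of the lost-wakeup fix in the
 * nilfs_segctor_sync() changes above (illustration, not part of the patch).
 * The waiter must publish its request and join the wait queue under the
 * same lock the waker takes before scanning the queue, and must re-check
 * the condition after set_current_state(). Names are hypothetical.
 */
static int example_sync(spinlock_t *lock, wait_queue_head_t *wq,
			atomic_t *done)
{
	DECLARE_WAITQUEUE(wait, current);

	spin_lock(lock);
	/* ... record the request sequence number here ... */
	add_wait_queue(wq, &wait);	/* enqueue before dropping the lock */
	spin_unlock(lock);

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (atomic_read(done))
			break;
		if (signal_pending(current))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(wq, &wait);

	return atomic_read(done) ? 0 : -ERESTARTSYS;
}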
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 3464fa7e8538..f3669403fabf 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -162,7 +162,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
if (!S_ISDIR(inode->i_mode))
return;
- fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
+ fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
if (!fsn_mark)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
fsnotify_group_lock(dnotify_group);
/* add the new_fsn_mark or find an old one. */
- fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
+ fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
if (fsn_mark) {
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index fbdc63cc10d9..9ec313e9f6e1 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1076,7 +1076,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
}
static int fanotify_remove_mark(struct fsnotify_group *group,
- fsnotify_connp_t *connp, __u32 mask,
+ void *obj, unsigned int obj_type, __u32 mask,
unsigned int flags, __u32 umask)
{
struct fsnotify_mark *fsn_mark = NULL;
@@ -1084,7 +1084,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
int destroy_mark;
fsnotify_group_lock(group);
- fsn_mark = fsnotify_find_mark(connp, group);
+ fsn_mark = fsnotify_find_mark(obj, obj_type, group);
if (!fsn_mark) {
fsnotify_group_unlock(group);
return -ENOENT;
@@ -1105,30 +1105,6 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
return 0;
}
-static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
- struct vfsmount *mnt, __u32 mask,
- unsigned int flags, __u32 umask)
-{
- return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- mask, flags, umask);
-}
-
-static int fanotify_remove_sb_mark(struct fsnotify_group *group,
- struct super_block *sb, __u32 mask,
- unsigned int flags, __u32 umask)
-{
- return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
- flags, umask);
-}
-
-static int fanotify_remove_inode_mark(struct fsnotify_group *group,
- struct inode *inode, __u32 mask,
- unsigned int flags, __u32 umask)
-{
- return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
- flags, umask);
-}
-
static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
unsigned int fan_flags)
{
@@ -1249,7 +1225,7 @@ static int fanotify_set_mark_fsid(struct fsnotify_group *group,
}
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
- fsnotify_connp_t *connp,
+ void *obj,
unsigned int obj_type,
unsigned int fan_flags,
struct fan_fsid *fsid)
@@ -1288,7 +1264,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
}
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0);
+ ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
if (ret)
goto out_put_mark;
@@ -1344,7 +1320,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
}
static int fanotify_add_mark(struct fsnotify_group *group,
- fsnotify_connp_t *connp, unsigned int obj_type,
+ void *obj, unsigned int obj_type,
__u32 mask, unsigned int fan_flags,
struct fan_fsid *fsid)
{
@@ -1353,9 +1329,9 @@ static int fanotify_add_mark(struct fsnotify_group *group,
int ret = 0;
fsnotify_group_lock(group);
- fsn_mark = fsnotify_find_mark(connp, group);
+ fsn_mark = fsnotify_find_mark(obj, obj_type, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
+ fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
fan_flags, fsid);
if (IS_ERR(fsn_mark)) {
fsnotify_group_unlock(group);
@@ -1392,42 +1368,6 @@ out:
return ret;
}
-static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
- struct vfsmount *mnt, __u32 mask,
- unsigned int flags, struct fan_fsid *fsid)
-{
- return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
-}
-
-static int fanotify_add_sb_mark(struct fsnotify_group *group,
- struct super_block *sb, __u32 mask,
- unsigned int flags, struct fan_fsid *fsid)
-{
- return fanotify_add_mark(group, &sb->s_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
-}
-
-static int fanotify_add_inode_mark(struct fsnotify_group *group,
- struct inode *inode, __u32 mask,
- unsigned int flags, struct fan_fsid *fsid)
-{
- pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
-
- /*
- * If some other task has this inode open for write we should not add
- * an ignore mask, unless that ignore mask is supposed to survive
- * modification changes anyway.
- */
- if ((flags & FANOTIFY_MARK_IGNORE_BITS) &&
- !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
- inode_is_open_for_write(inode))
- return 0;
-
- return fanotify_add_mark(group, &inode->i_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
-}
-
static struct fsnotify_event *fanotify_alloc_overflow_event(void)
{
struct fanotify_event *oevent;
@@ -1576,13 +1516,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
INIT_LIST_HEAD(&group->fanotify_data.access_list);
switch (class) {
case FAN_CLASS_NOTIF:
- group->priority = FS_PRIO_0;
+ group->priority = FSNOTIFY_PRIO_NORMAL;
break;
case FAN_CLASS_CONTENT:
- group->priority = FS_PRIO_1;
+ group->priority = FSNOTIFY_PRIO_CONTENT;
break;
case FAN_CLASS_PRE_CONTENT:
- group->priority = FS_PRIO_2;
+ group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
break;
default:
fd = -EINVAL;
@@ -1750,6 +1690,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
unsigned int obj_type, fid_mode;
+ void *obj;
u32 umask = 0;
int ret;
@@ -1833,12 +1774,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
/*
- * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
- * allowed to set permissions events.
+ * Permission events require a priority of at least FAN_CLASS_CONTENT.
*/
ret = -EINVAL;
if (mask & FANOTIFY_PERM_EVENTS &&
- group->priority == FS_PRIO_0)
+ group->priority < FSNOTIFY_PRIO_CONTENT)
goto fput_and_out;
if (mask & FAN_FS_ERROR &&
@@ -1908,17 +1848,34 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/* inode held in place by reference to path; group by fget on fd */
- if (mark_type == FAN_MARK_INODE)
+ if (mark_type == FAN_MARK_INODE) {
inode = path.dentry->d_inode;
- else
+ obj = inode;
+ } else {
mnt = path.mnt;
+ if (mark_type == FAN_MARK_MOUNT)
+ obj = mnt;
+ else
+ obj = mnt->mnt_sb;
+ }
- ret = mnt ? -EINVAL : -EISDIR;
- /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
- if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE &&
- (mnt || S_ISDIR(inode->i_mode)) &&
- !(flags & FAN_MARK_IGNORED_SURV_MODIFY))
- goto path_put_and_out;
+ /*
+ * If some other task has this inode open for write we should not add
+ * an ignore mask, unless that ignore mask is supposed to survive
+ * modification changes anyway.
+ */
+ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
+ !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
+ ret = mnt ? -EINVAL : -EISDIR;
+ /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
+ if (ignore == FAN_MARK_IGNORE &&
+ (mnt || S_ISDIR(inode->i_mode)))
+ goto path_put_and_out;
+
+ ret = 0;
+ if (inode && inode_is_open_for_write(inode))
+ goto path_put_and_out;
+ }
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
@@ -1936,26 +1893,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
/* create/update an inode mark */
switch (mark_cmd) {
case FAN_MARK_ADD:
- if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_add_vfsmount_mark(group, mnt, mask,
- flags, fsid);
- else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
- flags, fsid);
- else
- ret = fanotify_add_inode_mark(group, inode, mask,
- flags, fsid);
+ ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
+ fsid);
break;
case FAN_MARK_REMOVE:
- if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
- flags, umask);
- else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
- flags, umask);
- else
- ret = fanotify_remove_inode_mark(group, inode, mask,
- flags, umask);
+ ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
+ umask);
break;
default:
ret = -EINVAL;
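
The per-object-type helpers above collapse into a single fanotify_add_mark() / fanotify_remove_mark() pair keyed by the (obj, obj_type) that do_fanotify_mark() now resolves up front. The userspace contract is unchanged; a minimal sketch of how the mark-type flags select the object (illustrative path, error handling omitted):

#include <fcntl.h>
#include <sys/fanotify.h>

int main(void)
{
	/* Needs CAP_SYS_ADMIN; FAN_CLASS_NOTIF maps to FSNOTIFY_PRIO_NORMAL. */
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_CLOEXEC, O_RDONLY);

	/*
	 * FAN_MARK_FILESYSTEM -> obj is the sb, FAN_MARK_MOUNT -> the mount,
	 * default -> the path's inode.
	 */
	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
		      FAN_MODIFY | FAN_CLOSE_WRITE, AT_FDCWD, "/tmp");
	return 0;
}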
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 5c430736ec12..dec553034027 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -41,29 +41,25 @@ static void show_fdinfo(struct seq_file *m, struct file *f,
#if defined(CONFIG_EXPORTFS)
static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
{
- struct {
- struct file_handle handle;
- u8 pad[MAX_HANDLE_SZ];
- } f;
+ DEFINE_FLEX(struct file_handle, f, f_handle, handle_bytes, MAX_HANDLE_SZ);
int size, ret, i;
- f.handle.handle_bytes = sizeof(f.pad);
- size = f.handle.handle_bytes >> 2;
+ size = f->handle_bytes >> 2;
- ret = exportfs_encode_fid(inode, (struct fid *)f.handle.f_handle, &size);
+ ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size);
if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handle for inotify: %d\n", ret);
return;
}
- f.handle.handle_type = ret;
- f.handle.handle_bytes = size * sizeof(u32);
+ f->handle_type = ret;
+ f->handle_bytes = size * sizeof(u32);
seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
- f.handle.handle_bytes, f.handle.handle_type);
+ f->handle_bytes, f->handle_type);
- for (i = 0; i < f.handle.handle_bytes; i++)
- seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
+ for (i = 0; i < f->handle_bytes; i++)
+ seq_printf(m, "%02x", (int)f->f_handle[i]);
}
#else
static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
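
DEFINE_FLEX() from <linux/overflow.h> replaces the hand-rolled struct-plus-padding: it declares an on-stack instance of a flexible-array struct with storage for a fixed element count and initializes the named counter member to that count. A sketch with a made-up struct, assuming the five-argument form used in this hunk:

#include <linux/overflow.h>

struct msg {
	u16 len;			/* element count */
	u8 data[] __counted_by(len);
};

static void demo(void)
{
	/* Reserves 64 bytes of data on the stack and sets m->len = 64. */
	DEFINE_FLEX(struct msg, m, data, len, 64);

	m->data[0] = 0xff;		/* stays within the declared bounds */
}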
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2fc105a72a8f..ff69ae24c4e8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -89,11 +89,25 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
void fsnotify_sb_delete(struct super_block *sb)
{
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+
+ /* Were any marks ever added to any object on this sb? */
+ if (!sbinfo)
+ return;
+
fsnotify_unmount_inodes(sb);
fsnotify_clear_marks_by_sb(sb);
/* Wait for outstanding object references from connectors */
- wait_var_event(&sb->s_fsnotify_connectors,
- !atomic_long_read(&sb->s_fsnotify_connectors));
+ wait_var_event(fsnotify_sb_watched_objects(sb),
+ !atomic_long_read(fsnotify_sb_watched_objects(sb)));
+ WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT));
+ WARN_ON(fsnotify_sb_has_priority_watchers(sb,
+ FSNOTIFY_PRIO_PRE_CONTENT));
+}
+
+void fsnotify_sb_free(struct super_block *sb)
+{
+ kfree(sb->s_fsnotify_info);
}
/*
@@ -489,6 +503,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
{
const struct path *path = fsnotify_data_path(data, data_type);
struct super_block *sb = fsnotify_data_sb(data, data_type);
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
struct fsnotify_iter_info iter_info = {};
struct mount *mnt = NULL;
struct inode *inode2 = NULL;
@@ -525,7 +540,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (!sb->s_fsnotify_marks &&
+ if ((!sbinfo || !sbinfo->sb_marks) &&
(!mnt || !mnt->mnt_fsnotify_marks) &&
(!inode || !inode->i_fsnotify_marks) &&
(!inode2 || !inode2->i_fsnotify_marks))
@@ -552,8 +567,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
- iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
- fsnotify_first_mark(&sb->s_fsnotify_marks);
+ if (sbinfo) {
+ iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
+ fsnotify_first_mark(&sbinfo->sb_marks);
+ }
if (mnt) {
iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
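
fsnotify_sb_delete() now drains the per-sb watched-objects counter rather than s_fsnotify_connectors, but the synchronization idiom is the same generic wait_var_event()/wake_up_var() pairing; roughly, for a hypothetical refcounted object:

#include <linux/atomic.h>
#include <linux/wait_bit.h>

struct obj {
	atomic_long_t refs;
};

/* Waiter side: sleep until the count reaches zero. */
static void obj_drain(struct obj *o)
{
	wait_var_event(&o->refs, !atomic_long_read(&o->refs));
}

/* Release side: whoever drops the last reference wakes the waiter. */
static void obj_put(struct obj *o)
{
	if (atomic_long_dec_and_test(&o->refs))
		wake_up_var(&o->refs);
}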
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index fde74eb333cc..2d059f789ee3 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -9,39 +9,58 @@
#include "../mount.h"
+/*
+ * fsnotify_connp_t is what we embed in objects to which a connector can
+ * be attached.
+ */
+typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
+
static inline struct inode *fsnotify_conn_inode(
struct fsnotify_mark_connector *conn)
{
- return container_of(conn->obj, struct inode, i_fsnotify_marks);
+ return conn->obj;
}
static inline struct mount *fsnotify_conn_mount(
struct fsnotify_mark_connector *conn)
{
- return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
+ return real_mount(conn->obj);
}
static inline struct super_block *fsnotify_conn_sb(
struct fsnotify_mark_connector *conn)
{
- return container_of(conn->obj, struct super_block, s_fsnotify_marks);
+ return conn->obj;
}
-static inline struct super_block *fsnotify_connector_sb(
- struct fsnotify_mark_connector *conn)
+static inline struct super_block *fsnotify_object_sb(void *obj,
+ enum fsnotify_obj_type obj_type)
{
- switch (conn->type) {
+ switch (obj_type) {
case FSNOTIFY_OBJ_TYPE_INODE:
- return fsnotify_conn_inode(conn)->i_sb;
+ return ((struct inode *)obj)->i_sb;
case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
- return fsnotify_conn_mount(conn)->mnt.mnt_sb;
+ return ((struct vfsmount *)obj)->mnt_sb;
case FSNOTIFY_OBJ_TYPE_SB:
- return fsnotify_conn_sb(conn);
+ return (struct super_block *)obj;
default:
return NULL;
}
}
+static inline struct super_block *fsnotify_connector_sb(
+ struct fsnotify_mark_connector *conn)
+{
+ return fsnotify_object_sb(conn->obj, conn->type);
+}
+
+static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb)
+{
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+
+ return sbinfo ? &sbinfo->sb_marks : NULL;
+}
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
@@ -67,7 +86,7 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
/* run the list of all marks associated with sb and destroy them */
static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
- fsnotify_destroy_marks(&sb->s_fsnotify_marks);
+ fsnotify_destroy_marks(fsnotify_sb_marks(sb));
}
/*
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 85d8fdd55329..4ffc30606e0b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -544,7 +544,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
int create = (arg & IN_MASK_CREATE);
int ret;
- fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+ fsn_mark = fsnotify_find_inode_mark(inode, group);
if (!fsn_mark)
return -ENOENT;
else if (create) {
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d6944ff86ffa..c3eefa70633c 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,6 +97,21 @@ void fsnotify_get_mark(struct fsnotify_mark *mark)
refcount_inc(&mark->refcnt);
}
+static fsnotify_connp_t *fsnotify_object_connp(void *obj,
+ enum fsnotify_obj_type obj_type)
+{
+ switch (obj_type) {
+ case FSNOTIFY_OBJ_TYPE_INODE:
+ return &((struct inode *)obj)->i_fsnotify_marks;
+ case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
+ return &real_mount(obj)->mnt_fsnotify_marks;
+ case FSNOTIFY_OBJ_TYPE_SB:
+ return fsnotify_sb_marks(obj);
+ default:
+ return NULL;
+ }
+}
+
static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
{
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
@@ -116,10 +131,69 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
return *fsnotify_conn_mask_p(conn);
}
+static void fsnotify_get_sb_watched_objects(struct super_block *sb)
+{
+ atomic_long_inc(fsnotify_sb_watched_objects(sb));
+}
+
+static void fsnotify_put_sb_watched_objects(struct super_block *sb)
+{
+ if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb)))
+ wake_up_var(fsnotify_sb_watched_objects(sb));
+}
+
static void fsnotify_get_inode_ref(struct inode *inode)
{
ihold(inode);
- atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+ fsnotify_get_sb_watched_objects(inode->i_sb);
+}
+
+static void fsnotify_put_inode_ref(struct inode *inode)
+{
+ fsnotify_put_sb_watched_objects(inode->i_sb);
+ iput(inode);
+}
+
+/*
+ * Grab or drop the watched-objects reference depending on whether the
+ * connector is attached to an object and has any marks attached.
+ */
+static void fsnotify_update_sb_watchers(struct super_block *sb,
+ struct fsnotify_mark_connector *conn)
+{
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+ bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED;
+ struct fsnotify_mark *first_mark = NULL;
+ unsigned int highest_prio = 0;
+
+ if (conn->obj)
+ first_mark = hlist_entry_safe(conn->list.first,
+ struct fsnotify_mark, obj_list);
+ if (first_mark)
+ highest_prio = first_mark->group->priority;
+ if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM))
+ highest_prio = 0;
+
+ /*
+ * If the highest priority of the groups watching this object is prio,
+ * then the watched object holds a reference on counters [0..prio].
+ * Update the priority >= 1 watched-objects counters.
+ */
+ for (unsigned int p = conn->prio + 1; p <= highest_prio; p++)
+ atomic_long_inc(&sbinfo->watched_objects[p]);
+ for (unsigned int p = conn->prio; p > highest_prio; p--)
+ atomic_long_dec(&sbinfo->watched_objects[p]);
+ conn->prio = highest_prio;
+
+ /* Update priority >= 0 (a.k.a total) watched objects counter */
+ BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0);
+ if (first_mark && !is_watched) {
+ conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED;
+ fsnotify_get_sb_watched_objects(sb);
+ } else if (!first_mark && is_watched) {
+ conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED;
+ fsnotify_put_sb_watched_objects(sb);
+ }
}
/*
@@ -213,35 +287,12 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static void fsnotify_put_inode_ref(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
-
- iput(inode);
- if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
- wake_up_var(&sb->s_fsnotify_connectors);
-}
-
-static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
-{
- struct super_block *sb = fsnotify_connector_sb(conn);
-
- if (sb)
- atomic_long_inc(&sb->s_fsnotify_connectors);
-}
-
-static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
-{
- struct super_block *sb = fsnotify_connector_sb(conn);
-
- if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
- wake_up_var(&sb->s_fsnotify_connectors);
-}
-
static void *fsnotify_detach_connector_from_object(
struct fsnotify_mark_connector *conn,
unsigned int *type)
{
+ fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type);
+ struct super_block *sb = fsnotify_connector_sb(conn);
struct inode *inode = NULL;
*type = conn->type;
@@ -261,10 +312,10 @@ static void *fsnotify_detach_connector_from_object(
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
}
- fsnotify_put_sb_connectors(conn);
- rcu_assign_pointer(*(conn->obj), NULL);
+ rcu_assign_pointer(*connp, NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
+ fsnotify_update_sb_watchers(sb, conn);
return inode;
}
@@ -316,6 +367,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
+ struct super_block *sb = fsnotify_connector_sb(conn);
+
+ /* Update watched objects after detaching mark */
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
objp = __fsnotify_recalc_mask(conn);
type = conn->type;
}
@@ -536,8 +592,28 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
return -1;
}
+static int fsnotify_attach_info_to_sb(struct super_block *sb)
+{
+ struct fsnotify_sb_info *sbinfo;
+
+ /* sb info is freed by fsnotify_sb_free() when the sb is destroyed */
+ sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ /*
+ * cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
+ * will observe an initialized structure.
+ */
+ if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) {
+ /* Someone else created sbinfo for us */
+ kfree(sbinfo);
+ }
+ return 0;
+}
+
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int obj_type)
+ void *obj, unsigned int obj_type)
{
struct fsnotify_mark_connector *conn;
@@ -547,10 +623,9 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
conn->flags = 0;
+ conn->prio = 0;
conn->type = obj_type;
- conn->obj = connp;
- conn->flags = 0;
- fsnotify_get_sb_connectors(conn);
+ conn->obj = obj;
/*
* cmpxchg() provides the barrier so that readers of *connp can see
@@ -558,10 +633,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
- fsnotify_put_sb_connectors(conn);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
-
return 0;
}
@@ -598,24 +671,36 @@ out:
* to which group and for which inodes. These marks are ordered according to
* priority, highest number first, and then by the group's location in memory.
*/
-static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
- fsnotify_connp_t *connp,
+static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
unsigned int obj_type, int add_flags)
{
+ struct super_block *sb = fsnotify_object_sb(obj, obj_type);
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
+ fsnotify_connp_t *connp;
int cmp;
int err = 0;
if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
return -EINVAL;
+ /*
+ * Attach the sb info before attaching a connector to any object on sb.
+ * The sb info will remain attached as long as sb lives.
+ */
+ if (!fsnotify_sb_info(sb)) {
+ err = fsnotify_attach_info_to_sb(sb);
+ if (err)
+ return err;
+ }
+
+ connp = fsnotify_object_connp(obj, obj_type);
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, obj_type);
+ err = fsnotify_attach_connector_to_object(connp, obj, obj_type);
if (err)
return err;
goto restart;
@@ -649,6 +734,7 @@ restart:
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
+ fsnotify_update_sb_watchers(sb, conn);
/*
* Since connector is attached to object using cmpxchg() we are
* guaranteed that connector initialization is fully visible by anyone
@@ -667,7 +753,7 @@ out_err:
* event types should be delivered to which group.
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
- fsnotify_connp_t *connp, unsigned int obj_type,
+ void *obj, unsigned int obj_type,
int add_flags)
{
struct fsnotify_group *group = mark->group;
@@ -688,7 +774,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags);
+ ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags);
if (ret)
goto err;
@@ -706,14 +792,14 @@ err:
return ret;
}
-int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
+int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
unsigned int obj_type, int add_flags)
{
int ret;
struct fsnotify_group *group = mark->group;
fsnotify_group_lock(group);
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags);
+ ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags);
fsnotify_group_unlock(group);
return ret;
}
@@ -723,12 +809,16 @@ EXPORT_SYMBOL_GPL(fsnotify_add_mark);
* Given a list of marks, find the mark associated with given group. If found
* take a reference to that mark and return it, else return NULL.
*/
-struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
+struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
struct fsnotify_group *group)
{
+ fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type);
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark;
+ if (!connp)
+ return NULL;
+
conn = fsnotify_grab_connector(connp);
if (!conn)
return NULL;
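
With the connector now tracking a prio and the sb keeping per-priority watched-object counts, hot paths can cheaply skip event setup when no group of sufficient priority is watching. A sketch of the intended use (the helper name appears in the fsnotify_sb_delete() hunk earlier; this caller is hypothetical):

/* Bail out early unless some pre-content group watches this sb. */
static bool want_pre_content_events(struct super_block *sb)
{
	return fsnotify_sb_has_priority_watchers(sb,
						 FSNOTIFY_PRIO_PRE_CONTENT);
}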
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 7aadf5010999..8e6bcdf99770 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -2558,3 +2558,35 @@ undo_insert_range:
goto out;
}
+
+/*
+ * attr_force_nonresident
+ *
+ * Convert the default data attribute into non-resident form.
+ */
+int attr_force_nonresident(struct ntfs_inode *ni)
+{
+ int err;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct mft_inode *mi;
+
+ attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi);
+ if (!attr) {
+ ntfs_bad_inode(&ni->vfs_inode, "no data attribute");
+ return -ENOENT;
+ }
+
+ if (attr->non_res) {
+ /* Already non-resident. */
+ return 0;
+ }
+
+ down_write(&ni->file.run_lock);
+ err = attr_make_nonresident(ni, attr, le, mi,
+ le32_to_cpu(attr->res.data_size),
+ &ni->file.run, &attr, NULL);
+ up_write(&ni->file.run_lock);
+
+ return err;
+}
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index 263635199b60..1937e8e612f8 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -475,6 +475,7 @@ static int ntfs_readdir(struct file *file, struct dir_context *ctx)
vbo = (u64)bit << index_bits;
if (vbo >= i_size) {
ntfs_inode_err(dir, "Looks like your dir is corrupt");
+ ctx->pos = eod;
err = -EINVAL;
goto out;
}
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index b73969e05052..2f903b6ce157 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -578,6 +578,15 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
/* Check new size. */
u8 cluster_bits = sbi->cluster_bits;
+ /* Make sure the file is non-resident. */
+ if (is_resident(ni)) {
+ ni_lock(ni);
+ err = attr_force_nonresident(ni);
+ ni_unlock(ni);
+ if (err)
+ goto out;
+ }
+
/* generic/213: expected -ENOSPC instead of -EFBIG. */
if (!is_supported_holes) {
loff_t to_alloc = new_size - inode_get_bytes(inode);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 7f27382e0ce2..0008670939a4 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -2636,7 +2636,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
goto out1;
}
- pages_disk = kzalloc(npages_disk * sizeof(struct page *), GFP_NOFS);
+ pages_disk = kcalloc(npages_disk, sizeof(*pages_disk), GFP_NOFS);
if (!pages_disk) {
err = -ENOMEM;
goto out2;
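
The kcalloc() conversion is more than cosmetic: the element-count multiplication is checked for overflow and sizeof(*pages_disk) ties the element size to the pointer's type. Approximately, kcalloc(n, size, gfp) behaves like this sketch (not the real implementation):

#include <linux/overflow.h>
#include <linux/slab.h>

static void *kcalloc_sketch(size_t n, size_t size, gfp_t gfp)
{
	size_t bytes;

	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;	/* the product would have wrapped */
	return kzalloc(bytes, gfp);
}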
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 855519713bf7..d7807d255dfe 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -517,7 +517,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
seq_bits -= 1;
}
- if (seq_bits != ra->seq_num_bits)
+ if (seq_bits != le32_to_cpu(ra->seq_num_bits))
return false;
/* The log page data offset and record header length must be quad-aligned. */
@@ -1184,7 +1184,8 @@ out:
static int log_read_rst(struct ntfs_log *log, bool first,
struct restart_info *info)
{
- u32 skip, vbo;
+ u32 skip;
+ u64 vbo;
struct RESTART_HDR *r_page = NULL;
/* Determine which restart area we are looking for. */
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 43796aaa3d97..d0f15bbf78f6 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1534,6 +1534,11 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
goto out1;
}
+ if (data_size <= le64_to_cpu(alloc->nres.data_size)) {
+ /* Reuse index. */
+ goto out;
+ }
+
/* Increase allocation. */
err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
&indx->alloc_run, data_size, &data_size, true,
@@ -1547,6 +1552,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
if (in->name == I30_NAME)
i_size_write(&ni->vfs_inode, data_size);
+out:
*vbn = bit << indx->idx2vbn_bits;
return 0;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index d273eda1cf45..0f1664db94ad 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -37,7 +37,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
bool is_dir;
unsigned long ino = inode->i_ino;
u32 rp_fa = 0, asize, t32;
- u16 roff, rsize, names = 0;
+ u16 roff, rsize, names = 0, links = 0;
const struct ATTR_FILE_NAME *fname = NULL;
const struct INDEX_ROOT *root;
struct REPARSE_DATA_BUFFER rp; // 0x18 bytes
@@ -200,11 +200,12 @@ next_attr:
rsize < SIZEOF_ATTRIBUTE_FILENAME)
goto out;
+ names += 1;
fname = Add2Ptr(attr, roff);
if (fname->type == FILE_NAME_DOS)
goto next_attr;
- names += 1;
+ links += 1;
if (name && name->len == fname->name_len &&
!ntfs_cmp_names_cpu(name, (struct le_str *)&fname->name_len,
NULL, false))
@@ -429,7 +430,7 @@ end_enum:
ni->mi.dirty = true;
}
- set_nlink(inode, names);
+ set_nlink(inode, links);
if (S_ISDIR(mode)) {
ni->std_fa |= FILE_ATTRIBUTE_DIRECTORY;
@@ -576,13 +577,18 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
clear_buffer_uptodate(bh);
if (is_resident(ni)) {
- ni_lock(ni);
- err = attr_data_read_resident(ni, &folio->page);
- ni_unlock(ni);
-
- if (!err)
- set_buffer_uptodate(bh);
+ bh->b_blocknr = RESIDENT_LCN;
bh->b_size = block_size;
+ if (!folio) {
+ err = 0;
+ } else {
+ ni_lock(ni);
+ err = attr_data_read_resident(ni, &folio->page);
+ ni_unlock(ni);
+
+ if (!err)
+ set_buffer_uptodate(bh);
+ }
return err;
}
@@ -1216,11 +1222,10 @@ out:
*
* NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked
*/
-struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry,
- const struct cpu_str *uni, umode_t mode,
- dev_t dev, const char *symname, u32 size,
- struct ntfs_fnd *fnd)
+int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, const struct cpu_str *uni,
+ umode_t mode, dev_t dev, const char *symname, u32 size,
+ struct ntfs_fnd *fnd)
{
int err;
struct super_block *sb = dir->i_sb;
@@ -1245,6 +1250,9 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
struct REPARSE_DATA_BUFFER *rp = NULL;
bool rp_inserted = false;
+ /* Create new files resident; they are converted to non-resident on demand. */
+ const bool new_file_resident = true;
+
if (!fnd)
ni_lock_dir(dir_ni);
@@ -1484,7 +1492,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
attr->size = cpu_to_le32(SIZEOF_RESIDENT);
attr->name_off = SIZEOF_RESIDENT_LE;
attr->res.data_off = SIZEOF_RESIDENT_LE;
- } else if (S_ISREG(mode)) {
+ } else if (!new_file_resident && S_ISREG(mode)) {
/*
* Regular file. Create empty non resident data attribute.
*/
@@ -1727,12 +1735,10 @@ out1:
if (!fnd)
ni_unlock(dir_ni);
- if (err)
- return ERR_PTR(err);
-
- unlock_new_inode(inode);
+ if (!err)
+ unlock_new_inode(inode);
- return inode;
+ return err;
}
int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 084d19d78397..71498421ce60 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -107,12 +107,8 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
static int ntfs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
- struct inode *inode;
-
- inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFREG | mode, 0,
- NULL, 0, NULL);
-
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+ return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFREG | mode, 0,
+ NULL, 0, NULL);
}
/*
@@ -123,12 +119,8 @@ static int ntfs_create(struct mnt_idmap *idmap, struct inode *dir,
static int ntfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct inode *inode;
-
- inode = ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, NULL, 0,
- NULL);
-
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+ return ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, NULL, 0,
+ NULL);
}
/*
@@ -200,15 +192,12 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
u32 size = strlen(symname);
- struct inode *inode;
if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
return -EIO;
- inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0,
- symname, size, NULL);
-
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+ return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0,
+ symname, size, NULL);
}
/*
@@ -217,12 +206,8 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
- struct inode *inode;
-
- inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
- NULL, 0, NULL);
-
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+ return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
+ NULL, 0, NULL);
}
/*
@@ -358,95 +343,6 @@ out:
return err;
}
-/*
- * ntfs_atomic_open
- *
- * inode_operations::atomic_open
- */
-static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
- struct file *file, u32 flags, umode_t mode)
-{
- int err;
- struct inode *inode;
- struct ntfs_fnd *fnd = NULL;
- struct ntfs_inode *ni = ntfs_i(dir);
- struct dentry *d = NULL;
- struct cpu_str *uni = __getname();
- bool locked = false;
-
- if (!uni)
- return -ENOMEM;
-
- err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name,
- dentry->d_name.len, uni, NTFS_NAME_LEN,
- UTF16_HOST_ENDIAN);
- if (err < 0)
- goto out;
-
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- if (IS_POSIXACL(dir)) {
- /*
- * Load in cache current acl to avoid ni_lock(dir):
- * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create ->
- * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock
- */
- struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
-
- if (IS_ERR(p)) {
- err = PTR_ERR(p);
- goto out;
- }
- posix_acl_release(p);
- }
-#endif
-
- if (d_in_lookup(dentry)) {
- ni_lock_dir(ni);
- locked = true;
- fnd = fnd_get();
- if (!fnd) {
- err = -ENOMEM;
- goto out1;
- }
-
- d = d_splice_alias(dir_search_u(dir, uni, fnd), dentry);
- if (IS_ERR(d)) {
- err = PTR_ERR(d);
- d = NULL;
- goto out2;
- }
-
- if (d)
- dentry = d;
- }
-
- if (!(flags & O_CREAT) || d_really_is_positive(dentry)) {
- err = finish_no_open(file, d);
- goto out2;
- }
-
- file->f_mode |= FMODE_CREATED;
-
- /*
- * fnd contains tree's path to insert to.
- * If fnd is not NULL then dir is locked.
- */
- inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni,
- mode, 0, NULL, 0, fnd);
- err = IS_ERR(inode) ? PTR_ERR(inode) :
- finish_open(file, dentry, ntfs_file_open);
- dput(d);
-
-out2:
- fnd_put(fnd);
-out1:
- if (locked)
- ni_unlock(ni);
-out:
- __putname(uni);
- return err;
-}
-
struct dentry *ntfs3_get_parent(struct dentry *child)
{
struct inode *inode = d_inode(child);
@@ -612,7 +508,6 @@ const struct inode_operations ntfs_dir_inode_operations = {
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
- .atomic_open = ntfs_atomic_open,
.fiemap = ntfs_fiemap,
};
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 9c7478150a03..3d6143c7abc0 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -59,7 +59,7 @@ struct GUID {
struct cpu_str {
u8 len;
u8 unused;
- u16 name[10];
+ u16 name[];
};
struct le_str {
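
Turning the fixed u16 name[10] into a flexible array means every cpu_str must now be allocated with an explicit name length; the in-tree callers use __getname() buffers, but a dedicated allocator would look roughly like this (hypothetical helper):

#include <linux/overflow.h>
#include <linux/slab.h>

/* len counts u16 code units, matching cpu_str::len. */
static struct cpu_str *cpu_str_alloc(u8 len, gfp_t gfp)
{
	struct cpu_str *s = kmalloc(struct_size(s, name, len), gfp);

	if (s)
		s->len = len;
	return s;
}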
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index c018fad4c037..f9ed6d2b065d 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -452,6 +452,7 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size);
+int attr_force_nonresident(struct ntfs_inode *ni);
/* Functions from attrlist.c */
void al_destroy(struct ntfs_inode *ni);
@@ -716,11 +717,10 @@ int ntfs_sync_inode(struct inode *inode);
int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
int inode_write_data(struct inode *inode, const void *data, size_t bytes);
-struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry,
- const struct cpu_str *uni, umode_t mode,
- dev_t dev, const char *symname, u32 size,
- struct ntfs_fnd *fnd);
+int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, const struct cpu_str *uni,
+ umode_t mode, dev_t dev, const char *symname, u32 size,
+ struct ntfs_fnd *fnd);
int ntfs_link_inode(struct inode *inode, struct dentry *dentry);
int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry);
void ntfs_evict_inode(struct inode *inode);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 6aa3a9d44df1..6c76503edc20 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -534,16 +534,9 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
if (aoff + asize > used)
return false;
- if (ni && is_attr_indexed(attr)) {
+ if (ni && is_attr_indexed(attr) && attr->type == ATTR_NAME) {
u16 links = le16_to_cpu(ni->mi.mrec->hard_links);
- struct ATTR_FILE_NAME *fname =
- attr->type != ATTR_NAME ?
- NULL :
- resident_data_ex(attr,
- SIZEOF_ATTRIBUTE_FILENAME);
- if (fname && fname->type == FILE_NAME_DOS) {
- /* Do not decrease links count deleting DOS name. */
- } else if (!links) {
+ if (!links) {
/* minor error. Not critical. */
} else {
ni->mi.mrec->hard_links = cpu_to_le16(links - 1);
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index f41e01c5676a..27fbde2701b6 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1861,8 +1861,6 @@ static int __init init_ntfs_fs(void)
{
int err;
- pr_info("ntfs3: Max link count %u\n", NTFS_LINK_MAX);
-
if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL))
pr_info("ntfs3: Enabled Linux POSIX ACLs support\n");
if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER))
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 53e7d1fa036a..73785dece7a7 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -219,8 +219,11 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
if (!ea->name_len)
break;
- if (ea->name_len > ea_size)
+ if (ea->name_len > ea_size) {
+ ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ err = -EINVAL; /* corrupted fs */
break;
+ }
if (buffer) {
/* Check if we can use field ea->name */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 9898c11bdfa1..60e208b01c8d 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string,
__string(name,name)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%s", __get_str(name))
);
@@ -1289,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops,
__entry->dentry = dentry;
__entry->ino = ino;
__entry->d_len = d_len;
- __assign_str(d_name, d_name);
+ __assign_str(d_name);
__entry->para = para;
),
TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
@@ -1425,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr,
__entry->dentry = dentry;
__entry->ino = ino;
__entry->d_len = d_len;
- __assign_str(d_name, d_name);
+ __assign_str(d_name);
__entry->ia_valid = ia_valid;
__entry->ia_mode = ia_mode;
__entry->ia_uid = ia_uid;
@@ -1683,7 +1683,7 @@ TRACE_EVENT(ocfs2_parse_options,
),
TP_fast_assign(
__entry->is_remount = is_remount;
- __assign_str(options, options);
+ __assign_str(options);
),
TP_printk("%d %s", __entry->is_remount, __get_str(options))
);
@@ -1718,8 +1718,8 @@ TRACE_EVENT(ocfs2_initialize_super,
__field(int, cluster_bits)
),
TP_fast_assign(
- __assign_str(label, label);
- __assign_str(uuid_str, uuid_str);
+ __assign_str(label);
+ __assign_str(uuid_str);
__entry->root_dir = root_dir;
__entry->system_dir = system_dir;
__entry->cluster_bits = cluster_bits;
@@ -1746,7 +1746,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
__field(int, credits)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
__entry->meta = meta;
__entry->clusters = clusters;
__entry->credits = credits;
@@ -1770,7 +1770,7 @@ DECLARE_EVENT_CLASS(ocfs2__xattr_find,
),
TP_fast_assign(
__entry->ino = ino;
- __assign_str(name, name);
+ __assign_str(name);
__entry->name_index = name_index;
__entry->hash = hash;
__entry->location = location;
@@ -2019,7 +2019,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper,
__entry->dq_id = dq_id;
__entry->dq_type = dq_type;
__entry->type = type;
- __assign_str(s_id, s_id);
+ __assign_str(s_id);
),
TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
__entry->type, __get_str(s_id))
@@ -2060,7 +2060,7 @@ TRACE_EVENT(ocfs2_dx_dir_search,
TP_fast_assign(
__entry->ino = ino;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->major_hash = major_hash;
__entry->minor_hash = minor_hash;
__entry->blkno = blkno;
@@ -2088,7 +2088,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk,
),
TP_fast_assign(
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->blkno = blkno;
__entry->dir = dir;
),
@@ -2107,7 +2107,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry,
TP_fast_assign(
__entry->dir = dir;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%llu %.*s", __entry->dir,
__entry->namelen, __get_str(name))
@@ -2135,7 +2135,7 @@ TRACE_EVENT(ocfs2_dx_dir_index_root_block,
__entry->major_hash = major_hash;
__entry->minor_hash = minor_hash;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->num_used = num_used;
),
TP_printk("%llu %x %x %.*s %u", __entry->dir,
@@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
__entry->dir = dir;
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->dir_blkno = dir_blkno;
__entry->extra = extra;
),
@@ -2217,7 +2217,7 @@ TRACE_EVENT(ocfs2_mknod,
__entry->dir = dir;
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->dir_blkno = dir_blkno;
__entry->dev = dev;
__entry->mode = mode;
@@ -2241,9 +2241,9 @@ TRACE_EVENT(ocfs2_link,
TP_fast_assign(
__entry->ino = ino;
__entry->old_len = old_len;
- __assign_str(old_name, old_name);
+ __assign_str(old_name);
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%llu %.*s %.*s", __entry->ino,
__entry->old_len, __get_str(old_name),
@@ -2279,9 +2279,9 @@ TRACE_EVENT(ocfs2_rename,
__entry->new_dir = new_dir;
__entry->new_dentry = new_dentry;
__entry->old_len = old_len;
- __assign_str(old_name, old_name);
+ __assign_str(old_name);
__entry->new_len = new_len;
- __assign_str(new_name, new_name);
+ __assign_str(new_name);
),
TP_printk("%p %p %p %p %.*s %.*s",
__entry->old_dir, __entry->old_dentry,
@@ -2301,7 +2301,7 @@ TRACE_EVENT(ocfs2_rename_target_exists,
),
TP_fast_assign(
__entry->new_len = new_len;
- __assign_str(new_name, new_name);
+ __assign_str(new_name);
),
TP_printk("%.*s", __entry->new_len, __get_str(new_name))
);
@@ -2344,7 +2344,7 @@ TRACE_EVENT(ocfs2_symlink_begin,
__entry->dentry = dentry;
__entry->symname = symname;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
__entry->symname, __entry->len, __get_str(name))
@@ -2360,7 +2360,7 @@ TRACE_EVENT(ocfs2_blkno_stringify,
),
TP_fast_assign(
__entry->blkno = blkno;
- __assign_str(name, name);
+ __assign_str(name);
__entry->namelen = namelen;
),
TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
@@ -2381,7 +2381,7 @@ TRACE_EVENT(ocfs2_orphan_del,
),
TP_fast_assign(
__entry->dir = dir;
- __assign_str(name, name);
+ __assign_str(name);
__entry->namelen = namelen;
),
TP_printk("%llu %s %d", __entry->dir, __get_str(name),
@@ -2403,7 +2403,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate,
TP_fast_assign(
__entry->dentry = dentry;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
);
@@ -2420,7 +2420,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate_negative,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->pgen = pgen;
__entry->gen = gen;
),
@@ -2445,7 +2445,7 @@ TRACE_EVENT(ocfs2_find_local_alias,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%.*s", __entry->len, __get_str(name))
);
@@ -2462,7 +2462,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->parent = parent;
__entry->fsdata = fsdata;
),
@@ -2480,7 +2480,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found,
__field(unsigned long long, ino)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
__entry->parent = parent;
__entry->ino = ino;
),
@@ -2527,7 +2527,7 @@ TRACE_EVENT(ocfs2_get_parent,
TP_fast_assign(
__entry->child = child;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->ino = ino;
),
TP_printk("%p %.*s %llu", __entry->child, __entry->len,
@@ -2551,7 +2551,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin,
TP_fast_assign(
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->fh = fh;
__entry->len = len;
__entry->connectable = connectable;
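
The one-argument __assign_str() works because the tracepoint macros recover the source expression from the matching __string() declaration in TP_STRUCT__entry, so repeating it in TP_fast_assign was redundant. The pattern, on a hypothetical event inside a trace header:

TRACE_EVENT(demo_event,
	TP_PROTO(const char *name),
	TP_ARGS(name),
	TP_STRUCT__entry(
		__string(name, name)	/* source expression captured here */
	),
	TP_fast_assign(
		__assign_str(name);	/* only the field name is repeated */
	),
	TP_printk("%s", __get_str(name))
);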
diff --git a/fs/open.c b/fs/open.c
index ee8460c83c77..89cafb572061 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -902,10 +902,10 @@ cleanup_inode:
}
static int do_dentry_open(struct file *f,
- struct inode *inode,
int (*open)(struct inode *, struct file *))
{
static const struct file_operations empty_fops = {};
+ struct inode *inode = f->f_path.dentry->d_inode;
int error;
path_get(&f->f_path);
@@ -1047,7 +1047,7 @@ int finish_open(struct file *file, struct dentry *dentry,
BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
file->f_path.dentry = dentry;
- return do_dentry_open(file, d_backing_inode(dentry), open);
+ return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);
@@ -1086,7 +1086,7 @@ EXPORT_SYMBOL(file_path);
int vfs_open(const struct path *path, struct file *file)
{
file->f_path = *path;
- return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
+ return do_dentry_open(file, NULL);
}
struct file *dentry_open(const struct path *path, int flags,
@@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(dentry_create);
* kernel_file_open - open a file for kernel internal use
* @path: path of the file to open
* @flags: open flags
- * @inode: the inode
* @cred: credentials for open
*
* Open a file for use by in-kernel consumers. The file is not accounted
@@ -1165,7 +1164,7 @@ EXPORT_SYMBOL(dentry_create);
* Return: Opened file on success, an error pointer on failure.
*/
struct file *kernel_file_open(const struct path *path, int flags,
- struct inode *inode, const struct cred *cred)
+ const struct cred *cred)
{
struct file *f;
int error;
@@ -1175,7 +1174,7 @@ struct file *kernel_file_open(const struct path *path, int flags,
return f;
f->f_path = *path;
- error = do_dentry_open(f, inode, NULL);
+ error = do_dentry_open(f, NULL);
if (error) {
fput(f);
f = ERR_PTR(error);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 0f8b4a719237..116f542442dd 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -14,6 +14,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/atomic.h>
#include <linux/ratelimit.h>
+#include <linux/backing-file.h>
#include "overlayfs.h"
static unsigned short ovl_redirect_max = 256;
@@ -260,14 +261,13 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
* may not use to instantiate the new dentry.
*/
static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
- struct dentry *newdentry, bool hardlink)
+ struct dentry *newdentry, bool hardlink, struct file *tmpfile)
{
struct ovl_inode_params oip = {
.upperdentry = newdentry,
.newinode = inode,
};
- ovl_dir_modified(dentry->d_parent, false);
ovl_dentry_set_upper_alias(dentry);
ovl_dentry_init_reval(dentry, newdentry, NULL);
@@ -295,6 +295,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
inc_nlink(inode);
}
+ if (tmpfile)
+ d_mark_tmpfile(tmpfile, inode);
+
d_instantiate(dentry, inode);
if (inode != oip.newinode) {
pr_warn_ratelimited("newly created inode found in cache (%pd2)\n",
@@ -327,9 +330,6 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry;
int err;
- if (!attr->hardlink && !IS_POSIXACL(udir))
- attr->mode &= ~current_umask();
-
inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = ovl_create_real(ofs, udir,
ovl_lookup_upper(ofs, dentry->d_name.name,
@@ -345,7 +345,8 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
ovl_set_opaque(dentry, newdentry);
}
- err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink);
+ ovl_dir_modified(dentry->d_parent, false);
+ err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink, NULL);
if (err)
goto out_cleanup;
out_unlock:
@@ -529,7 +530,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
- err = ovl_instantiate(dentry, inode, newdentry, hardlink);
+ ovl_dir_modified(dentry->d_parent, false);
+ err = ovl_instantiate(dentry, inode, newdentry, hardlink, NULL);
if (err) {
ovl_cleanup(ofs, udir, newdentry);
dput(newdentry);
@@ -551,12 +553,35 @@ out_cleanup:
goto out_dput;
}
+static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode,
+ umode_t mode, const struct cred *old_cred)
+{
+ int err;
+ struct cred *override_cred;
+
+ override_cred = prepare_creds();
+ if (!override_cred)
+ return -ENOMEM;
+
+ override_cred->fsuid = inode->i_uid;
+ override_cred->fsgid = inode->i_gid;
+ err = security_dentry_create_files_as(dentry, mode, &dentry->d_name,
+ old_cred, override_cred);
+ if (err) {
+ put_cred(override_cred);
+ return err;
+ }
+ put_cred(override_creds(override_cred));
+ put_cred(override_cred);
+
+ return 0;
+}
+
static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
struct ovl_cattr *attr, bool origin)
{
int err;
const struct cred *old_cred;
- struct cred *override_cred;
struct dentry *parent = dentry->d_parent;
old_cred = ovl_override_creds(dentry->d_sb);
@@ -572,10 +597,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
}
if (!attr->hardlink) {
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (!override_cred)
- goto out_revert_creds;
/*
* In the creation cases(create, mkdir, mknod, symlink),
* ovl should transfer current's fs{u,g}id to underlying
@@ -589,17 +610,9 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
* create a new inode, so just use the ovl mounter's
* fs{u,g}id.
*/
- override_cred->fsuid = inode->i_uid;
- override_cred->fsgid = inode->i_gid;
- err = security_dentry_create_files_as(dentry,
- attr->mode, &dentry->d_name, old_cred,
- override_cred);
- if (err) {
- put_cred(override_cred);
+ err = ovl_setup_cred_for_create(dentry, inode, attr->mode, old_cred);
+ if (err)
goto out_revert_creds;
- }
- put_cred(override_creds(override_cred));
- put_cred(override_cred);
}
if (!ovl_dentry_is_whiteout(dentry))
@@ -1290,6 +1303,100 @@ out:
return err;
}
+static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
+ struct inode *inode, umode_t mode)
+{
+ const struct cred *old_cred;
+ struct path realparentpath;
+ struct file *realfile;
+ struct dentry *newdentry;
+ /* It's okay to set O_NOATIME, since the owner will be current fsuid */
+ int flags = file->f_flags | OVL_OPEN_FLAGS;
+ int err;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ return err;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
+ if (err)
+ goto out_revert_creds;
+
+ ovl_path_upper(dentry->d_parent, &realparentpath);
+ realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath,
+ mode, current_cred());
+ err = PTR_ERR_OR_ZERO(realfile);
+ pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err);
+ if (err)
+ goto out_revert_creds;
+
+ /* ovl_instantiate() consumes the newdentry reference on success */
+ newdentry = dget(realfile->f_path.dentry);
+ err = ovl_instantiate(dentry, inode, newdentry, false, file);
+ if (!err) {
+ file->private_data = realfile;
+ } else {
+ dput(newdentry);
+ fput(realfile);
+ }
+out_revert_creds:
+ revert_creds(old_cred);
+ return err;
+}
+
+static int ovl_dummy_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
+ struct file *file, umode_t mode)
+{
+ int err;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode;
+
+ if (!OVL_FS(dentry->d_sb)->tmpfile)
+ return -EOPNOTSUPP;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ return err;
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode, 0);
+ if (!inode)
+ goto drop_write;
+
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+ err = ovl_create_tmpfile(file, dentry, inode, inode->i_mode);
+ if (err)
+ goto put_inode;
+
+ /*
+ * Check if the preallocated inode was actually used. Having something
+ * else assigned to the dentry shouldn't happen as that would indicate
+ * that the backing tmpfile "leaked" out of overlayfs.
+ */
+ err = -EIO;
+ if (WARN_ON(inode != d_inode(dentry)))
+ goto put_realfile;
+
+ /* inode reference was transferred to dentry */
+ inode = NULL;
+ err = finish_open(file, dentry, ovl_dummy_open);
+put_realfile:
+ /* Without FMODE_OPENED ->release() won't be called on @file */
+ if (!(file->f_mode & FMODE_OPENED))
+ fput(file->private_data);
+put_inode:
+ iput(inode);
+drop_write:
+ ovl_drop_write(dentry);
+ return err;
+}
+
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
@@ -1310,4 +1417,5 @@ const struct inode_operations ovl_dir_inode_operations = {
.update_time = ovl_update_time,
.fileattr_get = ovl_fileattr_get,
.fileattr_set = ovl_fileattr_set,
+ .tmpfile = ovl_tmpfile,
};
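
With ->tmpfile wired up, plain O_TMPFILE opens now work on overlayfs. A small userspace sketch (illustrative mount point, minimal error handling):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Unnamed, unlinked file backed by a tmpfile in the upper layer. */
	int fd = open("/mnt/overlay", O_TMPFILE | O_RDWR, 0600);
	char fdpath[64];

	if (fd < 0)
		return 1;
	write(fd, "data", 4);

	/* Optionally materialize it with a name later. */
	snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", fd);
	linkat(AT_FDCWD, fdpath, AT_FDCWD, "/mnt/overlay/file",
	       AT_SYMLINK_FOLLOW);
	return 0;
}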
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 05536964d37f..1a411cae57ed 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -24,9 +24,6 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
-/* No atime modification on underlying */
-#define OVL_OPEN_FLAGS (O_NOATIME)
-
static struct file *ovl_open_realfile(const struct file *file,
const struct path *realpath)
{
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c63b31a460be..35fd3e3e1778 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -8,7 +8,6 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
-#include <linux/posix_acl.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index ee949f3e7c77..0bfe35da4b7b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -175,6 +175,9 @@ static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy)
return (int)metacopy->len - OVL_METACOPY_MIN_SIZE;
}
+/* No atime modification on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME)
+
extern const char *const ovl_xattr_table[][2];
static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
{
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index d285d1d7baad..edc9216f6e27 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1376,7 +1376,7 @@ int ovl_ensure_verity_loaded(struct path *datapath)
* If this inode was not yet opened, the verity info hasn't been
* loaded yet, so we need to do that here to force it into memory.
*/
- filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred());
+ filp = kernel_file_open(datapath, O_RDONLY, current_cred());
if (IS_ERR(filp))
return PTR_ERR(filp);
fput(filp);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index a63d5d24aa02..dbb9d854d1c5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -169,6 +169,24 @@ static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return -EOPNOTSUPP;
}
+
+/*
+ * User space expects pidfs inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
@@ -176,6 +194,7 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
return 0;
}
@@ -199,12 +218,13 @@ static const struct super_operations pidfs_sops = {
.statfs = simple_statfs,
};
+/*
+ * 'lsof' has knowledge of our historical anon_inode use, and expects
+ * the pidfs dentry name to start with 'anon_inode'.
+ */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- struct inode *inode = d_inode(dentry);
- struct pid *pid = inode->i_private;
-
- return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino);
+ return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
static const struct dentry_operations pidfs_dentry_operations = {
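
Both pidfs tweaks are observable from userspace: fstat() on a pidfd reports no file-type bits, and the /proc/self/fd link text goes back to the anon_inode prefix that lsof matches on. A quick check using the raw syscall (no glibc wrapper assumed):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
	struct stat st;
	char link[64], buf[64];
	ssize_t n;

	fstat(pidfd, &st);
	printf("type bits: %o\n", st.st_mode & S_IFMT);	/* expect 0 */

	snprintf(link, sizeof(link), "/proc/self/fd/%d", pidfd);
	n = readlink(link, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		puts(buf);	/* expect "anon_inode:[pidfd]" */
	}
	return 0;
}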
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index f4b1c8b42a51..586bbc84ca04 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -39,10 +39,8 @@ static int seq_show(struct seq_file *m, void *v)
spin_lock(&files->file_lock);
file = files_lookup_fd_locked(files, fd);
if (file) {
- struct fdtable *fdt = files_fdtable(files);
-
f_flags = file->f_flags;
- if (close_on_exec(fd, fdt))
+ if (close_on_exec(fd, files))
f_flags |= O_CLOEXEC;
get_file(file);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e5a5f015ff03..f8d35f993fe5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -970,12 +970,17 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
break;
/* Case 1 and 2 above */
- if (vma->vm_start >= last_vma_end)
+ if (vma->vm_start >= last_vma_end) {
+ smap_gather_stats(vma, &mss, 0);
+ last_vma_end = vma->vm_end;
continue;
+ }
/* Case 4 above */
- if (vma->vm_end > last_vma_end)
+ if (vma->vm_end > last_vma_end) {
smap_gather_stats(vma, &mss, last_vma_end);
+ last_vma_end = vma->vm_end;
+ }
}
} for_each_vma(vmi, vma);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index dacbee455c03..627eb2f72ef3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -410,7 +410,7 @@ static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
if (dquot)
/* Even in case of error we have to continue */
ret = mark_dquot_dirty(dquot);
- if (!err)
+ if (!err && ret < 0)
err = ret;
}
return err;
@@ -1737,7 +1737,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
if (reserve)
goto out_flush_warn;
- mark_all_dquot_dirty(dquots);
+ ret = mark_all_dquot_dirty(dquots);
out_flush_warn:
srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
@@ -1786,7 +1786,7 @@ int dquot_alloc_inode(struct inode *inode)
warn_put_all:
spin_unlock(&inode->i_lock);
if (ret == 0)
- mark_all_dquot_dirty(dquots);
+ ret = mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
return ret;
@@ -1990,7 +1990,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
qsize_t inode_usage = 1;
struct dquot __rcu **dquots;
struct dquot *transfer_from[MAXQUOTAS] = {};
- int cnt, index, ret = 0;
+ int cnt, index, ret = 0, err;
char is_valid[MAXQUOTAS] = {};
struct dquot_warn warn_to[MAXQUOTAS];
struct dquot_warn warn_from_inodes[MAXQUOTAS];
@@ -2087,8 +2087,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
* mark_all_dquot_dirty().
*/
index = srcu_read_lock(&dquot_srcu);
- mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
- mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+ err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
+ if (err < 0)
+ ret = err;
+ err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+ if (err < 0)
+ ret = err;
srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn_to);
@@ -2098,7 +2102,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (is_valid[cnt])
transfer_to[cnt] = transfer_from[cnt];
- return 0;
+ return ret;
over_quota:
/* Back out changes we already did */
for (cnt--; cnt >= 0; cnt--) {
@@ -2726,6 +2730,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
struct mem_dqblk *dm = &dquot->dq_dqb;
int check_blim = 0, check_ilim = 0;
struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+ int ret;
if (di->d_fieldmask & ~VFS_QC_MASK)
return -EINVAL;
@@ -2807,8 +2812,9 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
else
set_bit(DQ_FAKE_B, &dquot->dq_flags);
spin_unlock(&dquot->dq_dqb_lock);
- mark_dquot_dirty(dquot);
-
+ ret = mark_dquot_dirty(dquot);
+ if (ret < 0)
+ return ret;
return 0;
}
@@ -3016,11 +3022,10 @@ static int __init dquot_init(void)
if (!dquot_hash)
panic("Cannot create dquot hash table");
- for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
- ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
- if (ret)
- panic("Cannot create dquot stat counters");
- }
+ ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL,
+ _DQST_DQSTAT_LAST);
+ if (ret)
+ panic("Cannot create dquot stat counters");
/* Find power-of-two hlist_heads which can fit into allocation */
nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
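
percpu_counter_init_many() sets up an array of counters with a single per-CPU allocation and a single failure point, replacing the init loop; percpu_counter_destroy_many() is the matching teardown. A minimal sketch:

#include <linux/percpu_counter.h>

#define NR_STATS 8

static struct percpu_counter stats[NR_STATS];

static int stats_init(void)
{
	/* One backing per-CPU allocation covers all the counters. */
	return percpu_counter_init_many(stats, 0, GFP_KERNEL, NR_STATS);
}

static void stats_exit(void)
{
	percpu_counter_destroy_many(stats, NR_STATS);
}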
diff --git a/fs/read_write.c b/fs/read_write.c
index 2115d1f40bd5..ef6339391351 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -392,7 +392,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
kiocb.ki_pos = (ppos ? *ppos : 0);
iov_iter_ubuf(&iter, ITER_DEST, buf, len);
- ret = call_read_iter(filp, &kiocb, &iter);
+ ret = filp->f_op->read_iter(&kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
if (ppos)
*ppos = kiocb.ki_pos;
@@ -494,7 +494,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
kiocb.ki_pos = (ppos ? *ppos : 0);
iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
- ret = call_write_iter(filp, &kiocb, &iter);
+ ret = filp->f_op->write_iter(&kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
if (ret > 0 && ppos)
*ppos = kiocb.ki_pos;
@@ -736,9 +736,9 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
kiocb.ki_pos = (ppos ? *ppos : 0);
if (type == READ)
- ret = call_read_iter(filp, &kiocb, iter);
+ ret = filp->f_op->read_iter(&kiocb, iter);
else
- ret = call_write_iter(filp, &kiocb, iter);
+ ret = filp->f_op->write_iter(&kiocb, iter);
BUG_ON(ret == -EIOCBQUEUED);
if (ppos)
*ppos = kiocb.ki_pos;
@@ -799,7 +799,7 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
if (ret < 0)
return ret;
- ret = call_read_iter(file, iocb, iter);
+ ret = file->f_op->read_iter(iocb, iter);
out:
if (ret >= 0)
fsnotify_access(file);
@@ -860,7 +860,7 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
return ret;
kiocb_start_write(iocb);
- ret = call_write_iter(file, iocb, iter);
+ ret = file->f_op->write_iter(iocb, iter);
if (ret != -EIOCBQUEUED)
kiocb_end_write(iocb);
if (ret > 0)
@@ -1667,6 +1667,7 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
return 0;
}
+EXPORT_SYMBOL_GPL(generic_write_check_limits);
/* Like generic_write_checks(), but takes size of write instead of iter. */
int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
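
A note on the read_write.c hunks: call_read_iter() and call_write_iter() were one-line wrappers in include/linux/fs.h, so replacing them with direct ->read_iter()/->write_iter() calls is behavior-preserving. The wrapper being bypassed looked like this (sketch of the fs.h definition):

    static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
                                         struct iov_iter *iter)
    {
            return file->f_op->read_iter(kio, iter);
    }
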
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index e2f7a264e3ff..11e9ecf24b63 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -102,19 +102,9 @@ that start on a node aligned boundary (there are reasons to want to node
align files), and he invented and implemented indirect items and
unformatted nodes as the solution.
-Konstantin Shvachko, with the help of the Russian version of a VC,
-tried to put me in a position where I was forced into giving control
-of the project to him. (Fortunately, as the person paying the money
-for all salaries from my dayjob I owned all copyrights, and you can't
-really force takeovers of sole proprietorships.) This was something
-curious, because he never really understood the value of our project,
-why we should do what we do, or why innovation was possible in
-general, but he was sure that he ought to be controlling it. Every
-innovation had to be forced past him while he was with us. He added
-two years to the time required to complete reiserfs, and was a net
-loss for me. Mikhail Gilula was a brilliant innovator who also left
-in a destructive way that erased the value of his contributions, and
-that he was shown much generosity just makes it more painful.
+Konstantin Shvachko took part in the project's early days.
+
+Mikhail Gilula was a brilliant innovator who was shown much generosity.
Grigory Zaigralin was an extremely effective system administrator for
our group.
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1d825459ee6e..c1daedc50f4c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2503,8 +2503,8 @@ out:
* start/recovery path as __block_write_full_folio, along with special
* code to handle reiserfs tails.
*/
-static int reiserfs_write_full_folio(struct folio *folio,
- struct writeback_control *wbc)
+static int reiserfs_write_folio(struct folio *folio,
+ struct writeback_control *wbc, void *data)
{
struct inode *inode = folio->mapping->host;
unsigned long end_index = inode->i_size >> PAGE_SHIFT;
@@ -2721,12 +2721,11 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio)
return block_read_full_folio(folio, reiserfs_get_block);
}
-static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
+static int reiserfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = folio->mapping->host;
- reiserfs_wait_on_write_block(inode->i_sb);
- return reiserfs_write_full_folio(folio, wbc);
+ reiserfs_wait_on_write_block(mapping->host->i_sb);
+ return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL);
}
static void reiserfs_truncate_failed_write(struct inode *inode)
@@ -3405,7 +3404,7 @@ out:
}
const struct address_space_operations reiserfs_address_space_operations = {
- .writepage = reiserfs_writepage,
+ .writepages = reiserfs_writepages,
.read_folio = reiserfs_read_folio,
.readahead = reiserfs_readahead,
.release_folio = reiserfs_release_folio,
@@ -3415,4 +3414,5 @@ const struct address_space_operations reiserfs_address_space_operations = {
.bmap = reiserfs_aop_bmap,
.direct_IO = reiserfs_direct_IO,
.dirty_folio = reiserfs_dirty_folio,
+ .migrate_folio = buffer_migrate_folio,
};
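
The reiserfs/inode.c conversion follows the standard recipe for retiring a ->writepage method: implement ->writepages on top of write_cache_pages(), whose writepage_t callback receives a folio plus the opaque data pointer passed as the last argument. A minimal sketch of that shape (demo_* names are placeholders):

    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    /* Invoked once for each dirty folio chosen by write_cache_pages(). */
    static int demo_write_folio(struct folio *folio,
                                struct writeback_control *wbc, void *data)
    {
            /* write the folio back; it must be unlocked before returning */
            return 0;
    }

    static int demo_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
    {
            return write_cache_pages(mapping, wbc, demo_write_folio, NULL);
    }
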
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e539ccd39e1e..e477ee0ff35d 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2626,8 +2626,7 @@ static int journal_init_dev(struct super_block *super,
MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
- set_blocksize(file_bdev(journal->j_bdev_file),
- super->s_blocksize);
+ set_blocksize(journal->j_bdev_file, super->s_blocksize);
return 0;
}
@@ -2643,7 +2642,7 @@ static int journal_init_dev(struct super_block *super,
return result;
}
- set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize);
+ set_blocksize(journal->j_bdev_file, super->s_blocksize);
reiserfs_info(super,
"journal_init_dev: journal device: %pg\n",
file_bdev(journal->j_bdev_file));
diff --git a/fs/remap_range.c b/fs/remap_range.c
index de07f978ce3e..28246dfc8485 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -99,8 +99,7 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
return 0;
}
-static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
- bool write)
+int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write)
{
int mask = write ? MAY_WRITE : MAY_READ;
loff_t tmp;
@@ -118,6 +117,7 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
return fsnotify_file_area_perm(file, mask, &pos, len);
}
+EXPORT_SYMBOL_GPL(remap_verify_area);
/*
* Ensure that we don't remap a partial EOF block in the middle of something
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 4a5614442dbf..ec7b2da2477a 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -282,14 +282,10 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
- return ufd;
+ return PTR_ERR(file);
}
file->f_mode |= FMODE_NOWAIT;
- /*
- * When we call this, the initialization must be complete, since
- * anon_inode_getfd() will install the fd.
- */
fd_install(ufd, file);
} else {
struct fd f = fdget(ufd);
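
The signalfd fix corrects a classic error-path bug: on anon_inode_getfile() failure the function returned ufd, a valid descriptor number, instead of the error encoded in the file pointer. The canonical reserve/create/install pairing, as a hedged standalone sketch:

    #include <linux/anon_inodes.h>
    #include <linux/err.h>
    #include <linux/file.h>

    static int demo_getfd(const struct file_operations *fops, void *ctx,
                          int flags)
    {
            struct file *file;
            int fd;

            fd = get_unused_fd_flags(flags);
            if (fd < 0)
                    return fd;

            file = anon_inode_getfile("[demo]", fops, ctx, flags);
            if (IS_ERR(file)) {
                    put_unused_fd(fd);      /* release the reserved number */
                    return PTR_ERR(file);   /* not fd: fd is non-negative */
            }

            fd_install(fd, file);           /* publish; no failure after this */
            return fd;
    }
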
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index ec5b639f421a..bb86fc0641d8 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode)
static void
cifs_evict_inode(struct inode *inode)
{
+ netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
if (inode->i_state & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
@@ -1226,7 +1227,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
struct cifsFileInfo *smb_file_src = src_file->private_data;
struct cifsFileInfo *smb_file_target = dst_file->private_data;
struct cifs_tcon *target_tcon, *src_tcon;
- unsigned long long destend, fstart, fend, new_size;
+ unsigned long long destend, fstart, fend, old_size, new_size;
unsigned int xid;
int rc;
@@ -1293,6 +1294,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
goto unlock;
if (fend > target_cifsi->netfs.zero_point)
target_cifsi->netfs.zero_point = fend + 1;
+ old_size = target_cifsi->netfs.remote_i_size;
/* Discard all the folios that overlap the destination region. */
cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend);
@@ -1305,9 +1307,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
if (target_tcon->ses->server->ops->duplicate_extents) {
rc = target_tcon->ses->server->ops->duplicate_extents(xid,
smb_file_src, smb_file_target, off, len, destoff);
- if (rc == 0 && new_size > i_size_read(target_inode)) {
+ if (rc == 0 && new_size > old_size) {
truncate_setsize(target_inode, new_size);
- netfs_resize_file(&target_cifsi->netfs, new_size, true);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 87310f05d397..62d5fee3e5eb 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -147,6 +147,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 48
-#define CIFS_VERSION "2.48"
+#define SMB3_PRODUCT_BUILD 49
+#define CIFS_VERSION "2.49"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9d38294a7e68..9d5c2440abfc 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -3189,6 +3189,28 @@ static void cifs_swap_deactivate(struct file *file)
/* do we need to unpin (or unlock) the file */
}
+/**
+ * cifs_swap_rw - SMB3 address space operation for swap I/O
+ * @iocb: target I/O control block
+ * @iter: I/O buffer
+ *
+ * Perform IO to the swap-file. This is much like direct IO.
+ */
+static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
+{
+ ssize_t ret;
+
+ WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE);
+
+ if (iov_iter_rw(iter) == READ)
+ ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+ else
+ ret = netfs_unbuffered_write_iter_locked(iocb, iter, NULL);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
const struct address_space_operations cifs_addr_ops = {
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
@@ -3204,6 +3226,7 @@ const struct address_space_operations cifs_addr_ops = {
*/
.swap_activate = cifs_swap_activate,
.swap_deactivate = cifs_swap_deactivate,
+ .swap_rw = cifs_swap_rw,
};
/*
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index ef18cd30f66c..4ce6c3121a7e 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2028,6 +2028,7 @@ smb2_duplicate_extents(const unsigned int xid,
* size will be queried on next revalidate, but it is important
* to make sure that file's cached size is updated immediately
*/
+ netfs_resize_file(netfs_inode(inode), dest_off + len, true);
cifs_setsize(inode, dest_off + len);
}
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
@@ -3636,6 +3637,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
if (rc < 0)
goto out_2;
+ cifsi->netfs.zero_point = new_eof;
rc = smb3_zero_data(file, tcon, off, len, xid);
if (rc < 0)
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index af97389e983e..36d47ce59631 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -518,7 +518,7 @@ DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __assign_str(path, full_path);
+ __assign_str(path);
),
TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
__entry->xid, __entry->sesid, __entry->tid,
@@ -762,7 +762,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class,
),
TP_fast_assign(
__entry->xid = xid;
- __assign_str(func_name, func_name);
+ __assign_str(func_name);
__entry->rc = rc;
),
TP_printk("\t%s: xid=%u rc=%d",
@@ -815,7 +815,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class,
),
TP_fast_assign(
__entry->xid = xid;
- __assign_str(func_name, func_name);
+ __assign_str(func_name);
),
TP_printk("\t%s: xid=%u",
__get_str(func_name), __entry->xid)
@@ -852,7 +852,7 @@ DECLARE_EVENT_CLASS(smb3_tcon_class,
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __assign_str(name, unc_name);
+ __assign_str(name);
__entry->rc = rc;
),
TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
@@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_open_enter_class,
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __assign_str(path, full_path);
+ __assign_str(path);
__entry->create_options = create_options;
__entry->desired_access = desired_access;
),
@@ -1098,7 +1098,7 @@ DECLARE_EVENT_CLASS(smb3_connect_class,
__entry->conn_id = conn_id;
pss = (struct sockaddr_storage *)__entry->dst_addr;
*pss = *dst_addr;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
),
TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc",
__entry->conn_id,
@@ -1134,7 +1134,7 @@ DECLARE_EVENT_CLASS(smb3_connect_err_class,
__entry->rc = rc;
pss = (struct sockaddr_storage *)__entry->dst_addr;
*pss = *dst_addr;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
),
TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc",
__entry->rc,
@@ -1166,7 +1166,7 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
),
TP_printk("conn_id=0x%llx server=%s current_mid=%llu",
__entry->conn_id,
@@ -1255,7 +1255,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __assign_str(hostname, hostname);
+ __assign_str(hostname);
__entry->credits = credits;
__entry->credits_to_add = credits_to_add;
__entry->in_flight = in_flight;
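
The trace.h churn tracks a tracing-core change: __assign_str() now takes only the destination field, because the source string is already recorded by the matching __string() declaration in TP_STRUCT__entry, so naming it twice invited mismatches. Skeleton of the updated idiom (hedged, abbreviated):

    TRACE_EVENT(demo_connect,
            TP_PROTO(const char *hostname),
            TP_ARGS(hostname),
            TP_STRUCT__entry(
                    __string(hostname, hostname)    /* source captured here */
            ),
            TP_fast_assign(
                    __assign_str(hostname);         /* single-argument form */
            ),
            TP_printk("host=%s", __get_str(hostname))
    );
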
diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c
index a2f0a2edceb8..e0a6b758094f 100644
--- a/fs/smb/server/mgmt/share_config.c
+++ b/fs/smb/server/mgmt/share_config.c
@@ -165,8 +165,12 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
share->path = kstrndup(ksmbd_share_config_path(resp), path_len,
GFP_KERNEL);
- if (share->path)
+ if (share->path) {
share->path_sz = strlen(share->path);
+ while (share->path_sz > 1 &&
+ share->path[share->path_sz - 1] == '/')
+ share->path[--share->path_sz] = '\0';
+ }
share->create_mask = resp->create_mask;
share->directory_mask = resp->directory_mask;
share->force_create_mode = resp->force_create_mode;
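
The share_config.c hunk canonicalizes the share path by trimming trailing slashes while keeping a bare "/" root share intact, so "/srv/share//" is stored as "/srv/share". The same logic as a standalone helper (illustrative name):

    /* Trim trailing '/' characters in place; a lone "/" is preserved. */
    static size_t demo_trim_trailing_slashes(char *path, size_t len)
    {
            while (len > 1 && path[len - 1] == '/')
                    path[--len] = '\0';
            return len;
    }
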
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index b9d9116fc2b3..a8f52c4ebbda 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -610,19 +610,24 @@ static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level)
if (opinfo->op_state == OPLOCK_CLOSING)
return -ENOENT;
else if (opinfo->level <= req_op_level) {
- if (opinfo->is_lease &&
- opinfo->o_lease->state !=
- (SMB2_LEASE_HANDLE_CACHING_LE |
- SMB2_LEASE_READ_CACHING_LE))
+ if (opinfo->is_lease == false)
+ return 1;
+
+ if (opinfo->o_lease->state !=
+ (SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_READ_CACHING_LE))
return 1;
}
}
if (opinfo->level <= req_op_level) {
- if (opinfo->is_lease &&
- opinfo->o_lease->state !=
- (SMB2_LEASE_HANDLE_CACHING_LE |
- SMB2_LEASE_READ_CACHING_LE)) {
+ if (opinfo->is_lease == false) {
+ wake_up_oplock_break(opinfo);
+ return 1;
+ }
+ if (opinfo->o_lease->state !=
+ (SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_READ_CACHING_LE)) {
wake_up_oplock_break(opinfo);
return 1;
}
diff --git a/fs/splice.c b/fs/splice.c
index 218e24b1ac40..60aed8de21f8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -362,7 +362,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
- ret = call_read_iter(in, &kiocb, &to);
+ ret = in->f_op->read_iter(&kiocb, &to);
if (ret > 0) {
keep = DIV_ROUND_UP(ret, PAGE_SIZE);
@@ -740,7 +740,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
init_sync_kiocb(&kiocb, out);
kiocb.ki_pos = sd.pos;
- ret = call_write_iter(out, &kiocb, &from);
+ ret = out->f_op->write_iter(&kiocb, &from);
sd.pos = kiocb.ki_pos;
if (ret <= 0)
break;
diff --git a/fs/super.c b/fs/super.c
index 69ce6c600968..b72f1d288e95 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -274,6 +274,7 @@ static void destroy_super_work(struct work_struct *work)
{
struct super_block *s = container_of(work, struct super_block,
destroy_work);
+ fsnotify_sb_free(s);
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 7cd64021d453..d1995e2d6c94 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -785,3 +785,30 @@ int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
return len;
}
EXPORT_SYMBOL_GPL(sysfs_emit_at);
+
+/**
+ * sysfs_bin_attr_simple_read - read callback to simply copy from memory.
+ * @file: attribute file which is being read.
+ * @kobj: object to which the attribute belongs.
+ * @attr: attribute descriptor.
+ * @buf: destination buffer.
+ * @off: offset in bytes from which to read.
+ * @count: maximum number of bytes to read.
+ *
+ * Simple ->read() callback for bin_attributes backed by a buffer in memory.
+ * The @private and @size members in struct bin_attribute must be set to the
+ * buffer's location and size before the bin_attribute is created in sysfs.
+ *
+ * Bounds check for @off and @count is done in sysfs_kf_bin_read().
+ * Negative value check for @off is done in vfs_setpos() and default_llseek().
+ *
+ * Returns number of bytes written to @buf.
+ */
+ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t off, size_t count)
+{
+ memcpy(buf, attr->private + off, count);
+ return count;
+}
+EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read);
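
Per the kernel-doc above, a caller points .private at the backing buffer and .size at its length before registering the attribute; the sysfs layers above this helper do the bounds checking. A hedged usage sketch (demo names, registration reduced to a comment):

    static char demo_blob[] = "demo contents";

    static struct bin_attribute demo_bin_attr = {
            .attr    = { .name = "demo", .mode = 0444 },
            .size    = sizeof(demo_blob) - 1,       /* readable length */
            .private = demo_blob,                   /* backing buffer */
            .read    = sysfs_bin_attr_simple_read,
    };

    /* then: sysfs_create_bin_file(kobj, &demo_bin_attr); */
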
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 0256afdd4acf..5d88c184f0fc 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -37,7 +37,6 @@ static DEFINE_MUTEX(eventfs_mutex);
struct eventfs_root_inode {
struct eventfs_inode ei;
- struct inode *parent_inode;
struct dentry *events_dir;
};
@@ -50,8 +49,12 @@ static struct eventfs_root_inode *get_root_inode(struct eventfs_inode *ei)
/* Just try to make something consistent and unique */
static int eventfs_dir_ino(struct eventfs_inode *ei)
{
- if (!ei->ino)
+ if (!ei->ino) {
ei->ino = get_next_ino();
+ /* Must not have the file inode number */
+ if (ei->ino == EVENTFS_FILE_INODE_INO)
+ ei->ino = get_next_ino();
+ }
return ei->ino;
}
@@ -207,7 +210,9 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
* determined by the parent directory.
*/
if (dentry->d_inode->i_mode & S_IFDIR) {
- update_attr(&ei->attr, iattr);
+ /* Just use the inode permissions for the events directory */
+ if (!ei->is_events)
+ update_attr(&ei->attr, iattr);
} else {
name = dentry->d_name.name;
@@ -225,70 +230,9 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
-static void update_events_attr(struct eventfs_inode *ei, struct super_block *sb)
-{
- struct eventfs_root_inode *rei;
- struct inode *parent;
-
- rei = get_root_inode(ei);
-
- /* Use the parent inode permissions unless root set its permissions */
- parent = rei->parent_inode;
-
- if (rei->ei.attr.mode & EVENTFS_SAVE_UID)
- ei->attr.uid = rei->ei.attr.uid;
- else
- ei->attr.uid = parent->i_uid;
-
- if (rei->ei.attr.mode & EVENTFS_SAVE_GID)
- ei->attr.gid = rei->ei.attr.gid;
- else
- ei->attr.gid = parent->i_gid;
-}
-
-static void set_top_events_ownership(struct inode *inode)
-{
- struct tracefs_inode *ti = get_tracefs(inode);
- struct eventfs_inode *ei = ti->private;
-
- /* The top events directory doesn't get automatically updated */
- if (!ei || !ei->is_events)
- return;
-
- update_events_attr(ei, inode->i_sb);
-
- if (!(ei->attr.mode & EVENTFS_SAVE_UID))
- inode->i_uid = ei->attr.uid;
-
- if (!(ei->attr.mode & EVENTFS_SAVE_GID))
- inode->i_gid = ei->attr.gid;
-}
-
-static int eventfs_get_attr(struct mnt_idmap *idmap,
- const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- struct dentry *dentry = path->dentry;
- struct inode *inode = d_backing_inode(dentry);
-
- set_top_events_ownership(inode);
-
- generic_fillattr(idmap, request_mask, inode, stat);
- return 0;
-}
-
-static int eventfs_permission(struct mnt_idmap *idmap,
- struct inode *inode, int mask)
-{
- set_top_events_ownership(inode);
- return generic_permission(idmap, inode, mask);
-}
-
static const struct inode_operations eventfs_dir_inode_operations = {
.lookup = eventfs_root_lookup,
.setattr = eventfs_set_attr,
- .getattr = eventfs_get_attr,
- .permission = eventfs_permission,
};
static const struct inode_operations eventfs_file_inode_operations = {
@@ -301,84 +245,109 @@ static const struct file_operations eventfs_file_operations = {
.llseek = generic_file_llseek,
};
-/*
- * On a remount of tracefs, if UID or GID options are set, then
- * the mount point inode permissions should be used.
- * Reset the saved permission flags appropriately.
- */
-void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool update_gid)
+static void eventfs_set_attrs(struct eventfs_inode *ei, bool update_uid, kuid_t uid,
+ bool update_gid, kgid_t gid, int level)
{
- struct eventfs_inode *ei = ti->private;
+ struct eventfs_inode *ei_child;
- if (!ei)
+ /* Update events/<system>/<event> */
+ if (WARN_ON_ONCE(level > 3))
return;
- if (update_uid)
+ if (update_uid) {
ei->attr.mode &= ~EVENTFS_SAVE_UID;
+ ei->attr.uid = uid;
+ }
- if (update_gid)
+ if (update_gid) {
ei->attr.mode &= ~EVENTFS_SAVE_GID;
+ ei->attr.gid = gid;
+ }
+
+ list_for_each_entry(ei_child, &ei->children, list) {
+ eventfs_set_attrs(ei_child, update_uid, uid, update_gid, gid, level + 1);
+ }
if (!ei->entry_attrs)
return;
for (int i = 0; i < ei->nr_entries; i++) {
- if (update_uid)
+ if (update_uid) {
ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_UID;
- if (update_gid)
+ ei->entry_attrs[i].uid = uid;
+ }
+ if (update_gid) {
ei->entry_attrs[i].mode &= ~EVENTFS_SAVE_GID;
+ ei->entry_attrs[i].gid = gid;
+ }
}
+
}
-/* Return the evenfs_inode of the "events" directory */
-static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
+/*
+ * On a remount of tracefs, if UID or GID options are set, then
+ * the mount point inode permissions should be used.
+ * Reset the saved permission flags appropriately.
+ */
+void eventfs_remount(struct tracefs_inode *ti, bool update_uid, bool update_gid)
{
- struct eventfs_inode *ei;
+ struct eventfs_inode *ei = ti->private;
- do {
- // The parent is stable because we do not do renames
- dentry = dentry->d_parent;
- // ... and directories always have d_fsdata
- ei = dentry->d_fsdata;
+ /* Only the events directory does the updates */
+ if (!ei || !ei->is_events || ei->is_freed)
+ return;
- /*
- * If the ei is being freed, the ownership of the children
- * doesn't matter.
- */
- if (ei->is_freed)
- return NULL;
+ eventfs_set_attrs(ei, update_uid, ti->vfs_inode.i_uid,
+ update_gid, ti->vfs_inode.i_gid, 0);
+}
- // Walk upwards until you find the events inode
- } while (!ei->is_events);
+static void update_inode_attr(struct inode *inode, umode_t mode,
+ struct eventfs_attr *attr, struct eventfs_root_inode *rei)
+{
+ if (attr && attr->mode & EVENTFS_SAVE_MODE)
+ inode->i_mode = attr->mode & EVENTFS_MODE_MASK;
+ else
+ inode->i_mode = mode;
- update_events_attr(ei, dentry->d_sb);
+ if (attr && attr->mode & EVENTFS_SAVE_UID)
+ inode->i_uid = attr->uid;
+ else
+ inode->i_uid = rei->ei.attr.uid;
- return ei;
+ if (attr && attr->mode & EVENTFS_SAVE_GID)
+ inode->i_gid = attr->gid;
+ else
+ inode->i_gid = rei->ei.attr.gid;
}
-static void update_inode_attr(struct dentry *dentry, struct inode *inode,
- struct eventfs_attr *attr, umode_t mode)
+static struct inode *eventfs_get_inode(struct dentry *dentry, struct eventfs_attr *attr,
+ umode_t mode, struct eventfs_inode *ei)
{
- struct eventfs_inode *events_ei = eventfs_find_events(dentry);
+ struct eventfs_root_inode *rei;
+ struct eventfs_inode *pei;
+ struct tracefs_inode *ti;
+ struct inode *inode;
- if (!events_ei)
- return;
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (!inode)
+ return NULL;
- inode->i_mode = mode;
- inode->i_uid = events_ei->attr.uid;
- inode->i_gid = events_ei->attr.gid;
+ ti = get_tracefs(inode);
+ ti->private = ei;
+ ti->flags |= TRACEFS_EVENT_INODE;
- if (!attr)
- return;
+ /* Find the top dentry that holds the "events" directory */
+ do {
+ dentry = dentry->d_parent;
+ /* Directories always have d_fsdata */
+ pei = dentry->d_fsdata;
+ } while (!pei->is_events);
- if (attr->mode & EVENTFS_SAVE_MODE)
- inode->i_mode = attr->mode & EVENTFS_MODE_MASK;
+ rei = get_root_inode(pei);
- if (attr->mode & EVENTFS_SAVE_UID)
- inode->i_uid = attr->uid;
+ update_inode_attr(inode, mode, attr, rei);
- if (attr->mode & EVENTFS_SAVE_GID)
- inode->i_gid = attr->gid;
+ return inode;
}
/**
@@ -401,7 +370,6 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
void *data,
const struct file_operations *fop)
{
- struct tracefs_inode *ti;
struct inode *inode;
if (!(mode & S_IFMT))
@@ -410,13 +378,11 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
if (WARN_ON_ONCE(!S_ISREG(mode)))
return ERR_PTR(-EIO);
- inode = tracefs_get_inode(dentry->d_sb);
+ /* Only directories have ti->private set to an ei, not files */
+ inode = eventfs_get_inode(dentry, attr, mode, NULL);
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- /* If the user updated the directory's attributes, use them */
- update_inode_attr(dentry, inode, attr, mode);
-
inode->i_op = &eventfs_file_inode_operations;
inode->i_fop = fop;
inode->i_private = data;
@@ -424,9 +390,6 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
/* All files will have the same inode number */
inode->i_ino = EVENTFS_FILE_INODE_INO;
- ti = get_tracefs(inode);
- ti->flags |= TRACEFS_EVENT_INODE;
-
// Files have their parent's ei as their fsdata
dentry->d_fsdata = get_ei(parent_ei);
@@ -446,28 +409,19 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
static struct dentry *lookup_dir_entry(struct dentry *dentry,
struct eventfs_inode *pei, struct eventfs_inode *ei)
{
- struct tracefs_inode *ti;
struct inode *inode;
+ umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
- inode = tracefs_get_inode(dentry->d_sb);
+ inode = eventfs_get_inode(dentry, &ei->attr, mode, ei);
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- /* If the user updated the directory's attributes, use them */
- update_inode_attr(dentry, inode, &ei->attr,
- S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
-
inode->i_op = &eventfs_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
/* All directories will have the same inode number */
inode->i_ino = eventfs_dir_ino(ei);
- ti = get_tracefs(inode);
- ti->flags |= TRACEFS_EVENT_INODE;
- /* Only directories have ti->private set to an ei, not files */
- ti->private = ei;
-
dentry->d_fsdata = get_ei(ei);
d_add(dentry, inode);
@@ -828,7 +782,6 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
// Note: we have a ref to the dentry from tracefs_start_creating()
rei = get_root_inode(ei);
rei->events_dir = dentry;
- rei->parent_inode = d_inode(dentry->d_sb->s_root);
ei->entries = entries;
ei->nr_entries = size;
@@ -838,14 +791,12 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
uid = d_inode(dentry->d_parent)->i_uid;
gid = d_inode(dentry->d_parent)->i_gid;
- ei->attr.uid = uid;
- ei->attr.gid = gid;
-
/*
- * When the "events" directory is created, it takes on the
- * permissions of its parent. But can be reset on remount.
+ * The ei->attr will be used as the default values for the
+ * files beneath this directory.
*/
- ei->attr.mode |= EVENTFS_SAVE_UID | EVENTFS_SAVE_GID;
+ ei->attr.uid = uid;
+ ei->attr.gid = gid;
INIT_LIST_HEAD(&ei->children);
INIT_LIST_HEAD(&ei->list);
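
The event_inode.c rework replaces per-access getattr/permission fixups with an eager walk at remount time: eventfs_set_attrs() recurses through the children lists, clearing the EVENTFS_SAVE_UID/GID bits and stamping the mount's ids, depth-capped at three because the hierarchy is only events/<system>/<event>. The recursion skeleton, reduced to demo types:

    #include <linux/bug.h>
    #include <linux/list.h>

    struct demo_node {
            struct list_head children;      /* holds demo_node.list entries */
            struct list_head list;
    };

    static void demo_propagate(struct demo_node *n, int level)
    {
            struct demo_node *child;

            if (WARN_ON_ONCE(level > 3))    /* events/<system>/<event> */
                    return;

            /* ...apply the uid/gid updates to n here... */

            list_for_each_entry(child, &n->children, list)
                    demo_propagate(child, level + 1);
    }
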
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index a827f6a716c4..7c29f4afc23d 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -373,12 +373,21 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
rcu_read_lock();
list_for_each_entry_rcu(ti, &tracefs_inodes, list) {
- if (update_uid)
+ if (update_uid) {
ti->flags &= ~TRACEFS_UID_PERM_SET;
+ ti->vfs_inode.i_uid = fsi->uid;
+ }
- if (update_gid)
+ if (update_gid) {
ti->flags &= ~TRACEFS_GID_PERM_SET;
-
+ ti->vfs_inode.i_gid = fsi->gid;
+ }
+
+ /*
+ * Note, the above ti->vfs_inode updates are
+ * used in eventfs_remount() so they must come
+ * before calling it.
+ */
if (ti->flags & TRACEFS_EVENT_INODE)
eventfs_remount(ti, update_uid, update_gid);
}
@@ -417,10 +426,26 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static int tracefs_drop_inode(struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+
+ /*
+ * This inode is being freed and cannot be used for
+ * eventfs. Clear the flag so that it doesn't call into
+ * eventfs during the remount flag updates. The eventfs_inode
+ * gets freed after an RCU cycle, so the content will still
+ * be safe if the iteration is going on now.
+ */
+ ti->flags &= ~TRACEFS_EVENT_INODE;
+
+ return 1;
+}
+
static const struct super_operations tracefs_super_operations = {
.alloc_inode = tracefs_alloc_inode,
.free_inode = tracefs_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = tracefs_drop_inode,
.statfs = simple_statfs,
.show_options = tracefs_show_options,
};
@@ -446,22 +471,7 @@ static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
return !(ei && ei->is_freed);
}
-static void tracefs_d_iput(struct dentry *dentry, struct inode *inode)
-{
- struct tracefs_inode *ti = get_tracefs(inode);
-
- /*
- * This inode is being freed and cannot be used for
- * eventfs. Clear the flag so that it doesn't call into
- * eventfs during the remount flag updates. The eventfs_inode
- * gets freed after an RCU cycle, so the content will still
- * be safe if the iteration is going on now.
- */
- ti->flags &= ~TRACEFS_EVENT_INODE;
-}
-
static const struct dentry_operations tracefs_dentry_operations = {
- .d_iput = tracefs_d_iput,
.d_revalidate = tracefs_d_revalidate,
.d_release = tracefs_d_release,
};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ceac4b5937c..97c59585208c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -39,7 +39,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
struct address_space *mapping = inode->i_mapping;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
loff_t size;
unsigned int end;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -48,31 +48,31 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
filemap_invalidate_lock_shared(mapping);
- lock_page(page);
+ folio_lock(folio);
size = i_size_read(inode);
- if (page->mapping != inode->i_mapping || page_offset(page) >= size) {
- unlock_page(page);
+ if (folio->mapping != inode->i_mapping || folio_pos(folio) >= size) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
goto out_unlock;
}
/* Space is already allocated for in-ICB file */
if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
goto out_dirty;
- if (page->index == size >> PAGE_SHIFT)
+ if (folio->index == size >> PAGE_SHIFT)
end = size & ~PAGE_MASK;
else
end = PAGE_SIZE;
- err = __block_write_begin(page, 0, end, udf_get_block);
+ err = __block_write_begin(&folio->page, 0, end, udf_get_block);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
ret = vmf_fs_error(err);
goto out_unlock;
}
- block_commit_write(page, 0, end);
+ block_commit_write(&folio->page, 0, end);
out_dirty:
- set_page_dirty(page);
- wait_for_stable_page(page);
+ folio_mark_dirty(folio);
+ folio_wait_stable(folio);
out_unlock:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
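
The udf_page_mkwrite() change is a mechanical page-to-folio translation; every legacy call used here has a one-for-one folio replacement. A compact reference covering the subset above (a fragment, not a complete function):

    struct folio *folio = page_folio(vmf->page);

    folio_lock(folio);              /* was lock_page(page)            */
    /* folio->mapping, folio_pos() and folio->index replace
     * page->mapping, page_offset() and page->index */
    folio_mark_dirty(folio);        /* was set_page_dirty(page)       */
    folio_wait_stable(folio);       /* was wait_for_stable_page(page) */
    folio_unlock(folio);            /* was unlock_page(page)          */
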
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2f831a3a91af..2fb21c5ffccf 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -208,19 +208,14 @@ static int udf_writepages(struct address_space *mapping,
return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL);
}
-static void udf_adinicb_readpage(struct page *page)
+static void udf_adinicb_read_folio(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- char *kaddr;
+ struct inode *inode = folio->mapping->host;
struct udf_inode_info *iinfo = UDF_I(inode);
loff_t isize = i_size_read(inode);
- kaddr = kmap_local_page(page);
- memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize);
- memset(kaddr + isize, 0, PAGE_SIZE - isize);
- flush_dcache_page(page);
- SetPageUptodate(page);
- kunmap_local(kaddr);
+ folio_fill_tail(folio, 0, iinfo->i_data + iinfo->i_lenEAttr, isize);
+ folio_mark_uptodate(folio);
}
static int udf_read_folio(struct file *file, struct folio *folio)
@@ -228,7 +223,7 @@ static int udf_read_folio(struct file *file, struct folio *folio)
struct udf_inode_info *iinfo = UDF_I(file_inode(file));
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- udf_adinicb_readpage(&folio->page);
+ udf_adinicb_read_folio(folio);
folio_unlock(folio);
return 0;
}
@@ -254,7 +249,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct udf_inode_info *iinfo = UDF_I(file_inode(file));
- struct page *page;
+ struct folio *folio;
int ret;
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
@@ -266,12 +261,13 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
}
if (WARN_ON_ONCE(pos >= PAGE_SIZE))
return -EIO;
- page = grab_cache_page_write_begin(mapping, 0);
- if (!page)
- return -ENOMEM;
- *pagep = page;
- if (!PageUptodate(page))
- udf_adinicb_readpage(page);
+ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *pagep = &folio->page;
+ if (!folio_test_uptodate(folio))
+ udf_adinicb_read_folio(folio);
return 0;
}
@@ -280,17 +276,19 @@ static int udf_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
+ struct folio *folio;
loff_t last_pos;
if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
return generic_write_end(file, mapping, pos, len, copied, page,
fsdata);
+ folio = page_folio(page);
last_pos = pos + copied;
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -341,7 +339,7 @@ const struct address_space_operations udf_aops = {
*/
int udf_expand_file_adinicb(struct inode *inode)
{
- struct page *page;
+ struct folio *folio;
struct udf_inode_info *iinfo = UDF_I(inode);
int err;
@@ -357,12 +355,13 @@ int udf_expand_file_adinicb(struct inode *inode)
return 0;
}
- page = find_or_create_page(inode->i_mapping, 0, GFP_KERNEL);
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(inode->i_mapping, 0,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_KERNEL);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!PageUptodate(page))
- udf_adinicb_readpage(page);
+ if (!folio_test_uptodate(folio))
+ udf_adinicb_read_folio(folio);
down_write(&iinfo->i_data_sem);
memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00,
iinfo->i_lenAlloc);
@@ -371,22 +370,22 @@ int udf_expand_file_adinicb(struct inode *inode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- set_page_dirty(page);
- unlock_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
up_write(&iinfo->i_data_sem);
err = filemap_fdatawrite(inode->i_mapping);
if (err) {
/* Restore everything back so that we don't lose data... */
- lock_page(page);
+ folio_lock(folio);
down_write(&iinfo->i_data_sem);
- memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr,
- inode->i_size);
- unlock_page(page);
+ memcpy_from_folio(iinfo->i_data + iinfo->i_lenEAttr,
+ folio, 0, inode->i_size);
+ folio_unlock(folio);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
iinfo->i_lenAlloc = inode->i_size;
up_write(&iinfo->i_data_sem);
}
- put_page(page);
+ folio_put(folio);
mark_inode_dirty(inode);
return err;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2217f7ed7a49..9381a66c6ce5 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -630,7 +630,7 @@ static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param)
if (!uopt->nls_map) {
errorf(fc, "iocharset %s not found",
param->string);
- return -EINVAL;;
+ return -EINVAL;
}
}
break;
@@ -895,7 +895,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
int ret;
struct timestamp *ts;
- outstr = kmalloc(128, GFP_KERNEL);
+ outstr = kzalloc(128, GFP_KERNEL);
if (!outstr)
return -ENOMEM;
@@ -921,11 +921,11 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32);
if (ret < 0) {
- strcpy(UDF_SB(sb)->s_volume_ident, "InvalidName");
+ strscpy_pad(UDF_SB(sb)->s_volume_ident, "InvalidName");
pr_warn("incorrect volume identification, setting to "
"'InvalidName'\n");
} else {
- strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
+ strscpy_pad(UDF_SB(sb)->s_volume_ident, outstr);
}
udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
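
The super.c hunks retire strcpy(), which is unbounded, and strncpy(), which does not guarantee NUL termination when the source fills the buffer. strscpy_pad() bounds the copy, always terminates, and zero-fills the tail, and the two-argument form shown derives the bound from the destination at compile time (hedged: this form requires the destination to be a real array, not a pointer):

    char volid[32];

    /* copies at most sizeof(volid) - 1 bytes, NUL-terminates, and
     * zeroes the remainder of volid[] */
    strscpy_pad(volid, "InvalidName");
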
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index f7eaf7b14594..fe03745d09b1 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -99,18 +99,17 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
static int udf_symlink_filler(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct buffer_head *bh = NULL;
unsigned char *symlink;
int err = 0;
- unsigned char *p = page_address(page);
+ unsigned char *p = folio_address(folio);
struct udf_inode_info *iinfo = UDF_I(inode);
/* We don't support symlinks longer than one block */
if (inode->i_size > inode->i_sb->s_blocksize) {
err = -ENAMETOOLONG;
- goto out_unlock;
+ goto out;
}
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
@@ -120,24 +119,15 @@ static int udf_symlink_filler(struct file *file, struct folio *folio)
if (!bh) {
if (!err)
err = -EFSCORRUPTED;
- goto out_err;
+ goto out;
}
symlink = bh->b_data;
}
err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE);
brelse(bh);
- if (err)
- goto out_err;
-
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
-
-out_err:
- SetPageError(page);
-out_unlock:
- unlock_page(page);
+out:
+ folio_end_read(folio, err == 0);
return err;
}
@@ -147,12 +137,12 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap,
{
struct dentry *dentry = path->dentry;
struct inode *inode = d_backing_inode(dentry);
- struct page *page;
+ struct folio *folio;
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- page = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = read_mapping_folio(inode->i_mapping, 0, NULL);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/*
* UDF uses non-trivial encoding of symlinks so i_size does not match
* number of characters reported by readlink(2) which apparently some
@@ -162,8 +152,8 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap,
* let's report the length of string returned by readlink(2) for
* st_size.
*/
- stat->size = strlen(page_address(page));
- put_page(page);
+ stat->size = strlen(folio_address(folio));
+ folio_put(folio);
return 0;
}
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 758163af39c2..78ecc633606f 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -46,13 +46,18 @@ udf_disk_stamp_to_time(struct timespec64 *dest, struct timestamp src)
dest->tv_sec = mktime64(year, src.month, src.day, src.hour, src.minute,
src.second);
dest->tv_sec -= offset * 60;
- dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
- src.hundredsOfMicroseconds * 100 + src.microseconds);
+
/*
* Sanitize nanosecond field since reportedly some filesystems are
* recorded with bogus sub-second values.
*/
- dest->tv_nsec %= NSEC_PER_SEC;
+ if (src.centiseconds < 100 && src.hundredsOfMicroseconds < 100 &&
+ src.microseconds < 100) {
+ dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
+ src.hundredsOfMicroseconds * 100 + src.microseconds);
+ } else {
+ dest->tv_nsec = 0;
+ }
}
void
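
Why the udftime.c hunk can drop the old "dest->tv_nsec %= NSEC_PER_SEC" clamp: with every sub-second field validated to be below 100, the worst case is 99 * 10000 + 99 * 100 + 99 = 999,999, and multiplying by 1000 yields 999,999,000 ns, strictly below NSEC_PER_SEC; bogus on-disk fields now produce a clean 0 instead of a wrapped value. The bound as compile-time checks:

    #include <linux/time64.h>       /* NSEC_PER_SEC */

    static_assert(1000LL * (99 * 10000 + 99 * 100 + 99) == 999999000);
    static_assert(1000LL * (99 * 10000 + 99 * 100 + 99) < NSEC_PER_SEC);
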
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c5a35e32adf0..c50447548d65 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -34,6 +34,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_dir2_node.o \
xfs_dir2_sf.o \
xfs_dquot_buf.o \
+ xfs_exchmaps.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
xfs_iext_tree.o \
@@ -41,6 +42,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_inode_buf.o \
xfs_log_rlimit.o \
xfs_ag_resv.o \
+ xfs_parent.o \
xfs_rmap.o \
xfs_rmap_btree.o \
xfs_refcount.o \
@@ -49,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_symlink_remote.o \
xfs_trans_inode.o \
xfs_trans_resv.o \
+ xfs_trans_space.o \
xfs_types.o \
)
# xfs_rtbitmap is shared with libxfs
@@ -67,6 +70,7 @@ xfs-y += xfs_aops.o \
xfs_dir2_readdir.o \
xfs_discard.o \
xfs_error.o \
+ xfs_exchrange.o \
xfs_export.o \
xfs_extent_busy.o \
xfs_file.o \
@@ -74,6 +78,7 @@ xfs-y += xfs_aops.o \
xfs_fsmap.o \
xfs_fsops.o \
xfs_globals.o \
+ xfs_handle.o \
xfs_health.o \
xfs_icache.o \
xfs_ioctl.o \
@@ -101,6 +106,7 @@ xfs-y += xfs_log.o \
xfs_buf_item.o \
xfs_buf_item_recover.o \
xfs_dquot_item_recover.o \
+ xfs_exchmaps_item.o \
xfs_extfree_item.o \
xfs_attr_item.o \
xfs_icreate_item.o \
@@ -157,11 +163,13 @@ xfs-y += $(addprefix scrub/, \
common.o \
dabtree.o \
dir.o \
+ dirtree.o \
fscounters.o \
health.o \
ialloc.o \
inode.o \
iscan.o \
+ listxattr.o \
nlinks.o \
parent.o \
readdir.o \
@@ -170,6 +178,7 @@ xfs-y += $(addprefix scrub/, \
scrub.o \
symlink.o \
xfarray.o \
+ xfblob.o \
xfile.o \
)
@@ -191,23 +200,32 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
alloc_repair.o \
+ attr_repair.o \
bmap_repair.o \
cow_repair.o \
+ dir_repair.o \
+ dirtree_repair.o \
+ findparent.o \
fscounters_repair.o \
ialloc_repair.o \
inode_repair.o \
newbt.o \
nlinks_repair.o \
+ orphanage.o \
+ parent_repair.o \
rcbag_btree.o \
rcbag.o \
reap.o \
refcount_repair.o \
repair.o \
rmap_repair.o \
+ symlink_repair.o \
+ tempfile.o \
)
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rtbitmap_repair.o \
+ rtsummary_repair.o \
)
xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index dc1873f76bff..240e079cb3fb 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -194,7 +194,7 @@ xfs_initialize_perag_data(
pag = xfs_perag_get(mp, index);
error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
if (!error)
- error = xfs_ialloc_read_agi(pag, NULL, NULL);
+ error = xfs_ialloc_read_agi(pag, NULL, 0, NULL);
if (error) {
xfs_perag_put(pag);
return error;
@@ -931,7 +931,7 @@ xfs_ag_shrink_space(
int error, err2;
ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
- error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
+ error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp);
if (error)
return error;
@@ -963,9 +963,7 @@ xfs_ag_shrink_space(
* Disable perag reservations so it doesn't cause the allocation request
* to fail. We'll reestablish reservation before we return.
*/
- error = xfs_ag_resv_free(pag);
- if (error)
- return error;
+ xfs_ag_resv_free(pag);
/* internal log shouldn't also show up in the free space btrees */
error = xfs_alloc_vextent_exact_bno(&args,
@@ -1062,7 +1060,7 @@ xfs_ag_extend_space(
ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
- error = xfs_ialloc_read_agi(pag, tp, &bp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &bp);
if (error)
return error;
@@ -1119,7 +1117,7 @@ xfs_ag_get_geometry(
int error;
/* Lock the AG headers. */
- error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, NULL, 0, &agi_bp);
if (error)
return error;
error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index da1057bd0e60..216423df939e 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -126,14 +126,13 @@ xfs_ag_resv_needed(
}
/* Clean out a reservation */
-static int
+static void
__xfs_ag_resv_free(
struct xfs_perag *pag,
enum xfs_ag_resv_type type)
{
struct xfs_ag_resv *resv;
xfs_extlen_t oldresv;
- int error;
trace_xfs_ag_resv_free(pag, type, 0);
@@ -149,30 +148,19 @@ __xfs_ag_resv_free(
oldresv = resv->ar_orig_reserved;
else
oldresv = resv->ar_reserved;
- error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+ xfs_add_fdblocks(pag->pag_mount, oldresv);
resv->ar_reserved = 0;
resv->ar_asked = 0;
resv->ar_orig_reserved = 0;
-
- if (error)
- trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
- error, _RET_IP_);
- return error;
}
/* Free a per-AG reservation. */
-int
+void
xfs_ag_resv_free(
struct xfs_perag *pag)
{
- int error;
- int err2;
-
- error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
- err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
- if (err2 && !error)
- error = err2;
- return error;
+ __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
+ __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
}
static int
@@ -216,7 +204,7 @@ __xfs_ag_resv_init(
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
error = -ENOSPC;
else
- error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
+ error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
error, _RET_IP_);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index b74b210008ea..ff20ed93de77 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -6,7 +6,7 @@
#ifndef __XFS_AG_RESV_H__
#define __XFS_AG_RESV_H__
-int xfs_ag_resv_free(struct xfs_perag *pag);
+void xfs_ag_resv_free(struct xfs_perag *pag);
int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);
bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
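
Background for the xfs_ag_resv.c hunks: returning blocks to the free pool cannot fail, so the xfs_mod_fdblocks(mp, delta, true) call and its vestigial error plumbing are replaced by a void xfs_add_fdblocks(), letting __xfs_ag_resv_free() and xfs_ag_resv_free() become void too; only the decrement side can still fail. The asymmetry, sketched with the signatures visible in the hunks above:

    xfs_add_fdblocks(mp, freed);                    /* cannot fail */

    error = xfs_dec_fdblocks(mp, needed, true);     /* may use reserves */
    if (error)                                      /* typically -ENOSPC */
            return error;
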
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 9da52e92172a..6cb8b2ddc541 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -79,7 +79,7 @@ xfs_prealloc_blocks(
}
/*
- * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
* guarantee that we can refill the AGFL prior to allocating space in a nearly
* full AG. Although the space described by the free space btrees, the
* blocks used by the freesp btrees themselves, and the blocks owned by the
@@ -89,7 +89,7 @@ xfs_prealloc_blocks(
* until the fs goes down, we subtract this many AG blocks from the incore
* fdblocks to ensure user allocation does not overcommit the space the
* filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
- * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ * withhold space from xfs_dec_fdblocks, so we do not account for that here.
*/
#define XFS_ALLOCBT_AGFL_RESERVE 4
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 673a4b6d2e8d..430cd3244c14 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -26,6 +26,7 @@
#include "xfs_trace.h"
#include "xfs_attr_item.h"
#include "xfs_xattr.h"
+#include "xfs_parent.h"
struct kmem_cache *xfs_attr_intent_cache;
@@ -87,6 +88,8 @@ xfs_attr_is_leaf(
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec imap;
+ ASSERT(!xfs_need_iread_extents(ifp));
+
if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS)
return false;
@@ -224,11 +227,21 @@ int
xfs_attr_get_ilocked(
struct xfs_da_args *args)
{
+ int error;
+
xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
+ /*
+ * The incore attr fork iext tree must be loaded for xfs_attr_is_leaf
+ * to work correctly.
+ */
+ error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_getvalue(args);
if (xfs_attr_is_leaf(args->dp))
@@ -264,9 +277,11 @@ xfs_attr_get(
if (xfs_is_shutdown(args->dp->i_mount))
return -EIO;
+ if (!args->owner)
+ args->owner = args->dp->i_ino;
args->geo = args->dp->i_mount->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
- args->hashval = xfs_da_hashname(args->name, args->namelen);
+ xfs_attr_sethash(args);
/* Entirely possible to look up a name which doesn't exist */
args->op_flags = XFS_DA_OP_OKNOENT;
@@ -363,7 +378,7 @@ xfs_attr_try_sf_addname(
* Commit the shortform mods, and we're done.
* NOTE: this is also the error path (EEXIST, etc).
*/
- if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
+ if (!error)
xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
if (xfs_has_wsync(dp->i_mount))
@@ -401,6 +416,50 @@ out:
return error;
}
+/* Compute the hash value for a user/root/secure extended attribute */
+xfs_dahash_t
+xfs_attr_hashname(
+ const uint8_t *name,
+ int namelen)
+{
+ return xfs_da_hashname(name, namelen);
+}
+
+/* Compute the hash value for any extended attribute from any namespace. */
+xfs_dahash_t
+xfs_attr_hashval(
+ struct xfs_mount *mp,
+ unsigned int attr_flags,
+ const uint8_t *name,
+ int namelen,
+ const void *value,
+ int valuelen)
+{
+ ASSERT(xfs_attr_check_namespace(attr_flags));
+
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_parent_hashattr(mp, name, namelen, value, valuelen);
+
+ return xfs_attr_hashname(name, namelen);
+}
+
+/*
+ * PPTR_REPLACE operations require the caller to set the old and new names and
+ * values explicitly. Update the canonical fields to the new name and value
+ * here now that the removal phase has finished.
+ */
+static void
+xfs_attr_update_pptr_replace_args(
+ struct xfs_da_args *args)
+{
+ ASSERT(args->new_namelen > 0);
+ args->name = args->new_name;
+ args->namelen = args->new_namelen;
+ args->value = args->new_value;
+ args->valuelen = args->new_valuelen;
+ xfs_attr_sethash(args);
+}
+
/*
* Handle the state change on completion of a multi-state attr operation.
*
@@ -418,14 +477,15 @@ xfs_attr_complete_op(
enum xfs_delattr_state replace_state)
{
struct xfs_da_args *args = attr->xattri_da_args;
- bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ replace_state = XFS_DAS_DONE;
+ else if (xfs_attr_intent_op(attr) == XFS_ATTRI_OP_FLAGS_PPTR_REPLACE)
+ xfs_attr_update_pptr_replace_args(args);
args->op_flags &= ~XFS_DA_OP_REPLACE;
args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
- if (do_replace)
- return replace_state;
-
- return XFS_DAS_DONE;
+ return replace_state;
}
static int
@@ -647,8 +707,8 @@ xfs_attr_leaf_remove_attr(
int forkoff;
int error;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp);
if (error)
return error;
@@ -679,7 +739,7 @@ xfs_attr_leaf_shrink(
if (!xfs_attr_is_leaf(dp))
return 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp);
if (error)
return error;
@@ -868,6 +928,11 @@ xfs_attr_lookup(
return -ENOATTR;
}
+ /* Prerequisite for xfs_attr_is_leaf */
+ error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
if (xfs_attr_is_leaf(dp)) {
error = xfs_attr_leaf_hasname(args, &bp);
@@ -883,74 +948,72 @@ xfs_attr_lookup(
return error;
}
-static void
-xfs_attr_defer_add(
- struct xfs_da_args *args,
- unsigned int op_flags)
+int
+xfs_attr_add_fork(
+ struct xfs_inode *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp; /* transaction pointer */
+ unsigned int blks; /* space reservation */
+ int error; /* error return value */
- struct xfs_attr_intent *new;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
- new = kmem_cache_zalloc(xfs_attr_intent_cache,
- GFP_KERNEL | __GFP_NOFAIL);
- new->xattri_op_flags = op_flags;
- new->xattri_da_args = args;
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
- switch (op_flags) {
- case XFS_ATTRI_OP_FLAGS_SET:
- new->xattri_dela_state = xfs_attr_init_add_state(args);
- break;
- case XFS_ATTRI_OP_FLAGS_REPLACE:
- new->xattri_dela_state = xfs_attr_init_replace_state(args);
- break;
- case XFS_ATTRI_OP_FLAGS_REMOVE:
- new->xattri_dela_state = xfs_attr_init_remove_state(args);
- break;
- default:
- ASSERT(0);
- }
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd, &tp);
+ if (error)
+ return error;
+
+ if (xfs_inode_has_attr_fork(ip))
+ goto trans_cancel;
+
+ error = xfs_bmap_add_attrfork(tp, ip, size, rsvd);
+ if (error)
+ goto trans_cancel;
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
- xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
- trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
}
/*
- * Note: If args->value is NULL the attribute will be removed, just like the
- * Linux ->setattr API.
+ * Make a change to the xattr structure.
+ *
+ * The caller must have initialized @args, attached dquots, and must not hold
+ * any ILOCKs. Reserved data blocks may be used if @rsvd is set.
+ *
+ * Returns -EEXIST for XFS_ATTRUPDATE_CREATE if the name already exists.
+ * Returns -ENOATTR for XFS_ATTRUPDATE_REMOVE if the name does not exist.
+ * Returns 0 on success, or a negative errno if something else went wrong.
*/
int
xfs_attr_set(
- struct xfs_da_args *args)
+ struct xfs_da_args *args,
+ enum xfs_attr_update op,
+ bool rsvd)
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
struct xfs_trans_res tres;
- bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
int error, local;
int rmt_blks = 0;
unsigned int total;
- if (xfs_is_shutdown(dp->i_mount))
- return -EIO;
-
- error = xfs_qm_dqattach(dp);
- if (error)
- return error;
-
- args->geo = mp->m_attr_geo;
- args->whichfork = XFS_ATTR_FORK;
- args->hashval = xfs_da_hashname(args->name, args->namelen);
+ ASSERT(!args->trans);
- /*
- * We have no control over the attribute names that userspace passes us
- * to remove, so we have to allow the name lookup prior to attribute
- * removal to fail as well. Preserve the logged flag, since we need
- * to pass that through to the logging code.
- */
- args->op_flags = XFS_DA_OP_OKNOENT |
- (args->op_flags & XFS_DA_OP_LOGGED);
-
- if (args->value) {
+ switch (op) {
+ case XFS_ATTRUPDATE_UPSERT:
+ case XFS_ATTRUPDATE_CREATE:
+ case XFS_ATTRUPDATE_REPLACE:
XFS_STATS_INC(mp, xs_attr_set);
args->total = xfs_attr_calc_size(args, &local);
@@ -963,16 +1026,18 @@ xfs_attr_set(
xfs_attr_sf_entsize_byname(args->namelen,
args->valuelen);
- error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ error = xfs_attr_add_fork(dp, sf_size, rsvd);
if (error)
return error;
}
if (!local)
rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
- } else {
+ break;
+ case XFS_ATTRUPDATE_REMOVE:
XFS_STATS_INC(mp, xs_attr_remove);
- rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+ rmt_blks = xfs_attr3_max_rmt_blocks(mp);
+ break;
}
/*
@@ -984,12 +1049,9 @@ xfs_attr_set(
if (error)
return error;
- if (args->value || xfs_inode_hasattr(dp)) {
- error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+ if (op != XFS_ATTRUPDATE_REMOVE || xfs_inode_hasattr(dp)) {
+ error = xfs_iext_count_extend(args->trans, dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(args->trans, dp,
- XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
@@ -997,26 +1059,26 @@ xfs_attr_set(
error = xfs_attr_lookup(args);
switch (error) {
case -EEXIST:
- if (!args->value) {
+ if (op == XFS_ATTRUPDATE_REMOVE) {
/* if no value, we are performing a remove operation */
- xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE);
+ xfs_attr_defer_add(args, XFS_ATTR_DEFER_REMOVE);
break;
}
/* Pure create fails if the attr already exists */
- if (args->attr_flags & XATTR_CREATE)
+ if (op == XFS_ATTRUPDATE_CREATE)
goto out_trans_cancel;
- xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE);
+ xfs_attr_defer_add(args, XFS_ATTR_DEFER_REPLACE);
break;
case -ENOATTR:
/* Can't remove what isn't there. */
- if (!args->value)
+ if (op == XFS_ATTRUPDATE_REMOVE)
goto out_trans_cancel;
/* Pure replace fails if no existing attr to replace. */
- if (args->attr_flags & XATTR_REPLACE)
+ if (op == XFS_ATTRUPDATE_REPLACE)
goto out_trans_cancel;
- xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET);
+ xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET);
break;
default:
goto out_trans_cancel;
@@ -1029,8 +1091,7 @@ xfs_attr_set(
if (xfs_has_wsync(mp))
xfs_trans_set_sync(args->trans);
- if (!(args->op_flags & XFS_DA_OP_NOTIME))
- xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
/*
* Commit the last in the sequence of transactions.
@@ -1039,6 +1100,7 @@ xfs_attr_set(
error = xfs_trans_commit(args->trans);
out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ args->trans = NULL;
return error;
out_trans_cancel:
@@ -1051,7 +1113,7 @@ out_trans_cancel:
* External routines when attribute list is inside the inode
*========================================================================*/
-static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+int xfs_attr_sf_totsize(struct xfs_inode *dp)
{
struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
@@ -1154,7 +1216,7 @@ xfs_attr_leaf_try_add(
struct xfs_buf *bp;
int error;
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp);
if (error)
return error;
@@ -1202,7 +1264,7 @@ xfs_attr_leaf_hasname(
{
int error = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, bp);
if (error)
return error;
@@ -1511,12 +1573,23 @@ out_release:
return error;
}
+/* Enforce that there is at most one namespace bit per attr. */
+inline bool xfs_attr_check_namespace(unsigned int attr_flags)
+{
+ return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2;
+}
+
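/*
 * Illustrative sketch (not part of this patch): a standalone model of
 * the one-namespace-bit rule enforced above.  The SK_* flag values are
 * hypothetical stand-ins for the on-disk XFS_ATTR_* namespace bits, and
 * __builtin_popcount() plays the role of the kernel's hweight32().
 */
#include <stdbool.h>

#define SK_ATTR_ROOT	(1u << 1)
#define SK_ATTR_SECURE	(1u << 2)
#define SK_ATTR_PARENT	(1u << 3)
#define SK_NSP_MASK	(SK_ATTR_ROOT | SK_ATTR_SECURE | SK_ATTR_PARENT)

static bool sk_check_namespace(unsigned int attr_flags)
{
	/* valid iff zero or one namespace bit is set */
	return __builtin_popcount(attr_flags & SK_NSP_MASK) < 2;
}

/*
 * sk_check_namespace(SK_ATTR_ROOT)                  -> true
 * sk_check_namespace(SK_ATTR_ROOT | SK_ATTR_SECURE) -> false (corrupt)
 */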
/* Returns true if the attribute entry name is valid. */
bool
xfs_attr_namecheck(
+ unsigned int attr_flags,
const void *name,
size_t length)
{
+ /* Only one namespace bit allowed. */
+ if (!xfs_attr_check_namespace(attr_flags))
+ return false;
+
/*
* MAXNAMELEN includes the trailing null, but (name/length) leave it
* out, so use >= for the length check.
@@ -1524,6 +1597,10 @@ xfs_attr_namecheck(
if (length >= MAXNAMELEN)
return false;
+ /* Parent pointers have their own validation. */
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_parent_namecheck(attr_flags, name, length);
+
/* There shouldn't be any nulls here */
return !memchr(name, 0, length);
}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 81be9b3e4004..088cb7b30168 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -47,8 +47,9 @@ struct xfs_attrlist_cursor_kern {
/* void; state communicated via *context */
-typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
- unsigned char *, int, int);
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *context,
+ int flags, unsigned char *name, int namelen, void *value,
+ int valuelen);
struct xfs_attr_list_context {
struct xfs_trans *tp;
@@ -510,8 +511,8 @@ struct xfs_attr_intent {
struct xfs_da_args *xattri_da_args;
/*
- * Shared buffer containing the attr name and value so that the logging
- * code can share large memory buffers between log items.
+ * Shared buffer containing the attr name, new name, and value so that
+ * the logging code can share large memory buffers between log items.
*/
struct xfs_attri_log_nameval *xattri_nameval;
@@ -529,6 +530,11 @@ struct xfs_attr_intent {
struct xfs_bmbt_irec xattri_map;
};
+static inline unsigned int
+xfs_attr_intent_op(const struct xfs_attr_intent *attr)
+{
+ return attr->xattri_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+}
/*========================================================================
* Function prototypes for the kernel.
@@ -544,10 +550,20 @@ int xfs_inode_hasattr(struct xfs_inode *ip);
bool xfs_attr_is_leaf(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_da_args *args);
int xfs_attr_get(struct xfs_da_args *args);
-int xfs_attr_set(struct xfs_da_args *args);
+
+enum xfs_attr_update {
+ XFS_ATTRUPDATE_REMOVE, /* remove attr */
+ XFS_ATTRUPDATE_UPSERT, /* set value, replace any existing attr */
+ XFS_ATTRUPDATE_CREATE, /* set value, fail if attr already exists */
+ XFS_ATTRUPDATE_REPLACE, /* set value, fail if attr does not exist */
+};
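/*
 * Sketch (not part of this patch): one plausible way a VFS-facing
 * caller could translate the generic XATTR_CREATE/XATTR_REPLACE
 * setxattr flags into the new enum, now that xfs_attr_set() takes an
 * explicit op instead of reading args->attr_flags.  The helper name is
 * hypothetical; the flag semantics match the old checks removed above.
 */
static inline enum xfs_attr_update
sk_xattr_flags_to_op(unsigned int flags, bool remove)
{
	if (remove)
		return XFS_ATTRUPDATE_REMOVE;
	if (flags & XATTR_CREATE)	/* pure create: fail on -EEXIST */
		return XFS_ATTRUPDATE_CREATE;
	if (flags & XATTR_REPLACE)	/* pure replace: fail on -ENOATTR */
		return XFS_ATTRUPDATE_REPLACE;
	return XFS_ATTRUPDATE_UPSERT;	/* set, replacing any existing attr */
}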
+
+int xfs_attr_set(struct xfs_da_args *args, enum xfs_attr_update op, bool rsvd);
int xfs_attr_set_iter(struct xfs_attr_intent *attr);
int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
-bool xfs_attr_namecheck(const void *name, size_t length);
+bool xfs_attr_check_namespace(unsigned int attr_flags);
+bool xfs_attr_namecheck(unsigned int attr_flags, const void *name,
+ size_t length);
int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
unsigned int *total);
@@ -590,7 +606,6 @@ xfs_attr_init_add_state(struct xfs_da_args *args)
static inline enum xfs_delattr_state
xfs_attr_init_remove_state(struct xfs_da_args *args)
{
- args->op_flags |= XFS_DA_OP_REMOVE;
if (xfs_attr_is_shortform(args->dp))
return XFS_DAS_SF_REMOVE;
if (xfs_attr_is_leaf(args->dp))
@@ -614,8 +629,25 @@ xfs_attr_init_replace_state(struct xfs_da_args *args)
return xfs_attr_init_add_state(args);
}
+xfs_dahash_t xfs_attr_hashname(const uint8_t *name, int namelen);
+
+xfs_dahash_t xfs_attr_hashval(struct xfs_mount *mp, unsigned int attr_flags,
+ const uint8_t *name, int namelen, const void *value,
+ int valuelen);
+
+/* Set the hash value for any extended attribute from any namespace. */
+static inline void xfs_attr_sethash(struct xfs_da_args *args)
+{
+ args->hashval = xfs_attr_hashval(args->dp->i_mount, args->attr_filter,
+ args->name, args->namelen,
+ args->value, args->valuelen);
+}
+
extern struct kmem_cache *xfs_attr_intent_cache;
int __init xfs_attr_intent_init_cache(void);
void xfs_attr_intent_destroy_cache(void);
+int xfs_attr_sf_totsize(struct xfs_inode *dp);
+int xfs_attr_add_fork(struct xfs_inode *ip, int size, int rsvd);
+
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index ac904cc1a97b..b9e98950eb3d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -388,6 +388,27 @@ xfs_attr3_leaf_verify(
return NULL;
}
+xfs_failaddr_t
+xfs_attr3_leaf_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_attr3_leafblock *hdr3 = bp->b_addr;
+
+ if (hdr3->hdr.info.hdr.magic !=
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->hdr.info.owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static void
xfs_attr3_leaf_write_verify(
struct xfs_buf *bp)
@@ -448,16 +469,30 @@ int
xfs_attr3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t bno,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK,
&xfs_attr3_leaf_buf_ops);
- if (!err && tp && *bpp)
+ if (err || !(*bpp))
+ return err;
+
+ fa = xfs_attr3_leaf_header_check(*bpp, owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
+ return -EFSCORRUPTED;
+ }
+
+ if (tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
- return err;
+ return 0;
}
/*========================================================================
@@ -472,28 +507,57 @@ xfs_attr3_leaf_read(
* INCOMPLETE flag will not be set in attr->attr_filter, but rather
* XFS_DA_OP_RECOVERY will be set in args->op_flags.
*/
+static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args)
+{
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return XFS_ATTR_NSP_ONDISK_MASK;
+ return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE;
+}
+
+static inline bool
+xfs_attr_parent_match(
+ const struct xfs_da_args *args,
+ const void *value,
+ unsigned int valuelen)
+{
+ ASSERT(args->value != NULL);
+
+ /* Parent pointers do not use remote values */
+ if (!value)
+ return false;
+
+ /*
+ * The only value we support is a parent rec. However, we'll accept
+ * any valuelen so that offline repair can delete ATTR_PARENT values
+ * that are not parent pointers.
+ */
+ if (valuelen != args->valuelen)
+ return false;
+
+ return memcmp(args->value, value, valuelen) == 0;
+}
+
static bool
xfs_attr_match(
struct xfs_da_args *args,
- uint8_t namelen,
- unsigned char *name,
- int flags)
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen)
{
+ unsigned int mask = xfs_attr_match_mask(args);
if (args->namelen != namelen)
return false;
+ if ((args->attr_filter & mask) != (attr_flags & mask))
+ return false;
if (memcmp(args->name, name, namelen) != 0)
return false;
- /* Recovery ignores the INCOMPLETE flag. */
- if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
- args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
- return true;
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_attr_parent_match(args, value, valuelen);
- /* All remaining matches need to be filtered by INCOMPLETE state. */
- if (args->attr_filter !=
- (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
- return false;
return true;
}
@@ -504,6 +568,13 @@ xfs_attr_copy_value(
int valuelen)
{
/*
+ * Parent pointer lookups require the caller to specify the name and
+ * value, so don't copy anything.
+ */
+ if (args->attr_filter & XFS_ATTR_PARENT)
+ return 0;
+
+ /*
* No copy if all we have to do is get the length
*/
if (!args->valuelen) {
@@ -711,8 +782,9 @@ xfs_attr_sf_findname(
for (sfe = xfs_attr_sf_firstentry(sf);
sfe < xfs_attr_sf_endptr(sf);
sfe = xfs_attr_sf_nextentry(sfe)) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
+ if (xfs_attr_match(args, sfe->flags, sfe->nameval,
+ sfe->namelen, &sfe->nameval[sfe->namelen],
+ sfe->valuelen))
return sfe;
}
@@ -819,7 +891,8 @@ xfs_attr_sf_removename(
*/
if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
- !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
+ !xfs_has_parent(mp)) {
xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -828,7 +901,8 @@ xfs_attr_sf_removename(
ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!xfs_has_attr2(mp) ||
- dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
+ dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
+ xfs_has_parent(mp));
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -904,6 +978,7 @@ xfs_attr_shortform_to_leaf(
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
+ nargs.owner = args->owner;
sfe = xfs_attr_sf_firstentry(sf);
for (i = 0; i < sf->count; i++) {
@@ -911,9 +986,13 @@ xfs_attr_shortform_to_leaf(
nargs.namelen = sfe->namelen;
nargs.value = &sfe->nameval[nargs.namelen];
nargs.valuelen = sfe->valuelen;
- nargs.hashval = xfs_da_hashname(sfe->nameval,
- sfe->namelen);
nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
+ if (!xfs_attr_check_namespace(sfe->flags)) {
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ xfs_attr_sethash(&nargs);
error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == -ENOATTR);
error = xfs_attr3_leaf_add(bp, &nargs);
@@ -1027,7 +1106,7 @@ xfs_attr_shortform_verify(
* one namespace flag per xattr, so we can just count the
* bits (i.e. hweight) here.
*/
- if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1)
+ if (!xfs_attr_check_namespace(sfep->flags))
return __this_address;
sfep = next_sfep;
@@ -1106,6 +1185,7 @@ xfs_attr3_leaf_to_shortform(
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
+ nargs.owner = args->owner;
for (i = 0; i < ichdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
@@ -1158,7 +1238,7 @@ xfs_attr3_leaf_to_node(
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &bp1);
if (error)
goto out;
@@ -1237,7 +1317,7 @@ xfs_attr3_leaf_create(
ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->owner = cpu_to_be64(dp->i_ino);
+ hdr3->owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
@@ -1993,7 +2073,7 @@ xfs_attr3_leaf_toosmall(
if (blkno == 0)
continue;
error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
- blkno, &bp);
+ state->args->owner, blkno, &bp);
if (error)
return error;
@@ -2401,18 +2481,23 @@ xfs_attr3_leaf_lookup_int(
*/
if (entry->flags & XFS_ATTR_LOCAL) {
name_loc = xfs_attr3_leaf_name_local(leaf, probe);
- if (!xfs_attr_match(args, name_loc->namelen,
- name_loc->nameval, entry->flags))
+ if (!xfs_attr_match(args, entry->flags,
+ name_loc->nameval, name_loc->namelen,
+ &name_loc->nameval[name_loc->namelen],
+ be16_to_cpu(name_loc->valuelen)))
continue;
args->index = probe;
return -EEXIST;
} else {
+ unsigned int valuelen;
+
name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
- if (!xfs_attr_match(args, name_rmt->namelen,
- name_rmt->name, entry->flags))
+ valuelen = be32_to_cpu(name_rmt->valuelen);
+ if (!xfs_attr_match(args, entry->flags, name_rmt->name,
+ name_rmt->namelen, NULL, valuelen))
continue;
args->index = probe;
- args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtvaluelen = valuelen;
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
args->rmtblkcnt = xfs_attr3_rmt_blocks(
args->dp->i_mount,
@@ -2715,7 +2800,8 @@ xfs_attr3_leaf_clearflag(
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp);
if (error)
return error;
@@ -2779,7 +2865,8 @@ xfs_attr3_leaf_setflag(
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp);
if (error)
return error;
@@ -2838,7 +2925,8 @@ xfs_attr3_leaf_flipflags(
/*
* Read the block containing the "old" attr
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno, &bp1);
if (error)
return error;
@@ -2846,8 +2934,8 @@ xfs_attr3_leaf_flipflags(
* Read the block containing the "new" attr, if it is different
*/
if (args->blkno2 != args->blkno) {
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
- &bp2);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+ args->blkno2, &bp2);
if (error)
return error;
} else {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 9b9948639c0f..bac219589896 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -98,12 +98,14 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp);
void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
struct xfs_attr3_icleaf_hdr *to,
struct xfs_attr_leafblock *from);
void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
struct xfs_attr_leafblock *to,
struct xfs_attr3_icleaf_hdr *from);
+xfs_failaddr_t xfs_attr3_leaf_header_check(struct xfs_buf *bp,
+ xfs_ino_t owner);
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index ff0412828772..4c44ce1c8a64 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -43,19 +43,32 @@
* the logging system and therefore never have a log item.
*/
-/*
- * Each contiguous block has a header, so it is not just a simple attribute
- * length to FSB conversion.
- */
-int
+/* How many bytes can be stored in a remote value buffer? */
+inline unsigned int
+xfs_attr3_rmt_buf_space(
+ struct xfs_mount *mp)
+{
+ unsigned int blocksize = mp->m_attr_geo->blksize;
+
+ if (xfs_has_crc(mp))
+ return blocksize - sizeof(struct xfs_attr3_rmt_hdr);
+
+ return blocksize;
+}
+
+/* Compute number of fsblocks needed to store a remote attr value */
+unsigned int
xfs_attr3_rmt_blocks(
- struct xfs_mount *mp,
- int attrlen)
+ struct xfs_mount *mp,
+ unsigned int attrlen)
{
- if (xfs_has_crc(mp)) {
- int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
- return (attrlen + buflen - 1) / buflen;
- }
+ /*
+ * Each contiguous block has a header, so it is not just a simple
+ * attribute length to FSB conversion.
+ */
+ if (xfs_has_crc(mp))
+ return howmany(attrlen, xfs_attr3_rmt_buf_space(mp));
+
return XFS_B_TO_FSB(mp, attrlen);
}
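/*
 * Worked sketch (not part of this patch): on a CRC filesystem each
 * remote-value block loses a header, so the conversion is a ceiling
 * division over the usable space per block.  The block and header
 * sizes below are hypothetical.
 */
#include <stdio.h>

enum { SK_BLKSIZE = 4096, SK_RMT_HDR = 56 };	/* assumed header size */

static unsigned int sk_rmt_blocks(unsigned int attrlen, int has_crc)
{
	unsigned int space = has_crc ? SK_BLKSIZE - SK_RMT_HDR : SK_BLKSIZE;

	return (attrlen + space - 1) / space;	/* howmany(attrlen, space) */
}

int main(void)
{
	/* a 64k value needs 17 blocks with headers vs 16 without */
	printf("%u %u\n", sk_rmt_blocks(65536, 1), sk_rmt_blocks(65536, 0));
	return 0;
}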
@@ -92,7 +105,6 @@ xfs_attr3_rmt_verify(
struct xfs_mount *mp,
struct xfs_buf *bp,
void *ptr,
- int fsbsize,
xfs_daddr_t bno)
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
@@ -103,7 +115,7 @@ xfs_attr3_rmt_verify(
return __this_address;
if (be64_to_cpu(rmt->rm_blkno) != bno)
return __this_address;
- if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+ if (be32_to_cpu(rmt->rm_bytes) > mp->m_attr_geo->blksize - sizeof(*rmt))
return __this_address;
if (be32_to_cpu(rmt->rm_offset) +
be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
@@ -122,9 +134,9 @@ __xfs_attr3_rmt_read_verify(
{
struct xfs_mount *mp = bp->b_mount;
char *ptr;
- int len;
+ unsigned int len;
xfs_daddr_t bno;
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
if (!xfs_has_crc(mp))
@@ -141,7 +153,7 @@ __xfs_attr3_rmt_read_verify(
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -186,7 +198,7 @@ xfs_attr3_rmt_write_verify(
{
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int blksize = mp->m_attr_geo->blksize;
char *ptr;
int len;
xfs_daddr_t bno;
@@ -203,7 +215,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -280,30 +292,30 @@ xfs_attr_rmtval_copyout(
struct xfs_mount *mp,
struct xfs_buf *bp,
struct xfs_inode *dp,
- int *offset,
- int *valuelen,
+ xfs_ino_t owner,
+ unsigned int *offset,
+ unsigned int *valuelen,
uint8_t **dst)
{
char *src = bp->b_addr;
- xfs_ino_t ino = dp->i_ino;
xfs_daddr_t bno = xfs_buf_daddr(bp);
- int len = BBTOB(bp->b_length);
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int len = BBTOB(bp->b_length);
+ unsigned int blksize = mp->m_attr_geo->blksize;
ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
- int hdr_size = 0;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+ unsigned int hdr_size = 0;
+ unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp);
byte_cnt = min(*valuelen, byte_cnt);
if (xfs_has_crc(mp)) {
- if (xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+ if (xfs_attr3_rmt_hdr_ok(src, owner, *offset,
byte_cnt, bno)) {
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
- bno, *offset, byte_cnt, ino);
+ bno, *offset, byte_cnt, owner);
xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -330,20 +342,20 @@ xfs_attr_rmtval_copyin(
struct xfs_mount *mp,
struct xfs_buf *bp,
xfs_ino_t ino,
- int *offset,
- int *valuelen,
+ unsigned int *offset,
+ unsigned int *valuelen,
uint8_t **src)
{
char *dst = bp->b_addr;
xfs_daddr_t bno = xfs_buf_daddr(bp);
- int len = BBTOB(bp->b_length);
- int blksize = mp->m_attr_geo->blksize;
+ unsigned int len = BBTOB(bp->b_length);
+ unsigned int blksize = mp->m_attr_geo->blksize;
ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
- int hdr_size;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+ unsigned int hdr_size;
+ unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp);
byte_cnt = min(*valuelen, byte_cnt);
hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
@@ -389,12 +401,12 @@ xfs_attr_rmtval_get(
struct xfs_buf *bp;
xfs_dablk_t lblkno = args->rmtblkno;
uint8_t *dst = args->value;
- int valuelen;
+ unsigned int valuelen;
int nmap;
int error;
- int blkcnt = args->rmtblkcnt;
+ unsigned int blkcnt = args->rmtblkcnt;
int i;
- int offset = 0;
+ unsigned int offset = 0;
trace_xfs_attr_rmtval_get(args);
@@ -427,8 +439,7 @@ xfs_attr_rmtval_get(
return error;
error = xfs_attr_rmtval_copyout(mp, bp, args->dp,
- &offset, &valuelen,
- &dst);
+ args->owner, &offset, &valuelen, &dst);
xfs_buf_relse(bp);
if (error)
return error;
@@ -453,7 +464,7 @@ xfs_attr_rmt_find_hole(
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
int error;
- int blkcnt;
+ unsigned int blkcnt;
xfs_fileoff_t lfileoff = 0;
/*
@@ -482,11 +493,11 @@ xfs_attr_rmtval_set_value(
struct xfs_bmbt_irec map;
xfs_dablk_t lblkno;
uint8_t *src = args->value;
- int blkcnt;
- int valuelen;
+ unsigned int blkcnt;
+ unsigned int valuelen;
int nmap;
int error;
- int offset = 0;
+ unsigned int offset = 0;
/*
* Roll through the "value", copying the attribute value to the
@@ -522,8 +533,8 @@ xfs_attr_rmtval_set_value(
return error;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
- xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
- &valuelen, &src);
+ xfs_attr_rmtval_copyin(mp, bp, args->owner, &offset, &valuelen,
+ &src);
error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
xfs_buf_relse(bp);
@@ -626,7 +637,6 @@ xfs_attr_rmtval_set_blk(
if (error)
return error;
- ASSERT(nmap == 1);
ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
(map->br_startblock != HOLESTARTBLOCK));
@@ -646,7 +656,7 @@ xfs_attr_rmtval_invalidate(
struct xfs_da_args *args)
{
xfs_dablk_t lblkno;
- int blkcnt;
+ unsigned int blkcnt;
int error;
/*
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index d097ec6c4dc3..e3c6c7d774bf 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -6,7 +6,13 @@
#ifndef __XFS_ATTR_REMOTE_H__
#define __XFS_ATTR_REMOTE_H__
-int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+unsigned int xfs_attr3_rmt_blocks(struct xfs_mount *mp, unsigned int attrlen);
+
+/* Number of rmt blocks needed to store the maximally sized attr value */
+static inline unsigned int xfs_attr3_max_rmt_blocks(struct xfs_mount *mp)
+{
+ return xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+}
int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index bc4422223024..73bdc0e55682 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -16,6 +16,7 @@ typedef struct xfs_attr_sf_sort {
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
xfs_dahash_t hash; /* this entry's hash value */
unsigned char *name; /* name value, pointer into buffer */
+ void *value;
} xfs_attr_sf_sort_t;
#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 656c95a22f2e..3b3206d312d6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -779,7 +779,7 @@ xfs_bmap_local_to_extents_empty(
}
-STATIC int /* error */
+int /* error */
xfs_bmap_local_to_extents(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
@@ -789,7 +789,8 @@ xfs_bmap_local_to_extents(
void (*init_fn)(struct xfs_trans *tp,
struct xfs_buf *bp,
struct xfs_inode *ip,
- struct xfs_ifork *ifp))
+ struct xfs_ifork *ifp, void *priv),
+ void *priv)
{
int error = 0;
int flags; /* logging flags returned */
@@ -850,7 +851,7 @@ xfs_bmap_local_to_extents(
* log here. Note that init_fn must also set the buffer log item type
* correctly.
*/
- init_fn(tp, bp, ip, ifp);
+ init_fn(tp, bp, ip, ifp, priv);
/* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
@@ -976,13 +977,14 @@ xfs_bmap_add_attrfork_local(
dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
+ dargs.owner = ip->i_ino;
return xfs_dir2_sf_to_block(&dargs);
}
if (S_ISLNK(VFS_I(ip)->i_mode))
return xfs_bmap_local_to_extents(tp, ip, 1, flags,
- XFS_DATA_FORK,
- xfs_symlink_local_to_remote);
+ XFS_DATA_FORK, xfs_symlink_local_to_remote,
+ NULL);
/* should only be called for types that support local format data */
ASSERT(0);
@@ -1023,40 +1025,29 @@ xfs_bmap_set_attrforkoff(
}
/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
+ * Convert inode from non-attributed to attributed. Caller must hold the
+ * ILOCK_EXCL and the file cannot have an attr fork.
*/
int /* error code */
xfs_bmap_add_attrfork(
- xfs_inode_t *ip, /* incore inode pointer */
+ struct xfs_trans *tp,
+ struct xfs_inode *ip, /* incore inode pointer */
int size, /* space new attribute needs */
int rsvd) /* xact may use reserved blks */
{
- xfs_mount_t *mp; /* mount structure */
- xfs_trans_t *tp; /* transaction pointer */
- int blks; /* space reservation */
+ struct xfs_mount *mp = tp->t_mountp;
int version = 1; /* superblock attr version */
int logflags; /* logging flags */
int error; /* error return value */
- ASSERT(xfs_inode_has_attr_fork(ip) == 0);
-
- mp = ip->i_mount;
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-
- blks = XFS_ADDAFORK_SPACE_RES(mp);
-
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
- rsvd, &tp);
- if (error)
- return error;
- if (xfs_inode_has_attr_fork(ip))
- goto trans_cancel;
+ ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
if (error)
- goto trans_cancel;
+ return error;
xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
logflags = 0;
@@ -1077,7 +1068,7 @@ xfs_bmap_add_attrfork(
if (logflags)
xfs_trans_log_inode(tp, ip, logflags);
if (error)
- goto trans_cancel;
+ return error;
if (!xfs_has_attr(mp) ||
(!xfs_has_attr2(mp) && version == 2)) {
bool log_sb = false;
@@ -1096,14 +1087,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-
-trans_cancel:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
+ return 0;
}
/*
@@ -1586,6 +1570,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1616,6 +1601,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1650,6 +1636,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1684,6 +1671,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1722,6 +1710,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING:
@@ -1812,6 +1801,7 @@ xfs_bmap_add_extent_delay_real(
xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
xfs_iext_next(ifp, &bma->icur);
xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
+ ASSERT(da_new <= da_old);
break;
case BMAP_RIGHT_FILLING:
@@ -1861,6 +1851,7 @@ xfs_bmap_add_extent_delay_real(
PREV.br_blockcount = temp;
xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
xfs_iext_next(ifp, &bma->icur);
+ ASSERT(da_new <= da_old);
break;
case 0:
@@ -1975,7 +1966,7 @@ xfs_bmap_add_extent_delay_real(
}
if (da_new != da_old)
- xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+ xfs_mod_delalloc(bma->ip, 0, (int64_t)da_new - da_old);
if (bma->cur) {
da_new += bma->cur->bc_bmap.allocated;
@@ -1983,11 +1974,10 @@ xfs_bmap_add_extent_delay_real(
}
/* adjust for changes in reserved delayed indirect blocks */
- if (da_new != da_old) {
- ASSERT(state == 0 || da_new < da_old);
- error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
- false);
- }
+ if (da_new < da_old)
+ xfs_add_fdblocks(mp, da_old - da_new);
+ else if (da_new > da_old)
+ error = xfs_dec_fdblocks(mp, da_new - da_old, true);
xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
@@ -2688,12 +2678,12 @@ xfs_bmap_add_extent_hole_delay(
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
- xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
- false);
+ xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
+
/*
* Nothing to do for disk quota accounting here.
*/
- xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
+ xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
}
}
@@ -3370,7 +3360,7 @@ xfs_bmap_alloc_account(
* yet.
*/
if (ap->wasdel) {
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+ xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0);
return;
}
@@ -3394,7 +3384,7 @@ xfs_bmap_alloc_account(
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel) {
ap->ip->i_delayed_blks -= ap->length;
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+ xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0);
fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT;
} else {
fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
@@ -4066,6 +4056,7 @@ xfs_bmapi_reserve_delalloc(
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
+ uint64_t fdblocks;
int error;
xfs_fileoff_t aoff = off;
@@ -4108,17 +4099,21 @@ xfs_bmapi_reserve_delalloc(
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
ASSERT(indlen > 0);
- error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
- if (error)
- goto out_unreserve_quota;
+ fdblocks = indlen;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen));
+ if (error)
+ goto out_unreserve_quota;
+ } else {
+ fdblocks += alen;
+ }
- error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
+ error = xfs_dec_fdblocks(mp, fdblocks, false);
if (error)
- goto out_unreserve_blocks;
-
+ goto out_unreserve_frextents;
ip->i_delayed_blks += alen;
- xfs_mod_delalloc(ip->i_mount, alen + indlen);
+ xfs_mod_delalloc(ip, alen, indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
@@ -4139,8 +4134,9 @@ xfs_bmapi_reserve_delalloc(
return 0;
-out_unreserve_blocks:
- xfs_mod_fdblocks(mp, alen, false);
+out_unreserve_frextents:
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
@@ -4191,26 +4187,10 @@ xfs_bmapi_allocate(
struct xfs_mount *mp = bma->ip->i_mount;
int whichfork = xfs_bmapi_whichfork(bma->flags);
struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
- int tmp_logflags = 0;
int error;
ASSERT(bma->length > 0);
-
- /*
- * For the wasdelay case, we could also just allocate the stuff asked
- * for in this bmap call but that wouldn't be as good.
- */
- if (bma->wasdel) {
- bma->length = (xfs_extlen_t)bma->got.br_blockcount;
- bma->offset = bma->got.br_startoff;
- if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
- bma->prev.br_startoff = NULLFILEOFF;
- } else {
- bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
- if (!bma->eof)
- bma->length = XFS_FILBLKS_MIN(bma->length,
- bma->got.br_startoff - bma->offset);
- }
+ ASSERT(bma->length <= XFS_MAX_BMBT_EXTLEN);
if (bma->flags & XFS_BMAPI_CONTIG)
bma->minlen = bma->length;
@@ -4226,8 +4206,15 @@ xfs_bmapi_allocate(
} else {
error = xfs_bmap_alloc_userdata(bma);
}
- if (error || bma->blkno == NULLFSBLOCK)
+ if (error)
return error;
+ if (bma->blkno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ if (WARN_ON_ONCE(!xfs_valid_startblock(bma->ip, bma->blkno))) {
+ xfs_bmap_mark_sick(bma->ip, whichfork);
+ return -EFSCORRUPTED;
+ }
if (bma->flags & XFS_BMAPI_ZERO) {
error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
@@ -4260,8 +4247,6 @@ xfs_bmapi_allocate(
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
whichfork, &bma->icur, &bma->cur, &bma->got,
&bma->logflags, bma->flags);
-
- bma->logflags |= tmp_logflags;
if (error)
return error;
@@ -4406,6 +4391,15 @@ xfs_bmapi_finish(
* extent state if necessary. Details behaviour is controlled by the flags
* parameter. Only allocates blocks from a single allocation group, to avoid
* locking problems.
+ *
+ * Returns 0 on success and places the extent mappings in mval. nmaps is used
+ * as an input/output parameter where the caller specifies the maximum number
+ * of mappings that may be returned and xfs_bmapi_write passes back the number
+ * of mappings (including existing mappings) it found.
+ *
+ * Returns a negative error code on failure, including -ENOSPC when it could not
+ * allocate any blocks and -ENOSR when it did allocate blocks to convert a
+ * delalloc range, but those blocks were before the passed-in range.
*/
int
xfs_bmapi_write(
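/*
 * Caller sketch (not part of this patch), showing how the return
 * convention documented above would be consumed.  Variable setup is
 * elided and the error labels are hypothetical.
 */
	struct xfs_bmbt_irec	map;
	int			nmaps = 1;	/* in: max mappings wanted */
	int			error;

	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, flags,
			total, &map, &nmaps);
	if (error == -ENOSPC) {
		/* nothing at all could be allocated */
		goto out_trans_cancel;
	}
	if (error == -ENOSR) {
		/*
		 * Blocks were allocated, but only to convert a delalloc
		 * extent that ends before offset_fsb; retry the mapping.
		 */
		goto retry;
	}
	if (error)
		goto out_trans_cancel;
	/* on success, nmaps holds the number of mappings filled in */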
@@ -4524,20 +4518,33 @@ xfs_bmapi_write(
* allocation length request (which can be 64 bits in
* length) and the bma length request, which is
* xfs_extlen_t and therefore 32 bits. Hence we have to
- * check for 32-bit overflows and handle them here.
+ * be careful and do the min() using the larger type to
+ * avoid overflows.
*/
- if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
- bma.length = XFS_MAX_BMBT_EXTLEN;
- else
- bma.length = len;
+ bma.length = XFS_FILBLKS_MIN(len, XFS_MAX_BMBT_EXTLEN);
+
+ if (wasdelay) {
+ bma.length = XFS_FILBLKS_MIN(bma.length,
+ bma.got.br_blockcount -
+ (bno - bma.got.br_startoff));
+ } else {
+ if (!eof)
+ bma.length = XFS_FILBLKS_MIN(bma.length,
+ bma.got.br_startoff - bno);
+ }
- ASSERT(len > 0);
ASSERT(bma.length > 0);
error = xfs_bmapi_allocate(&bma);
- if (error)
+ if (error) {
+ /*
+ * If we already allocated space in a previous
+ * iteration, return what we got so far when
+ * running out of space.
+ */
+ if (error == -ENOSPC && bma.nallocs)
+ break;
goto error0;
- if (bma.blkno == NULLFSBLOCK)
- break;
+ }
/*
* If this is a CoW allocation, record the data in
@@ -4575,7 +4582,6 @@ xfs_bmapi_write(
if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
eof = true;
}
- *nmap = n;
error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
whichfork);
@@ -4586,7 +4592,22 @@ xfs_bmapi_write(
ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
xfs_bmapi_finish(&bma, whichfork, 0);
xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+ orig_nmap, n);
+
+ /*
+ * When converting delayed allocations, xfs_bmapi_allocate ignores
+ * the passed in bno and always converts from the start of the found
+ * delalloc extent.
+ *
+ * To avoid a successful return with *nmap set to 0, return the magic
+ * -ENOSR error code for this particular case so that the caller can
+ * handle it.
+ */
+ if (!n) {
+ ASSERT(bma.nallocs >= *nmap);
+ return -ENOSR;
+ }
+ *nmap = n;
return 0;
error0:
xfs_bmapi_finish(&bma, whichfork, error);
@@ -4599,8 +4620,8 @@ error0:
* invocations to allocate the target offset if a large enough physical extent
* is not available.
*/
-int
-xfs_bmapi_convert_delalloc(
+static int
+xfs_bmapi_convert_one_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_off_t offset,
@@ -4630,11 +4651,8 @@ xfs_bmapi_convert_delalloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_iext_count_may_overflow(ip, whichfork,
+ error = xfs_iext_count_extend(tp, ip, whichfork,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -4657,19 +4675,25 @@ xfs_bmapi_convert_delalloc(
if (!isnullstartblock(bma.got.br_startblock)) {
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
bma.tp = tp;
bma.ip = ip;
bma.wasdel = true;
- bma.offset = bma.got.br_startoff;
- bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
- XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
/*
+ * Always convert from the start of the delalloc extent, even if
+ * that is outside the passed-in range, so that large contiguous
+ * extents are created on disk.
+ */
+ bma.offset = bma.got.br_startoff;
+ bma.length = bma.got.br_blockcount;
+
+ /*
* When we're converting the delalloc reservations backing dirty pages
* in the page cache, we must be careful about how we create the new
* extents:
@@ -4693,22 +4717,14 @@ xfs_bmapi_convert_delalloc(
if (error)
goto out_finish;
- error = -ENOSPC;
- if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
- goto out_finish;
- if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) {
- xfs_bmap_mark_sick(ip, whichfork);
- error = -EFSCORRUPTED;
- goto out_finish;
- }
-
XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
@@ -4731,6 +4747,36 @@ out_trans_cancel:
return error;
}
+/*
+ * Pass in a delalloc extent and convert it to real extents; return the real
+ * extent that maps offset_fsb in iomap.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ loff_t offset,
+ struct iomap *iomap,
+ unsigned int *seq)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into iomap. Allocate in a loop because it may
+ * take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset,
+ iomap, seq);
+ if (error)
+ return error;
+ } while (iomap->offset + iomap->length <= offset);
+
+ return 0;
+}
+
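/*
 * Worked sketch (not part of this patch): if a single delalloc
 * reservation backs bytes [0, 1M) but free space is fragmented, the
 * first pass might convert only [0, 256k).  For a write at offset 600k
 * the returned mapping still ends at or before the target, so the loop
 * above converts the next chunk, exiting only once the mapping finally
 * covers the requested offset.
 */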
int
xfs_bmapi_remap(
struct xfs_trans *tp,
@@ -4822,32 +4868,18 @@ error0:
* ores == 1). The number of stolen blocks is returned. The availability and
* subsequent accounting of stolen blocks is the responsibility of the caller.
*/
-static xfs_filblks_t
+static void
xfs_bmap_split_indlen(
xfs_filblks_t ores, /* original res. */
xfs_filblks_t *indlen1, /* ext1 worst indlen */
- xfs_filblks_t *indlen2, /* ext2 worst indlen */
- xfs_filblks_t avail) /* stealable blocks */
+ xfs_filblks_t *indlen2) /* ext2 worst indlen */
{
xfs_filblks_t len1 = *indlen1;
xfs_filblks_t len2 = *indlen2;
xfs_filblks_t nres = len1 + len2; /* new total res. */
- xfs_filblks_t stolen = 0;
xfs_filblks_t resfactor;
/*
- * Steal as many blocks as we can to try and satisfy the worst case
- * indlen for both new extents.
- */
- if (ores < nres && avail)
- stolen = XFS_FILBLKS_MIN(nres - ores, avail);
- ores += stolen;
-
- /* nothing else to do if we've satisfied the new reservation */
- if (ores >= nres)
- return stolen;
-
- /*
* We can't meet the total required reservation for the two extents.
* Calculate the percent of the overall shortage between both extents
* and apply this percentage to each of the requested indlen values.
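/*
 * Worked sketch (not part of this patch), with hypothetical numbers:
 * if ores = 10 while indlen1 = 8 and indlen2 = 4 (nres = 12), the
 * overall reservation is 2 blocks short.  Scaling both requests by
 * ores/nres gives roughly len1 = 6 and len2 = 3, and the remaining
 * block of slop is handed to one side so that len1 + len2 == ores.
 */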
@@ -4891,11 +4923,9 @@ xfs_bmap_split_indlen(
*indlen1 = len1;
*indlen2 = len2;
-
- return stolen;
}
-int
+void
xfs_bmap_del_extent_delay(
struct xfs_inode *ip,
int whichfork,
@@ -4908,9 +4938,9 @@ xfs_bmap_del_extent_delay(
struct xfs_bmbt_irec new;
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
- xfs_filblks_t got_indlen, new_indlen, stolen;
+ xfs_filblks_t got_indlen, new_indlen, stolen = 0;
uint32_t state = xfs_bmap_fork_to_state(whichfork);
- int error = 0;
+ uint64_t fdblocks;
bool isrt;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4925,18 +4955,12 @@ xfs_bmap_del_extent_delay(
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
- if (isrt)
- xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
-
/*
* Update the inode delalloc counter now and wait to update the
* sb counters as we might have to borrow some blocks for the
* indirect block accounting.
*/
- ASSERT(!isrt);
- error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
- if (error)
- return error;
+ xfs_quota_unreserve_blkres(ip, del->br_blockcount);
ip->i_delayed_blks -= del->br_blockcount;
if (got->br_startoff == del->br_startoff)
@@ -4990,8 +5014,24 @@ xfs_bmap_del_extent_delay(
new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount);
WARN_ON_ONCE(!got_indlen || !new_indlen);
- stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen,
- del->br_blockcount);
+ /*
+ * Steal as many blocks as we can to try and satisfy the worst
+ * case indlen for both new extents.
+ *
+ * However, we can't just steal reservations from the data
+ * blocks if this is an RT inode, as the data and metadata
+ * blocks come from different pools. We'll have to live with an
+ * under-filled indirect reservation in this case.
+ */
+ da_new = got_indlen + new_indlen;
+ if (da_new > da_old && !isrt) {
+ stolen = XFS_FILBLKS_MIN(da_new - da_old,
+ del->br_blockcount);
+ da_old += stolen;
+ }
+ if (da_new > da_old)
+ xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen);
+ da_new = got_indlen + new_indlen;
got->br_startblock = nullstartblock((int)got_indlen);
@@ -5003,20 +5043,21 @@ xfs_bmap_del_extent_delay(
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, &new, state);
- da_new = got_indlen + new_indlen - stolen;
del->br_blockcount -= stolen;
break;
}
ASSERT(da_old >= da_new);
da_diff = da_old - da_new;
- if (!isrt)
- da_diff += del->br_blockcount;
- if (da_diff) {
- xfs_mod_fdblocks(mp, da_diff, false);
- xfs_mod_delalloc(mp, -da_diff);
- }
- return error;
+ fdblocks = da_diff;
+
+ if (isrt)
+ xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
+ else
+ fdblocks += del->br_blockcount;
+
+ xfs_add_fdblocks(mp, fdblocks);
+ xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
}
void
@@ -5107,8 +5148,7 @@ xfs_bmap_del_extent_real(
{
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
- int do_fx; /* free extent at end of routine */
- int error; /* error return value */
+ int error = 0; /* error return value */
struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
@@ -5151,20 +5191,10 @@ xfs_bmap_del_extent_real(
return -ENOSPC;
*logflagsp = XFS_ILOG_CORE;
- if (xfs_ifork_is_realtime(ip, whichfork)) {
- if (!(bflags & XFS_BMAPI_REMAP)) {
- error = xfs_rtfree_blocks(tp, del->br_startblock,
- del->br_blockcount);
- if (error)
- return error;
- }
-
- do_fx = 0;
+ if (xfs_ifork_is_realtime(ip, whichfork))
qfield = XFS_TRANS_DQ_RTBCOUNT;
- } else {
- do_fx = 1;
+ else
qfield = XFS_TRANS_DQ_BCOUNT;
- }
nblks = del->br_blockcount;
del_endblock = del->br_startblock + del->br_blockcount;
@@ -5312,18 +5342,29 @@ xfs_bmap_del_extent_real(
/*
* If we need to, add to list of extents to delete.
*/
- if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+ if (!(bflags & XFS_BMAPI_REMAP)) {
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
+ } else if (xfs_ifork_is_realtime(ip, whichfork)) {
+ /*
+ * Ensure the bitmap and summary inodes are locked
+ * and joined to the transaction before modifying them.
+ */
+ if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) {
+ tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED;
+ xfs_rtbitmap_lock(tp, mp);
+ }
+ error = xfs_rtfree_blocks(tp, del->br_startblock,
+ del->br_blockcount);
} else {
error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
XFS_AG_RESV_NONE,
((bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN));
- if (error)
- return error;
}
+ if (error)
+ return error;
}
/*
@@ -5414,16 +5455,6 @@ __xfs_bunmapi(
} else
cur = NULL;
- if (isrt) {
- /*
- * Synchronize by locking the bitmap inode.
- */
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
- }
-
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
(nexts == 0 || extno < nexts)) {
@@ -5584,18 +5615,16 @@ __xfs_bunmapi(
delete:
if (wasdel) {
- error = xfs_bmap_del_extent_delay(ip, whichfork, &icur,
- &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,
flags);
logflags |= tmp_logflags;
+ if (error)
+ goto error0;
}
- if (error)
- goto error0;
-
end = del.br_startoff - 1;
nodelete:
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f7662595309d..667b0c2b33d1 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -158,7 +158,7 @@ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec)
* Return true if the extent is a real, allocated extent, or false if it is a
* delayed allocation, and unwritten extent or a hole.
*/
-static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec)
{
return xfs_bmap_is_real_extent(irec) &&
irec->br_state != XFS_EXT_UNWRITTEN;
@@ -176,9 +176,16 @@ int xfs_bmap_longest_free_extent(struct xfs_perag *pag,
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
-int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip,
+ int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork);
+int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_extlen_t total, int *logflagsp, int whichfork,
+ void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp,
+ void *priv),
+ void *priv);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -195,7 +202,7 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
-int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
+void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 718d071bb21a..16a529a88780 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -252,6 +252,51 @@ xfs_da3_node_verify(
return NULL;
}
+xfs_failaddr_t
+xfs_da3_node_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+ if (hdr3->hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+xfs_failaddr_t
+xfs_da3_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_blkinfo *hdr = bp->b_addr;
+
+ if (!xfs_has_crc(mp))
+ return NULL;
+
+ switch (hdr->magic) {
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ return xfs_attr3_leaf_header_check(bp, owner);
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ return xfs_da3_node_header_check(bp, owner);
+ case cpu_to_be16(XFS_DIR3_LEAF1_MAGIC):
+ case cpu_to_be16(XFS_DIR3_LEAFN_MAGIC):
+ return xfs_dir3_leaf_header_check(bp, owner);
+ }
+
+ ASSERT(0);
+ return NULL;
+}
+
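/*
 * Sketch (not part of this patch): the caller pattern the two helpers
 * above enable, repeated at each node read in the hunks below — verify
 * the block's recorded owner and treat a mismatch as on-disk
 * corruption.
 */
	error = xfs_da3_node_read(tp, dp, blkno, &bp, whichfork);
	if (error)
		return error;
	fa = xfs_da3_header_check(bp, args->owner);
	if (fa) {
		__xfs_buf_mark_corrupt(bp, fa);
		xfs_trans_brelse(tp, bp);
		xfs_da_mark_sick(args);
		return -EFSCORRUPTED;
	}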
static void
xfs_da3_node_write_verify(
struct xfs_buf *bp)
@@ -486,7 +531,7 @@ xfs_da3_node_create(
memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+ hdr3->info.owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
ichdr.magic = XFS_DA_NODE_MAGIC;
@@ -1199,6 +1244,7 @@ xfs_da3_root_join(
struct xfs_da3_icnode_hdr oldroothdr;
int error;
struct xfs_inode *dp = state->args->dp;
+ xfs_failaddr_t fa;
trace_xfs_da_root_join(state->args);
@@ -1225,6 +1271,13 @@ xfs_da3_root_join(
error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
/*
@@ -1259,6 +1312,7 @@ xfs_da3_node_toosmall(
struct xfs_da_blkinfo *info;
xfs_dablk_t blkno;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
struct xfs_da3_icnode_hdr nodehdr;
int count;
int forward;
@@ -1333,6 +1387,13 @@ xfs_da3_node_toosmall(
state->args->whichfork);
if (error)
return error;
+ fa = xfs_da3_node_header_check(bp, state->args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(state->args->trans, bp);
+ xfs_da_mark_sick(state->args);
+ return -EFSCORRUPTED;
+ }
node = bp->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node);
@@ -1591,6 +1652,7 @@ xfs_da3_node_lookup_int(
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
struct xfs_da_args *args;
+ xfs_failaddr_t fa;
xfs_dablk_t blkno;
xfs_dahash_t hashval;
xfs_dahash_t btreehashval;
@@ -1629,6 +1691,12 @@ xfs_da3_node_lookup_int(
if (magic == XFS_ATTR_LEAF_MAGIC ||
magic == XFS_ATTR3_LEAF_MAGIC) {
+ fa = xfs_attr3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_ATTR_LEAF_MAGIC;
blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
break;
@@ -1636,6 +1704,12 @@ xfs_da3_node_lookup_int(
if (magic == XFS_DIR2_LEAFN_MAGIC ||
magic == XFS_DIR3_LEAFN_MAGIC) {
+ fa = xfs_dir3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_DIR2_LEAFN_MAGIC;
blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
blk->bp, NULL);
@@ -1648,6 +1722,13 @@ xfs_da3_node_lookup_int(
return -EFSCORRUPTED;
}
+ fa = xfs_da3_node_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
+
blk->magic = XFS_DA_NODE_MAGIC;
/*
@@ -1820,6 +1901,7 @@ xfs_da3_blk_link(
struct xfs_da_blkinfo *tmp_info;
struct xfs_da_args *args;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
int before = 0;
int error;
struct xfs_inode *dp = state->args->dp;
@@ -1863,6 +1945,13 @@ xfs_da3_blk_link(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == old_info->magic);
@@ -1884,6 +1973,13 @@ xfs_da3_blk_link(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == old_info->magic);
@@ -1913,6 +2009,7 @@ xfs_da3_blk_unlink(
struct xfs_da_blkinfo *tmp_info;
struct xfs_da_args *args;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
int error;
/*
@@ -1943,6 +2040,13 @@ xfs_da3_blk_unlink(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
@@ -1960,6 +2064,13 @@ xfs_da3_blk_unlink(
&bp, args->whichfork);
if (error)
return error;
+ fa = xfs_da3_header_check(bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_trans_brelse(args->trans, bp);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
@@ -1996,6 +2107,7 @@ xfs_da3_path_shift(
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
xfs_dablk_t blkno = 0;
int level;
int error;
@@ -2074,6 +2186,12 @@ xfs_da3_path_shift(
switch (be16_to_cpu(info->magic)) {
case XFS_DA_NODE_MAGIC:
case XFS_DA3_NODE_MAGIC:
+ fa = xfs_da3_node_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_DA_NODE_MAGIC;
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
bp->b_addr);
@@ -2087,6 +2205,12 @@ xfs_da3_path_shift(
break;
case XFS_ATTR_LEAF_MAGIC:
case XFS_ATTR3_LEAF_MAGIC:
+ fa = xfs_attr3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_ATTR_LEAF_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
@@ -2094,6 +2218,12 @@ xfs_da3_path_shift(
break;
case XFS_DIR2_LEAFN_MAGIC:
case XFS_DIR3_LEAFN_MAGIC:
+ fa = xfs_dir3_leaf_header_check(blk->bp, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(blk->bp, fa);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
blk->magic = XFS_DIR2_LEAFN_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
@@ -2167,8 +2297,8 @@ xfs_da_grow_inode_int(
struct xfs_inode *dp = args->dp;
int w = args->whichfork;
xfs_rfsblock_t nblks = dp->i_nblocks;
- struct xfs_bmbt_irec map, *mapp;
- int nmap, error, got, i, mapi;
+ struct xfs_bmbt_irec map, *mapp = &map;
+ int nmap, error, got, i, mapi = 1;
/*
* Find a spot in the file space to put the new block.
@@ -2184,14 +2314,7 @@ xfs_da_grow_inode_int(
error = xfs_bmapi_write(tp, dp, *bno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->total, &map, &nmap);
- if (error)
- return error;
-
- ASSERT(nmap <= 1);
- if (nmap == 1) {
- mapp = &map;
- mapi = 1;
- } else if (nmap == 0 && count > 1) {
+ if (error == -ENOSPC && count > 1) {
xfs_fileoff_t b;
int c;
@@ -2209,16 +2332,13 @@ xfs_da_grow_inode_int(
args->total, &mapp[mapi], &nmap);
if (error)
goto out_free_map;
- if (nmap < 1)
- break;
mapi += nmap;
b = mapp[mapi - 1].br_startoff +
mapp[mapi - 1].br_blockcount;
}
- } else {
- mapi = 0;
- mapp = NULL;
}
+ if (error)
+ goto out_free_map;
/*
* Count the blocks we got, make sure it matches the total.
@@ -2290,6 +2410,7 @@ xfs_da3_swap_lastblock(
struct xfs_buf *last_buf;
struct xfs_buf *sib_buf;
struct xfs_buf *par_buf;
+ xfs_failaddr_t fa;
xfs_dahash_t dead_hash;
xfs_fileoff_t lastoff;
xfs_dablk_t dead_blkno;
@@ -2326,6 +2447,14 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w);
if (error)
return error;
+ fa = xfs_da3_header_check(last_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(last_buf, fa);
+ xfs_trans_brelse(tp, last_buf);
+ xfs_da_mark_sick(args);
+ return -EFSCORRUPTED;
+ }
+
/*
* Copy the last block into the dead buffer and log it.
*/
@@ -2364,6 +2493,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
+ fa = xfs_da3_header_check(sib_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(sib_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
sib_info = sib_buf->b_addr;
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->forw) != last_blkno ||
@@ -2385,6 +2521,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
+ fa = xfs_da3_header_check(sib_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(sib_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
sib_info = sib_buf->b_addr;
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->back) != last_blkno ||
@@ -2408,6 +2551,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
+ fa = xfs_da3_node_header_check(par_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(par_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
par_node = par_buf->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp,
@@ -2457,6 +2607,13 @@ xfs_da3_swap_lastblock(
error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
+ fa = xfs_da3_node_header_check(par_buf, args->owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(par_buf, fa);
+ xfs_da_mark_sick(args);
+ error = -EFSCORRUPTED;
+ goto done;
+ }
par_node = par_buf->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 706baf36e175..354d5d65043e 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -54,17 +54,24 @@ enum xfs_dacmp {
*/
typedef struct xfs_da_args {
struct xfs_da_geometry *geo; /* da block geometry */
- const uint8_t *name; /* string (maybe not NULL terminated) */
- int namelen; /* length of string (maybe no NULL) */
- uint8_t filetype; /* filetype of inode for directories */
+ const uint8_t *name; /* string (maybe not NULL terminated) */
+ const uint8_t *new_name; /* new attr name */
void *value; /* set of bytes (maybe contain NULLs) */
- int valuelen; /* length of value */
- unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
- unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
- xfs_dahash_t hashval; /* hash value of name */
- xfs_ino_t inumber; /* input/output inode number */
+ void *new_value; /* new xattr value (may contain NULLs) */
struct xfs_inode *dp; /* directory inode to manipulate */
struct xfs_trans *trans; /* current trans (changes over time) */
+
+ xfs_ino_t inumber; /* input/output inode number */
+ xfs_ino_t owner; /* inode that owns the dir/attr data */
+
+ int valuelen; /* length of value */
+ int new_valuelen; /* length of new_value */
+ uint8_t filetype; /* filetype of inode for directories */
+ uint8_t op_flags; /* operation flags */
+ uint8_t attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
+ short namelen; /* length of string (maybe no NULL) */
+ short new_namelen; /* length of new attr name */
+ xfs_dahash_t hashval; /* hash value of name */
xfs_extlen_t total; /* total blocks needed, for 1st bmap */
int whichfork; /* data or attribute fork */
xfs_dablk_t blkno; /* blkno of attr leaf of interest */
@@ -77,7 +84,6 @@ typedef struct xfs_da_args {
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
int rmtvaluelen2; /* remote attr value length in bytes */
- uint32_t op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
@@ -89,10 +95,8 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
-#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
-#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
-#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
-#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
+#define XFS_DA_OP_RECOVERY (1u << 5) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 6) /* Use intent items to track op */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -100,8 +104,6 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_NOTIME, "NOTIME" }, \
- { XFS_DA_OP_REMOVE, "REMOVE" }, \
{ XFS_DA_OP_RECOVERY, "RECOVERY" }, \
{ XFS_DA_OP_LOGGED, "LOGGED" }
@@ -235,6 +237,8 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
+xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner);
extern struct kmem_cache *xfs_da_state_cache;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 060e5c96b70f..86de99e2f757 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -714,12 +714,30 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT)
#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
-#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | \
+ XFS_ATTR_SECURE | \
+ XFS_ATTR_PARENT)
+
+/* Private attr namespaces not exposed to userspace */
+#define XFS_ATTR_PRIVATE_NSP_MASK (XFS_ATTR_PARENT)
+
+#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \
+ XFS_ATTR_LOCAL | \
+ XFS_ATTR_INCOMPLETE)
+
+#define XFS_ATTR_NAMESPACE_STR \
+ { XFS_ATTR_LOCAL, "local" }, \
+ { XFS_ATTR_ROOT, "root" }, \
+ { XFS_ATTR_SECURE, "secure" }, \
+ { XFS_ATTR_PARENT, "parent" }
/*
* Alignment for namelist and valuelist entries (since they are mixed
@@ -862,9 +880,7 @@ struct xfs_attr3_rmt_hdr {
#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
-#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
- ((bufsize) - (xfs_has_crc((mp)) ? \
- sizeof(struct xfs_attr3_rmt_hdr) : 0))
+unsigned int xfs_attr3_rmt_buf_space(struct xfs_mount *mp);
/* Number of bytes in a directory block. */
static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
@@ -875,4 +891,17 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
struct xfs_da3_blkinfo *hdr3);
+/*
+ * Parent pointer attribute format definition
+ *
+ * The xattr name contains the dirent name.
+ * The xattr value encodes the parent inode number and generation to ease
+ * opening parents by handle.
+ * The xattr hashval is xfs_dir2_namehash() ^ p_ino
+ */
+struct xfs_parent_rec {
+ __be64 p_ino;
+ __be32 p_gen;
+} __packed;
+
#endif /* __XFS_DA_FORMAT_H__ */
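[Note: a minimal sketch of initializing the record defined above, assuming only the kernel's standard endianness helpers; the helper name is invented for illustration and is not part of this patch:

	/* Illustrative only: pack a parent pointer value in on-disk order. */
	static inline void
	xfs_parent_rec_fill(
		struct xfs_parent_rec	*rec,
		xfs_ino_t		p_ino,
		uint32_t		p_gen)
	{
		rec->p_ino = cpu_to_be64(p_ino);
		rec->p_gen = cpu_to_be32(p_gen);
	}
]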
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c13276095cc0..4a078e07e1a0 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -27,6 +27,7 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans_priv.h"
+#include "xfs_exchmaps.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -1091,7 +1092,11 @@ xfs_defer_ops_continue(
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
/* Lock the captured resources to the new transaction. */
- if (dfc->dfc_held.dr_inos == 2)
+ if (dfc->dfc_held.dr_inos > 2) {
+ xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos);
+ xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos,
+ XFS_ILOCK_EXCL);
+ } else if (dfc->dfc_held.dr_inos == 2)
xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
else if (dfc->dfc_held.dr_inos == 1)
@@ -1176,6 +1181,10 @@ xfs_defer_init_item_caches(void)
error = xfs_attr_intent_init_cache();
if (error)
goto err;
+ error = xfs_exchmaps_intent_init_cache();
+ if (error)
+ goto err;
+
return 0;
err:
xfs_defer_destroy_item_caches();
@@ -1186,6 +1195,7 @@ err:
void
xfs_defer_destroy_item_caches(void)
{
+ xfs_exchmaps_intent_destroy_cache();
xfs_attr_intent_destroy_cache();
xfs_extfree_intent_destroy_cache();
xfs_bmap_intent_destroy_cache();
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 18a9fb92dde8..8b338031e487 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -72,12 +72,18 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
extern const struct xfs_defer_op_type xfs_attr_defer_type;
-
+extern const struct xfs_defer_op_type xfs_exchmaps_defer_type;
/*
* Deferred operation item relogging limits.
*/
-#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+
+/*
+ * Rename w/ parent pointers can require up to 5 inodes with deferred ops to
+ * be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip.
+ * These inodes are locked in sorted order by their inode numbers.
+ */
+#define XFS_DEFER_OPS_NR_INODES 5
#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
/* Resources that must be held across a transaction roll. */
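[Note: a hedged sketch of the five-inode capture described in the comment above, using the sort/lock pair that xfs_defer_ops_continue() now calls; the five inode variables are placeholders for a rename's participants, not code from this patch:

	struct xfs_inode	*ips[XFS_DEFER_OPS_NR_INODES] = {
		src_dp, target_dp, src_ip, target_ip, wip,	/* placeholders */
	};

	xfs_sort_inodes(ips, XFS_DEFER_OPS_NR_INODES);
	xfs_lock_inodes(ips, XFS_DEFER_OPS_NR_INODES, XFS_ILOCK_EXCL);
]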
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 4821519efad4..457f9a38f850 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -250,11 +250,68 @@ xfs_dir_init(
args->geo = dp->i_mount->m_dir_geo;
args->dp = dp;
args->trans = tp;
+ args->owner = dp->i_ino;
error = xfs_dir2_sf_create(args, pdp->i_ino);
kfree(args);
return error;
}
+enum xfs_dir2_fmt
+xfs_dir2_format(
+ struct xfs_da_args *args,
+ int *error)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_fileoff_t eof;
+
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ *error = 0;
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ return XFS_DIR2_FMT_SF;
+
+ *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK);
+ if (*error)
+ return XFS_DIR2_FMT_ERROR;
+
+ if (eof == XFS_B_TO_FSB(mp, geo->blksize)) {
+ if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) {
+ xfs_da_mark_sick(args);
+ *error = -EFSCORRUPTED;
+ return XFS_DIR2_FMT_ERROR;
+ }
+ return XFS_DIR2_FMT_BLOCK;
+ }
+ if (eof == geo->leafblk + geo->fsbcount)
+ return XFS_DIR2_FMT_LEAF;
+ return XFS_DIR2_FMT_NODE;
+}
+
+int
+xfs_dir_createname_args(
+ struct xfs_da_args *args)
+{
+ int error;
+
+ if (!args->inumber)
+ args->op_flags |= XFS_DA_OP_JUSTCHECK;
+
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ return xfs_dir2_sf_addname(args);
+ case XFS_DIR2_FMT_BLOCK:
+ return xfs_dir2_block_addname(args);
+ case XFS_DIR2_FMT_LEAF:
+ return xfs_dir2_leaf_addname(args);
+ case XFS_DIR2_FMT_NODE:
+ return xfs_dir2_node_addname(args);
+ default:
+ return error;
+ }
+}
+
/*
* Enter a name in a directory, or check for available space.
* If inum is 0, only the available space test is performed.
@@ -269,7 +326,6 @@ xfs_dir_createname(
{
struct xfs_da_args *args;
int rval;
- bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -295,31 +351,9 @@ xfs_dir_createname(
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
- if (!inum)
- args->op_flags |= XFS_DA_OP_JUSTCHECK;
-
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_addname(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_addname(args);
- goto out_free;
- }
+ args->owner = dp->i_ino;
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_addname(args);
- else
- rval = xfs_dir2_node_addname(args);
-
-out_free:
+ rval = xfs_dir_createname_args(args);
kfree(args);
return rval;
}
@@ -350,6 +384,34 @@ xfs_dir_cilookup_result(
return -EEXIST;
}
+int
+xfs_dir_lookup_args(
+ struct xfs_da_args *args)
+{
+ int error;
+
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ error = xfs_dir2_sf_lookup(args);
+ break;
+ case XFS_DIR2_FMT_BLOCK:
+ error = xfs_dir2_block_lookup(args);
+ break;
+ case XFS_DIR2_FMT_LEAF:
+ error = xfs_dir2_leaf_lookup(args);
+ break;
+ case XFS_DIR2_FMT_NODE:
+ error = xfs_dir2_node_lookup(args);
+ break;
+ default:
+ break;
+ }
+
+ if (error != -EEXIST)
+ return error;
+ return 0;
+}
+
/*
* Lookup a name in a directory, give back the inode number.
* If ci_name is not NULL, returns the actual name in ci_name if it differs
@@ -366,7 +428,6 @@ xfs_dir_lookup(
{
struct xfs_da_args *args;
int rval;
- bool v;
int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -383,34 +444,12 @@ xfs_dir_lookup(
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
args->op_flags = XFS_DA_OP_OKNOENT;
+ args->owner = dp->i_ino;
if (ci_name)
args->op_flags |= XFS_DA_OP_CILOOKUP;
lock_mode = xfs_ilock_data_map_shared(dp);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_lookup(args);
- goto out_check_rval;
- }
-
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_lookup(args);
- goto out_check_rval;
- }
-
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_lookup(args);
- else
- rval = xfs_dir2_node_lookup(args);
-
-out_check_rval:
- if (rval == -EEXIST)
- rval = 0;
+ rval = xfs_dir_lookup_args(args);
if (!rval) {
*inum = args->inumber;
if (ci_name) {
@@ -418,12 +457,31 @@ out_check_rval:
ci_name->len = args->valuelen;
}
}
-out_free:
xfs_iunlock(dp, lock_mode);
kfree(args);
return rval;
}
+int
+xfs_dir_removename_args(
+ struct xfs_da_args *args)
+{
+ int error;
+
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ return xfs_dir2_sf_removename(args);
+ case XFS_DIR2_FMT_BLOCK:
+ return xfs_dir2_block_removename(args);
+ case XFS_DIR2_FMT_LEAF:
+ return xfs_dir2_leaf_removename(args);
+ case XFS_DIR2_FMT_NODE:
+ return xfs_dir2_node_removename(args);
+ default:
+ return error;
+ }
+}
+
/*
* Remove an entry from a directory.
*/
@@ -431,13 +489,12 @@ int
xfs_dir_removename(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name,
+ const struct xfs_name *name,
xfs_ino_t ino,
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
int rval;
- bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
@@ -456,30 +513,30 @@ xfs_dir_removename(
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
+ args->owner = dp->i_ino;
+ rval = xfs_dir_removename_args(args);
+ kfree(args);
+ return rval;
+}
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_removename(args);
- goto out_free;
- }
+int
+xfs_dir_replace_args(
+ struct xfs_da_args *args)
+{
+ int error;
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_removename(args);
- goto out_free;
+ switch (xfs_dir2_format(args, &error)) {
+ case XFS_DIR2_FMT_SF:
+ return xfs_dir2_sf_replace(args);
+ case XFS_DIR2_FMT_BLOCK:
+ return xfs_dir2_block_replace(args);
+ case XFS_DIR2_FMT_LEAF:
+ return xfs_dir2_leaf_replace(args);
+ case XFS_DIR2_FMT_NODE:
+ return xfs_dir2_node_replace(args);
+ default:
+ return error;
}
-
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_removename(args);
- else
- rval = xfs_dir2_node_removename(args);
-out_free:
- kfree(args);
- return rval;
}
/*
@@ -495,7 +552,6 @@ xfs_dir_replace(
{
struct xfs_da_args *args;
int rval;
- bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -517,28 +573,8 @@ xfs_dir_replace(
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
-
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_replace(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_replace(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_replace(args);
- else
- rval = xfs_dir2_node_replace(args);
-out_free:
+ args->owner = dp->i_ino;
+ rval = xfs_dir_replace_args(args);
kfree(args);
return rval;
}
@@ -607,57 +643,6 @@ xfs_dir2_grow_inode(
}
/*
- * See if the directory is a single-block form directory.
- */
-int
-xfs_dir2_isblock(
- struct xfs_da_args *args,
- bool *isblock)
-{
- struct xfs_mount *mp = args->dp->i_mount;
- xfs_fileoff_t eof;
- int error;
-
- error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
- if (error)
- return error;
-
- *isblock = false;
- if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize)
- return 0;
-
- *isblock = true;
- if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) {
- xfs_da_mark_sick(args);
- return -EFSCORRUPTED;
- }
- return 0;
-}
-
-/*
- * See if the directory is a single-leaf form directory.
- */
-int
-xfs_dir2_isleaf(
- struct xfs_da_args *args,
- bool *isleaf)
-{
- xfs_fileoff_t eof;
- int error;
-
- error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
- if (error)
- return error;
-
- *isleaf = false;
- if (eof != args->geo->leafblk + args->geo->fsbcount)
- return 0;
-
- *isleaf = true;
- return 0;
-}
-
-/*
* Remove the given block from the directory.
* This routine is used for data and free blocks, leaf/node are done
* by xfs_da_shrink_inode.
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 8497d041f316..6dbe6e9ecb49 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -36,6 +36,16 @@ xfs_dir2_samename(
return !memcmp(n1->name, n2->name, n1->len);
}
+enum xfs_dir2_fmt {
+ XFS_DIR2_FMT_SF,
+ XFS_DIR2_FMT_BLOCK,
+ XFS_DIR2_FMT_LEAF,
+ XFS_DIR2_FMT_NODE,
+ XFS_DIR2_FMT_ERROR,
+};
+
+enum xfs_dir2_fmt xfs_dir2_format(struct xfs_da_args *args, int *error);
+
/*
* Convert inode mode to directory entry filetype
*/
@@ -58,7 +68,7 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t ino,
+ const struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t inum,
@@ -66,6 +76,11 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
+int xfs_dir_lookup_args(struct xfs_da_args *args);
+int xfs_dir_createname_args(struct xfs_da_args *args);
+int xfs_dir_removename_args(struct xfs_da_args *args);
+int xfs_dir_replace_args(struct xfs_da_args *args);
+
/*
* Direct call from the bmap code, bypassing the generic directory layer.
*/
@@ -74,8 +89,6 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
* Interface routines used by userspace utilities
*/
-extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock);
-extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
@@ -101,6 +114,10 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+xfs_failaddr_t xfs_dir3_leaf_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+xfs_failaddr_t xfs_dir3_data_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+xfs_failaddr_t xfs_dir3_block_header_check(struct xfs_buf *bp, xfs_ino_t owner);
+
extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a2da007adb46..0f93ed1a4a74 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -115,17 +115,20 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.verify_struct = xfs_dir3_block_verify,
};
-static xfs_failaddr_t
+xfs_failaddr_t
xfs_dir3_block_header_check(
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->owner) != owner)
return __this_address;
}
@@ -136,6 +139,7 @@ int
xfs_dir3_block_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
@@ -148,7 +152,7 @@ xfs_dir3_block_read(
return err;
/* Check things that we can't do in the verifier. */
- fa = xfs_dir3_block_header_check(dp, *bpp);
+ fa = xfs_dir3_block_header_check(*bpp, owner);
if (fa) {
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
@@ -163,12 +167,13 @@ xfs_dir3_block_read(
static void
xfs_dir3_block_init(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_buf *bp,
- struct xfs_inode *dp)
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
- struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
bp->b_ops = &xfs_dir3_block_buf_ops;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
@@ -177,7 +182,7 @@ xfs_dir3_block_init(
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->owner = cpu_to_be64(dp->i_ino);
+ hdr3->owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
return;
@@ -382,7 +387,7 @@ xfs_dir2_block_addname(
tp = args->trans;
/* Read the (one and only) directory block into bp. */
- error = xfs_dir3_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, args->owner, &bp);
if (error)
return error;
@@ -697,7 +702,7 @@ xfs_dir2_block_lookup_int(
dp = args->dp;
tp = args->trans;
- error = xfs_dir3_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, args->owner, &bp);
if (error)
return error;
@@ -981,7 +986,8 @@ xfs_dir2_leaf_to_block(
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ args->geo->datablk, 0, &dbp);
if (error)
return error;
}
@@ -1009,7 +1015,7 @@ xfs_dir2_leaf_to_block(
/*
* Start converting it to block form.
*/
- xfs_dir3_block_init(mp, tp, dbp, dp);
+ xfs_dir3_block_init(args, dbp);
needlog = 1;
needscan = 0;
@@ -1129,7 +1135,7 @@ xfs_dir2_sf_to_block(
error = xfs_dir3_data_init(args, blkno, &bp);
if (error)
goto out_free;
- xfs_dir3_block_init(mp, tp, bp, dp);
+ xfs_dir3_block_init(args, bp);
hdr = bp->b_addr;
/*
@@ -1169,7 +1175,7 @@ xfs_dir2_sf_to_block(
* Create entry for .
*/
dep = bp->b_addr + offset;
- dep->inumber = cpu_to_be64(dp->i_ino);
+ dep->inumber = cpu_to_be64(args->owner);
dep->namelen = 1;
dep->name[0] = '.';
xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 7a6d965bea71..ea0b9628df18 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -395,17 +395,20 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.verify_write = xfs_dir3_data_write_verify,
};
-static xfs_failaddr_t
+xfs_failaddr_t
xfs_dir3_data_header_check(
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_has_crc(mp)) {
struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
- if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ if (hdr3->hdr.magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != owner)
return __this_address;
}
@@ -416,6 +419,7 @@ int
xfs_dir3_data_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t bno,
unsigned int flags,
struct xfs_buf **bpp)
@@ -429,7 +433,7 @@ xfs_dir3_data_read(
return err;
/* Check things that we can't do in the verifier. */
- fa = xfs_dir3_data_header_check(dp, *bpp);
+ fa = xfs_dir3_data_header_check(*bpp, owner);
if (fa) {
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
@@ -725,7 +729,7 @@ xfs_dir3_data_init(
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->owner = cpu_to_be64(dp->i_ino);
+ hdr3->owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
} else
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 08dda5ce9d91..71c2f22a3f6e 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -208,6 +208,29 @@ xfs_dir3_leaf_verify(
return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true);
}
+xfs_failaddr_t
+xfs_dir3_leaf_header_check(
+ struct xfs_buf *bp,
+ xfs_ino_t owner)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_leaf *hdr3 = bp->b_addr;
+
+ if (hdr3->hdr.info.hdr.magic !=
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) &&
+ hdr3->hdr.info.hdr.magic !=
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
+ return __this_address;
+
+ if (be64_to_cpu(hdr3->hdr.info.owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static void
xfs_dir3_leaf_read_verify(
struct xfs_buf *bp)
@@ -271,32 +294,60 @@ int
xfs_dir3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
&xfs_dir3_leaf1_buf_ops);
- if (!err && tp && *bpp)
+ if (err || !(*bpp))
+ return err;
+
+ fa = xfs_dir3_leaf_header_check(*bpp, owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
+ return -EFSCORRUPTED;
+ }
+
+ if (tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
- return err;
+ return 0;
}
int
xfs_dir3_leafn_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
&xfs_dir3_leafn_buf_ops);
- if (!err && tp && *bpp)
+ if (err || !(*bpp))
+ return err;
+
+ fa = xfs_dir3_leaf_header_check(*bpp, owner);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
+ return -EFSCORRUPTED;
+ }
+
+ if (tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
- return err;
+ return 0;
}
/*
@@ -304,12 +355,12 @@ xfs_dir3_leafn_read(
*/
static void
xfs_dir3_leaf_init(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
- xfs_ino_t owner,
uint16_t type)
{
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_trans *tp = args->trans;
struct xfs_dir2_leaf *leaf = bp->b_addr;
ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
@@ -323,7 +374,7 @@ xfs_dir3_leaf_init(
? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
: cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- leaf3->info.owner = cpu_to_be64(owner);
+ leaf3->info.owner = cpu_to_be64(args->owner);
uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
memset(leaf, 0, sizeof(*leaf));
@@ -356,7 +407,6 @@ xfs_dir3_leaf_get_buf(
{
struct xfs_inode *dp = args->dp;
struct xfs_trans *tp = args->trans;
- struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
int error;
@@ -369,7 +419,7 @@ xfs_dir3_leaf_get_buf(
if (error)
return error;
- xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+ xfs_dir3_leaf_init(args, bp, magic);
xfs_dir3_leaf_log_header(args, bp);
if (magic == XFS_DIR2_LEAF1_MAGIC)
xfs_dir3_leaf_log_tail(args, bp);
@@ -647,7 +697,8 @@ xfs_dir2_leaf_addname(
trace_xfs_dir2_leaf_addname(args);
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk,
+ &lbp);
if (error)
return error;
@@ -834,9 +885,9 @@ xfs_dir2_leaf_addname(
* Already had space in some data block.
* Just read that one in.
*/
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, use_block),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, use_block), 0,
+ &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1238,7 +1289,8 @@ xfs_dir2_leaf_lookup_int(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk,
+ &lbp);
if (error)
return error;
@@ -1276,9 +1328,9 @@ xfs_dir2_leaf_lookup_int(
if (newdb != curdb) {
if (dbp)
xfs_trans_brelse(tp, dbp);
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, newdb),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, newdb), 0,
+ &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1318,9 +1370,9 @@ xfs_dir2_leaf_lookup_int(
ASSERT(cidb != -1);
if (cidb != curdb) {
xfs_trans_brelse(tp, dbp);
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, cidb),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, cidb), 0,
+ &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1614,7 +1666,8 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(geo, db), 0, &dbp);
if (error)
return error;
@@ -1753,7 +1806,8 @@ xfs_dir2_node_to_leaf(
/*
* Read the freespace block.
*/
- error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
+ error = xfs_dir2_free_read(tp, dp, args->owner, args->geo->freeblk,
+ &fbp);
if (error)
return error;
xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index be0b8834028c..fe8d4fa13128 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -175,11 +175,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
/* Everything ok in the free block header? */
static xfs_failaddr_t
xfs_dir3_free_header_check(
- struct xfs_inode *dp,
- xfs_dablk_t fbno,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_ino_t owner,
+ xfs_dablk_t fbno)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_mount *mp = bp->b_mount;
int maxbests = mp->m_dir_geo->free_max_bests;
unsigned int firstdb;
@@ -195,7 +195,7 @@ xfs_dir3_free_header_check(
return __this_address;
if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
return __this_address;
- if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ if (be64_to_cpu(hdr3->hdr.owner) != owner)
return __this_address;
} else {
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
@@ -214,6 +214,7 @@ static int
__xfs_dir3_free_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
unsigned int flags,
struct xfs_buf **bpp)
@@ -227,7 +228,7 @@ __xfs_dir3_free_read(
return err;
/* Check things that we can't do in the verifier. */
- fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
+ fa = xfs_dir3_free_header_check(*bpp, owner, fbno);
if (fa) {
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
@@ -299,20 +300,23 @@ int
xfs_dir2_free_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp);
+ return __xfs_dir3_free_read(tp, dp, owner, fbno, 0, bpp);
}
static int
xfs_dir2_free_try_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
+ xfs_ino_t owner,
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp);
+ return __xfs_dir3_free_read(tp, dp, owner, fbno, XFS_DABUF_MAP_HOLE_OK,
+ bpp);
}
static int
@@ -349,7 +353,7 @@ xfs_dir3_free_get_buf(
hdr.magic = XFS_DIR3_FREE_MAGIC;
hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+ hdr3->hdr.owner = cpu_to_be64(args->owner);
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
@@ -717,7 +721,7 @@ xfs_dir2_leafn_lookup_for_addname(
if (curbp)
xfs_trans_brelse(tp, curbp);
- error = xfs_dir2_free_read(tp, dp,
+ error = xfs_dir2_free_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo,
newfdb),
&curbp);
@@ -863,7 +867,7 @@ xfs_dir2_leafn_lookup_for_entry(
ASSERT(state->extravalid);
curbp = state->extrablk.bp;
} else {
- error = xfs_dir3_data_read(tp, dp,
+ error = xfs_dir3_data_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo,
newdb),
0, &curbp);
@@ -1356,8 +1360,8 @@ xfs_dir2_leafn_remove(
* read in the free block.
*/
fdb = xfs_dir2_db_to_fdb(geo, db);
- error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb),
- &fbp);
+ error = xfs_dir2_free_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(geo, fdb), &fbp);
if (error)
return error;
free = fbp->b_addr;
@@ -1562,7 +1566,8 @@ xfs_dir2_leafn_toosmall(
/*
* Read the sibling leaf block.
*/
- error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp);
+ error = xfs_dir3_leafn_read(state->args->trans, dp,
+ state->args->owner, blkno, &bp);
if (error)
return error;
@@ -1715,7 +1720,7 @@ xfs_dir2_node_add_datablk(
* that was just allocated.
*/
fbno = xfs_dir2_db_to_fdb(args->geo, *dbno);
- error = xfs_dir2_free_try_read(tp, dp,
+ error = xfs_dir2_free_try_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo, fbno), &fbp);
if (error)
return error;
@@ -1862,7 +1867,7 @@ xfs_dir2_node_find_freeblk(
* so this might not succeed. This should be really rare, so
* there's no reason to avoid it.
*/
- error = xfs_dir2_free_try_read(tp, dp,
+ error = xfs_dir2_free_try_read(tp, dp, args->owner,
xfs_dir2_db_to_da(args->geo, fbno),
&fbp);
if (error)
@@ -1948,9 +1953,8 @@ xfs_dir2_node_addname_int(
&freehdr, &findex);
} else {
/* Read the data block in. */
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, dbno),
- 0, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->owner,
+ xfs_dir2_db_to_da(args->geo, dbno), 0, &dbp);
}
if (error)
return error;
@@ -2302,7 +2306,7 @@ xfs_dir2_node_trim_free(
/*
* Read the freespace block.
*/
- error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+ error = xfs_dir2_free_try_read(tp, dp, args->owner, fo, &bp);
if (error)
return error;
/*
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 1db2e60ba827..3befb32509fa 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -50,8 +50,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
/* xfs_dir2_block.c */
-extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_buf **bpp);
+int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_ino_t owner, struct xfs_buf **bpp);
extern int xfs_dir2_block_addname(struct xfs_da_args *args);
extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
extern int xfs_dir2_block_removename(struct xfs_da_args *args);
@@ -78,7 +78,8 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp,
struct xfs_buf *bp);
int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags,
+ struct xfs_buf **bpp);
int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
unsigned int flags);
@@ -95,9 +96,9 @@ void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp,
void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to,
struct xfs_dir3_icleaf_hdr *from);
int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp);
int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, struct xfs_buf **bpp);
+ xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -154,8 +155,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
extern int xfs_dir2_node_replace(struct xfs_da_args *args);
extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
int *rvalp);
-extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, struct xfs_buf **bpp);
+int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp);
/* xfs_dir2_sf.c */
xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 01a9e86b3037..7002d7676a78 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -63,7 +63,8 @@
#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
#define XFS_ERRTAG_WB_DELAY_MS 42
#define XFS_ERRTAG_WRITE_DELAY_MS 43
-#define XFS_ERRTAG_MAX 44
+#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
+#define XFS_ERRTAG_MAX 45
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -111,5 +112,6 @@
#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#define XFS_RANDOM_WB_DELAY_MS 3000
#define XFS_RANDOM_WRITE_DELAY_MS 3000
+#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c
new file mode 100644
index 000000000000..2021396651de
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_exchmaps.c
@@ -0,0 +1,1235 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_quota.h"
+#include "xfs_exchmaps.h"
+#include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps_item.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
+#include "xfs_symlink_remote.h"
+
+struct kmem_cache *xfs_exchmaps_intent_cache;
+
+/* bmbt mappings adjacent to a pair of records. */
+struct xfs_exchmaps_adjacent {
+ struct xfs_bmbt_irec left1;
+ struct xfs_bmbt_irec right1;
+ struct xfs_bmbt_irec left2;
+ struct xfs_bmbt_irec right2;
+};
+
+#define ADJACENT_INIT { \
+ .left1 = { .br_startblock = HOLESTARTBLOCK }, \
+ .right1 = { .br_startblock = HOLESTARTBLOCK }, \
+ .left2 = { .br_startblock = HOLESTARTBLOCK }, \
+ .right2 = { .br_startblock = HOLESTARTBLOCK }, \
+}
+
+/*
+ * If the reflink flag is set on either inode, make sure it has an incore CoW
+ * fork, since all reflink inodes must have them. If there's a CoW fork and it
+ * has mappings in it, make sure the inodes are tagged appropriately so that
+ * speculative preallocations can be GC'd if we run low on space.
+ */
+static inline void
+xfs_exchmaps_ensure_cowfork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *cfork;
+
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+
+ cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
+ if (!cfork)
+ return;
+ if (cfork->if_bytes > 0)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+}
+
+/*
+ * Adjust the on-disk inode size upwards if needed so that we never add
+ * mappings into the file past EOF. This is crucial so that log recovery won't
+ * get confused by the sudden appearance of post-eof mappings.
+ */
+STATIC void
+xfs_exchmaps_update_size(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ xfs_fsize_t new_isize)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsize_t len;
+
+ if (new_isize < 0)
+ return;
+
+ len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
+ new_isize);
+
+ if (len <= ip->i_disk_size)
+ return;
+
+ trace_xfs_exchmaps_update_inode_size(ip, len);
+
+ ip->i_disk_size = len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Advance the incore state tracking after exchanging a mapping. */
+static inline void
+xmi_advance(
+ struct xfs_exchmaps_intent *xmi,
+ const struct xfs_bmbt_irec *irec)
+{
+ xmi->xmi_startoff1 += irec->br_blockcount;
+ xmi->xmi_startoff2 += irec->br_blockcount;
+ xmi->xmi_blockcount -= irec->br_blockcount;
+}
+
+/* Do we still have more mappings to exchange? */
+static inline bool
+xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
+{
+ return xmi->xmi_blockcount > 0;
+}
+
+/* Do we have post-operation cleanups to perform? */
+static inline bool
+xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
+{
+ return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
+ XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
+ __XFS_EXCHMAPS_INO2_SHORTFORM);
+}
+
+/* Check all mappings to make sure we can actually exchange them. */
+int
+xfs_exchmaps_check_forks(
+ struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req)
+{
+ struct xfs_ifork *ifp1, *ifp2;
+ int whichfork = xfs_exchmaps_reqfork(req);
+
+ /* No fork? */
+ ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
+ ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
+ if (!ifp1 || !ifp2)
+ return -EINVAL;
+
+ /* We don't know how to exchange local format forks. */
+ if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
+ ifp2->if_format == XFS_DINODE_FMT_LOCAL)
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_XFS_QUOTA
+/* Log the actual updates to the quota accounting. */
+static inline void
+xfs_exchmaps_update_quota(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2)
+{
+ int64_t ip1_delta = 0, ip2_delta = 0;
+ unsigned int qflag;
+
+ qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
+ XFS_TRANS_DQ_BCOUNT;
+
+ if (xfs_bmap_is_real_extent(irec1)) {
+ ip1_delta -= irec1->br_blockcount;
+ ip2_delta += irec1->br_blockcount;
+ }
+
+ if (xfs_bmap_is_real_extent(irec2)) {
+ ip1_delta += irec2->br_blockcount;
+ ip2_delta -= irec2->br_blockcount;
+ }
+
+ xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
+ xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
+}
+#else
+# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2) ((void)0)
+#endif
+
+/* Decide if we want to skip this mapping from file1. */
+static inline bool
+xfs_exchmaps_can_skip_mapping(
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = xmi->xmi_ip1->i_mount;
+
+ /* Do not skip this mapping if the caller did not tell us to. */
+ if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
+ return false;
+
+ /* Do not skip mapped, written mappings. */
+ if (xfs_bmap_is_written_extent(irec))
+ return false;
+
+ /*
+ * The mapping is unwritten or a hole. It cannot be a delalloc
+ * reservation because we already excluded those. It cannot be an
+ * unwritten extent with dirty page cache because we flushed the page
+ * cache. For files where the allocation unit is 1FSB (files on the
+ * data dev, rt files if the extent size is 1FSB), we can safely
+ * skip this mapping.
+ */
+ if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
+ return true;
+
+ /*
+ * For a realtime file with a multi-fsb allocation unit, the decision
+ * is trickier because we can only swap full allocation units.
+ * Unwritten mappings can appear in the middle of an rtx if the rtx is
+ * partially written, but they can also appear for preallocations.
+ *
+ * If the mapping is a hole, skip it entirely. Holes should align with
+ * rtx boundaries.
+ */
+ if (!xfs_bmap_is_real_extent(irec))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten.
+ *
+ * - If the beginning is not aligned to an rtx, trim the end of the
+ * mapping so that it does not cross an rtx boundary, and swap it.
+ *
+ * - If both ends are aligned to an rtx, skip the entire mapping.
+ */
+ if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
+ xfs_fileoff_t new_end;
+
+ new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
+ irec->br_blockcount = min(irec->br_blockcount,
+ new_end - irec->br_startoff);
+ return false;
+ }
+ if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten, start on an rtx
+ * boundary, and do not end on an rtx boundary.
+ *
+ * - If the mapping is longer than one rtx, trim the end of the mapping
+ * down to an rtx boundary and skip it.
+ *
+ * - The mapping is shorter than one rtx. Swap it.
+ */
+ if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
+ xfs_fileoff_t new_end;
+
+ new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
+ mp->m_sb.sb_rextsize);
+ irec->br_blockcount = new_end - irec->br_startoff;
+ return true;
+ }
+
+ return false;
+}
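[Note: a worked example of the trimming above, with illustrative numbers not taken from the patch: if sb_rextsize is 8 and an unwritten mapping has br_startoff = 5 and br_blockcount = 10, the start is unaligned, so new_end = roundup_64(5, 8) = 8 and br_blockcount becomes min(10, 8 - 5) = 3; that trimmed piece is exchanged. If instead the mapping started at offset 8 with length 10, it would be trimmed to rounddown_64(18, 8) - 8 = 8 blocks and skipped as a whole rtx.]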
+
+/*
+ * Walk forward through the file ranges in @xmi until we find two different
+ * mappings to exchange. If there is work to do, return the mappings;
+ * otherwise we've reached the end of the range and xmi_blockcount will be
+ * zero.
+ *
+ * If the walk skips over a pair of mappings to the same storage, save them as
+ * the left records in @adj (if provided) so that the simulation phase can
+ * avoid an extra lookup.
+ */
+static int
+xfs_exchmaps_find_mappings(
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2,
+ struct xfs_exchmaps_adjacent *adj)
+{
+ int nimaps;
+ int bmap_flags;
+ int error;
+
+ bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));
+
+ for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
+ /* Read mapping from the first file */
+ nimaps = 1;
+ error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
+ xmi->xmi_blockcount, irec1, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec1->br_startblock == DELAYSTARTBLOCK ||
+ irec1->br_startoff != xmi->xmi_startoff1) {
+ /*
+ * We should never get no mapping or a delalloc mapping
+ * or something that doesn't match what we asked for,
+ * since the caller flushed both inodes and we hold the
+ * ILOCKs for both inodes.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
+ trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
+ continue;
+ }
+
+ /* Read mapping from the second file */
+ nimaps = 1;
+ error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
+ irec1->br_blockcount, irec2, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec2->br_startblock == DELAYSTARTBLOCK ||
+ irec2->br_startoff != xmi->xmi_startoff2) {
+ /*
+ * We should never get no mapping or a delalloc mapping
+ * or something that doesn't match what we asked for,
+ * since the caller flushed both inodes and we hold the
+ * ILOCKs for both inodes.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /*
+ * We can only exchange as many blocks as the smaller of the
+ * two mappings covers.
+ */
+ irec1->br_blockcount = min(irec1->br_blockcount,
+ irec2->br_blockcount);
+
+ trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
+ trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);
+
+ /* We found something to exchange, so return it. */
+ if (irec1->br_startblock != irec2->br_startblock)
+ return 0;
+
+ /*
+ * Two mappings pointing to the same physical block must not
+ * have different states; that's filesystem corruption. Move
+ * on to the next mapping if they're both holes or both point
+ * to the same physical space extent.
+ */
+ if (irec1->br_state != irec2->br_state) {
+ xfs_bmap_mark_sick(xmi->xmi_ip1,
+ xfs_exchmaps_whichfork(xmi));
+ xfs_bmap_mark_sick(xmi->xmi_ip2,
+ xfs_exchmaps_whichfork(xmi));
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Save the mappings if we're estimating work and skipping
+ * these identical mappings.
+ */
+ if (adj) {
+ memcpy(&adj->left1, irec1, sizeof(*irec1));
+ memcpy(&adj->left2, irec2, sizeof(*irec2));
+ }
+ }
+
+ return 0;
+}
+
+/* Exchange these two mappings. */
+static void
+xfs_exchmaps_one_step(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2)
+{
+ int whichfork = xfs_exchmaps_whichfork(xmi);
+
+ xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);
+
+ /* Remove both mappings. */
+ xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
+ xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);
+
+ /*
+ * Re-add both mappings. We exchange the file offsets between the two
+ * maps and add the opposite map, which has the effect of filling the
+ * logical offsets we just unmapped, but with the physical mapping
+ * information exchanged.
+ */
+ swap(irec1->br_startoff, irec2->br_startoff);
+ xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
+ xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);
+
+ /* Make sure we're not adding mappings past EOF. */
+ if (whichfork == XFS_DATA_FORK) {
+ xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
+ xmi->xmi_isize1);
+ xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
+ xmi->xmi_isize2);
+ }
+
+ /*
+ * Advance our cursor and exit. The caller (either defer ops or log
+ * recovery) will log the XMD item, and if xmi_blockcount is nonzero, it
+ * will log a new XMI item for the remainder and call us back.
+ */
+ xmi_advance(xmi, irec1);
+}
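[Note: an illustrative trace of one step, with made-up numbers: if file1 maps offset 100 to block 500 for 8 blocks and file2 maps offset 300 to block 900 for 8 blocks, the unmap/swap/map sequence above leaves offset 100 of file1 pointing at block 900 and offset 300 of file2 pointing at block 500, after which xmi_advance() moves both start offsets forward and shrinks xmi_blockcount by 8.]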
+
+/* Convert inode2's leaf attr fork back to shortform, if possible. */
+STATIC int
+xfs_exchmaps_attr_to_sf(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_da_args args = {
+ .dp = xmi->xmi_ip2,
+ .geo = tp->t_mountp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .trans = tp,
+ .owner = xmi->xmi_ip2->i_ino,
+ };
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
+
+ if (!xfs_attr_is_leaf(xmi->xmi_ip2))
+ return 0;
+
+ error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
+ &bp);
+ if (error)
+ return error;
+
+ forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
+ if (forkoff == 0)
+ return 0;
+
+ return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
+}
+
+/* Convert inode2's block dir fork back to shortform, if possible. */
+STATIC int
+xfs_exchmaps_dir_to_sf(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_da_args args = {
+ .dp = xmi->xmi_ip2,
+ .geo = tp->t_mountp->m_dir_geo,
+ .whichfork = XFS_DATA_FORK,
+ .trans = tp,
+ .owner = xmi->xmi_ip2->i_ino,
+ };
+ struct xfs_dir2_sf_hdr sfh;
+ struct xfs_buf *bp;
+ int size;
+ int error = 0;
+
+ if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
+ return error;
+
+ error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp);
+ if (error)
+ return error;
+
+ size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
+ if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
+ return 0;
+
+ return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
+}
+
+/* Convert inode2's remote symlink target back to shortform, if possible. */
+STATIC int
+xfs_exchmaps_link_to_sf(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_inode *ip = xmi->xmi_ip2;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ char *buf;
+ int error;
+
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
+ ip->i_disk_size > xfs_inode_data_fork_size(ip))
+ return 0;
+
+ /* Read the current symlink target into a buffer. */
+ buf = kmalloc(ip->i_disk_size + 1,
+ GFP_KERNEL | __GFP_NOLOCKDEP);
+ if (!buf) {
+ ASSERT(0);
+ return -ENOMEM;
+ }
+
+ error = xfs_symlink_remote_read(ip, buf);
+ if (error)
+ goto free;
+
+ /* Remove the blocks. */
+ error = xfs_symlink_remote_truncate(tp, ip);
+ if (error)
+ goto free;
+
+ /* Convert fork to local format and log our changes. */
+ xfs_idestroy_fork(ifp);
+ ifp->if_bytes = 0;
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+free:
+ kfree(buf);
+ return error;
+}
+
+/* Clear the reflink flag after an exchange. */
+static inline void
+xfs_exchmaps_clear_reflink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ trace_xfs_reflink_unset_inode_flag(ip);
+
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Finish whatever work might come after an exchange operation. */
+static int
+xfs_exchmaps_do_postop_work(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
+ int error = 0;
+
+ if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
+ error = xfs_exchmaps_attr_to_sf(tp, xmi);
+ else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
+ error = xfs_exchmaps_dir_to_sf(tp, xmi);
+ else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
+ error = xfs_exchmaps_link_to_sf(tp, xmi);
+ xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
+ if (error)
+ return error;
+ }
+
+ if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
+ xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
+ xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
+ }
+
+ if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
+ xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
+ xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
+ }
+
+ return 0;
+}
+
+/* Finish one step in a mapping exchange operation, possibly relogging. */
+int
+xfs_exchmaps_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ struct xfs_bmbt_irec irec1, irec2;
+ int error;
+
+ if (xmi_has_more_exchange_work(xmi)) {
+ /*
+ * If the operation state says that some range of the files
+ * have not yet been exchanged, look for mappings in that range
+ * to exchange. If we find some mappings, exchange them.
+ */
+ error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
+ if (error)
+ return error;
+
+ if (xmi_has_more_exchange_work(xmi))
+ xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);
+
+ /*
+ * If the caller asked us to exchange the file sizes after the
+ * exchange and either we just exchanged the last mappings in
+ * the range or we didn't find anything to exchange, update the
+ * ondisk file sizes.
+ */
+ if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
+ !xmi_has_more_exchange_work(xmi)) {
+ xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
+ xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;
+
+ xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
+ }
+ } else if (xmi_has_postop_work(xmi)) {
+ /*
+ * Now that we're finished with the exchange operation,
+ * complete the post-op cleanup work.
+ */
+ error = xfs_exchmaps_do_postop_work(tp, xmi);
+ if (error)
+ return error;
+ }
+
+ if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
+ return -EIO;
+
+ /* If we still have work to do, ask for a new transaction. */
+ if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
+ trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
+ return -EAGAIN;
+ }
+
+ /*
+ * If we reach here, we've finished all the exchange work and the post
+ * operation work. The last thing we need to do before returning to
+ * the caller is to make sure that COW forks are set up correctly.
+ */
+ if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
+ xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
+ xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
+ }
+
+ return 0;
+}
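[Note: a hedged sketch of how a caller might drive the relogging protocol described above; roll_and_relog() is a hypothetical stand-in for the defer-ops machinery that logs the XMD item and re-creates the XMI item:

	for (;;) {
		error = xfs_exchmaps_finish_one(tp, xmi);
		if (error != -EAGAIN)
			break;				/* done, or a real error */
		error = roll_and_relog(&tp, xmi);	/* hypothetical helper */
		if (error)
			break;
	}
]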
+
+/*
+ * Compute the amount of bmbt blocks we should reserve for each file. In the
+ * worst case, each exchange will fill a hole with a new mapping, which could
+ * result in a btree split every time we add a new leaf block.
+ */
+static inline uint64_t
+xfs_exchmaps_bmbt_blocks(
+ struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req)
+{
+ return howmany_64(req->nr_exchanges,
+ XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
+ XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
+}
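[Note: worked example with made-up geometry: if XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) were 125 and the request estimated 1000 exchanges, this helper would reserve howmany_64(1000, 125) = 8 multiples of XFS_EXTENTADD_SPACE_RES() for the fork being exchanged.]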
+
+/* Compute the space we should reserve for the rmap btree expansions. */
+static inline uint64_t
+xfs_exchmaps_rmapbt_blocks(
+ struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req)
+{
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+ if (XFS_IS_REALTIME_INODE(req->ip1))
+ return 0;
+
+ return howmany_64(req->nr_exchanges,
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
+ XFS_RMAPADD_SPACE_RES(mp);
+}
+
+/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */
+int
+xfs_exchmaps_estimate_overhead(
+ struct xfs_exchmaps_req *req)
+{
+ struct xfs_mount *mp = req->ip1->i_mount;
+ xfs_filblks_t bmbt_blocks;
+ xfs_filblks_t rmapbt_blocks;
+ xfs_filblks_t resblks = req->resblks;
+
+ /*
+ * Compute the number of bmbt and rmapbt blocks we might need to handle
+ * the estimated number of exchanges.
+ */
+ bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
+ rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);
+
+ trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);
+
+ /* Make sure the change in file block count doesn't overflow. */
+ if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
+ return -EFBIG;
+ if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
+ return -EFBIG;
+
+ /*
+ * Add together the number of blocks we need to handle btree growth,
+ * then add it to the number of blocks we need to reserve to this
+ * transaction.
+ */
+ if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+ return -ENOSPC;
+ if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+ return -ENOSPC;
+ if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+ return -ENOSPC;
+ if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+ return -ENOSPC;
+
+ /* Can't actually reserve more than UINT_MAX blocks. */
+	if (resblks > UINT_MAX)
+ return -ENOSPC;
+
+ req->resblks = resblks;
+ trace_xfs_exchmaps_final_estimate(req);
+ return 0;
+}
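
check_add_overflow() is the kernel's wrapper around the compiler overflow builtins; a minimal userspace sketch of the accumulate-or-bail pattern used above:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's check_add_overflow(). */
#define check_add_overflow(a, b, d)	__builtin_add_overflow(a, b, d)

int main(void)
{
	uint64_t resblks = UINT64_MAX - 5;	/* deliberately near the limit */

	/* Adding 10 would wrap around, so the helper reports overflow. */
	if (check_add_overflow(resblks, (uint64_t)10, &resblks)) {
		fprintf(stderr, "reservation overflows; would return -ENOSPC\n");
		return 1;
	}
	printf("resblks = %llu\n", (unsigned long long)resblks);
	return 0;
}
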
+
+/* Decide if we can merge two real mappings. */
+static inline bool
+xmi_can_merge(
+ const struct xfs_bmbt_irec *b1,
+ const struct xfs_bmbt_irec *b2)
+{
+ /* Don't merge holes. */
+ if (b1->br_startblock == HOLESTARTBLOCK ||
+ b2->br_startblock == HOLESTARTBLOCK)
+ return false;
+
+	/* Don't merge delalloc reservations or other unreal mappings. */
+ if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
+ return false;
+
+ if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
+ b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
+ b1->br_state == b2->br_state &&
+ b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ return true;
+
+ return false;
+}
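
As a toy illustration of the contiguity test (a standalone sketch with a simplified mapping struct and hypothetical values, ignoring the hole, state, and length-cap checks above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct xfs_bmbt_irec. */
struct irec {
	uint64_t startoff;	/* file offset, in blocks */
	uint64_t startblock;	/* disk block */
	uint64_t blockcount;
};

/* Mergeable iff b2 continues b1 in both file space and disk space. */
static bool can_merge(const struct irec *b1, const struct irec *b2)
{
	return b1->startoff + b1->blockcount == b2->startoff &&
	       b1->startblock + b1->blockcount == b2->startblock;
}

int main(void)
{
	struct irec a = { .startoff = 0, .startblock = 100, .blockcount = 8 };
	struct irec b = { .startoff = 8, .startblock = 108, .blockcount = 4 };

	printf("mergeable: %d\n", can_merge(&a, &b));	/* prints 1 */
	return 0;
}
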
+
+/*
+ * Decide if we can merge three mappings. The caller must ensure that none of
+ * the three mappings are holes or delalloc reservations.
+ */
+static inline bool
+xmi_can_merge_all(
+ const struct xfs_bmbt_irec *l,
+ const struct xfs_bmbt_irec *m,
+ const struct xfs_bmbt_irec *r)
+{
+ xfs_filblks_t new_len;
+
+ new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
+ return new_len <= XFS_MAX_BMBT_EXTLEN;
+}
+
+#define CLEFT_CONTIG 0x01
+#define CRIGHT_CONTIG 0x02
+#define CHOLE 0x04
+#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG)
+
+#define NLEFT_CONTIG 0x10
+#define NRIGHT_CONTIG 0x20
+#define NHOLE 0x40
+#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG)
+
+/* Estimate the effect of a single exchange on mapping count. */
+static inline int
+xmi_delta_nextents_step(
+ struct xfs_mount *mp,
+ const struct xfs_bmbt_irec *left,
+ const struct xfs_bmbt_irec *curr,
+ const struct xfs_bmbt_irec *new,
+ const struct xfs_bmbt_irec *right)
+{
+ bool lhole, rhole, chole, nhole;
+ unsigned int state = 0;
+ int ret = 0;
+
+ lhole = left->br_startblock == HOLESTARTBLOCK;
+ rhole = right->br_startblock == HOLESTARTBLOCK;
+ chole = curr->br_startblock == HOLESTARTBLOCK;
+ nhole = new->br_startblock == HOLESTARTBLOCK;
+
+ if (chole)
+ state |= CHOLE;
+ if (!lhole && !chole && xmi_can_merge(left, curr))
+ state |= CLEFT_CONTIG;
+ if (!rhole && !chole && xmi_can_merge(curr, right))
+ state |= CRIGHT_CONTIG;
+ if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
+ !xmi_can_merge_all(left, curr, right))
+ state &= ~CRIGHT_CONTIG;
+
+ if (nhole)
+ state |= NHOLE;
+ if (!lhole && !nhole && xmi_can_merge(left, new))
+ state |= NLEFT_CONTIG;
+ if (!rhole && !nhole && xmi_can_merge(new, right))
+ state |= NRIGHT_CONTIG;
+ if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
+ !xmi_can_merge_all(left, new, right))
+ state &= ~NRIGHT_CONTIG;
+
+ switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
+ case CLEFT_CONTIG | CRIGHT_CONTIG:
+ /*
+ * left/curr/right are the same mapping, so deleting curr
+ * causes 2 new mappings to be created.
+ */
+ ret += 2;
+ break;
+ case 0:
+ /*
+ * curr is not contiguous with any mapping, so we remove curr
+		 * completely.
+ */
+ ret--;
+ break;
+ case CHOLE:
+ /* hole, do nothing */
+ break;
+ case CLEFT_CONTIG:
+ case CRIGHT_CONTIG:
+ /* trim either left or right, no change */
+ break;
+ }
+
+ switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
+ case NLEFT_CONTIG | NRIGHT_CONTIG:
+ /*
+		 * left/new/right will become the same mapping, so adding
+		 * new causes the deletion of right.
+ */
+ ret--;
+ break;
+ case 0:
+ /* new is not contiguous with any mapping */
+ ret++;
+ break;
+ case NHOLE:
+ /* hole, do nothing. */
+ break;
+ case NLEFT_CONTIG:
+ case NRIGHT_CONTIG:
+ /* new is absorbed into left or right, no change */
+ break;
+ }
+
+ trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret,
+ state);
+ return ret;
+}
+
+/* Make sure we don't overflow the extent (mapping) counters. */
+static inline int
+xmi_ensure_delta_nextents(
+ struct xfs_exchmaps_req *req,
+ struct xfs_inode *ip,
+ int64_t delta)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int whichfork = xfs_exchmaps_reqfork(req);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ uint64_t new_nextents;
+ xfs_extnum_t max_nextents;
+
+ if (delta < 0)
+ return 0;
+
+ /*
+ * It's always an error if the delta causes integer overflow. delta
+ * needs an explicit cast here to avoid warnings about implicit casts
+ * coded into the overflow check.
+ */
+ if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
+ &new_nextents))
+ return -EFBIG;
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+ new_nextents > 10)
+ return -EFBIG;
+
+ /*
+ * We always promote both inodes to have large extent counts if the
+ * superblock feature is enabled, so we only need to check against the
+ * theoretical maximum.
+ */
+ max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (new_nextents > max_nextents)
+ return -EFBIG;
+
+ return 0;
+}
+
+/* Find the next mapping after irec. */
+static inline int
+xmi_next(
+ struct xfs_inode *ip,
+ int bmap_flags,
+ const struct xfs_bmbt_irec *irec,
+ struct xfs_bmbt_irec *nrec)
+{
+ xfs_fileoff_t off;
+ xfs_filblks_t blockcount;
+ int nimaps = 1;
+ int error;
+
+ off = irec->br_startoff + irec->br_blockcount;
+ blockcount = XFS_MAX_FILEOFF - off;
+ error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
+ if (error)
+ return error;
+ if (nrec->br_startblock == DELAYSTARTBLOCK ||
+ nrec->br_startoff != off) {
+ /*
+ * If we don't get the mapping we want, return a zero-length
+ * mapping, which our estimator function will pretend is a hole.
+ * We shouldn't get delalloc reservations.
+ */
+ nrec->br_startblock = HOLESTARTBLOCK;
+ }
+
+ return 0;
+}
+
+int __init
+xfs_exchmaps_intent_init_cache(void)
+{
+ xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
+ sizeof(struct xfs_exchmaps_intent),
+ 0, 0, NULL);
+
+ return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_exchmaps_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_exchmaps_intent_cache);
+ xfs_exchmaps_intent_cache = NULL;
+}
+
+/*
+ * Decide if we will exchange the reflink flags between the two files after the
+ * exchange. The only time we want to do this is if we're exchanging all
+ * mappings under EOF and the inode reflink flags have different states.
+ */
+static inline bool
+xmi_can_exchange_reflink_flags(
+ const struct xfs_exchmaps_req *req,
+ unsigned int reflink_state)
+{
+ struct xfs_mount *mp = req->ip1->i_mount;
+
+ if (hweight32(reflink_state) != 1)
+ return false;
+ if (req->startoff1 != 0 || req->startoff2 != 0)
+ return false;
+ if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
+ return false;
+ if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
+ return false;
+ return true;
+}
+
+/* Allocate and initialize a new incore intent item from a request. */
+struct xfs_exchmaps_intent *
+xfs_exchmaps_init_intent(
+ const struct xfs_exchmaps_req *req)
+{
+ struct xfs_exchmaps_intent *xmi;
+ unsigned int rs = 0;
+
+ xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&xmi->xmi_list);
+ xmi->xmi_ip1 = req->ip1;
+ xmi->xmi_ip2 = req->ip2;
+ xmi->xmi_startoff1 = req->startoff1;
+ xmi->xmi_startoff2 = req->startoff2;
+ xmi->xmi_blockcount = req->blockcount;
+ xmi->xmi_isize1 = xmi->xmi_isize2 = -1;
+ xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;
+
+ if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
+ xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
+ return xmi;
+ }
+
+ if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
+ xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
+ xmi->xmi_isize1 = req->ip2->i_disk_size;
+ xmi->xmi_isize2 = req->ip1->i_disk_size;
+ }
+
+ /* Record the state of each inode's reflink flag before the op. */
+ if (xfs_is_reflink_inode(req->ip1))
+ rs |= 1;
+ if (xfs_is_reflink_inode(req->ip2))
+ rs |= 2;
+
+ /*
+ * Figure out if we're clearing the reflink flags (which effectively
+ * exchanges them) after the operation.
+ */
+ if (xmi_can_exchange_reflink_flags(req, rs)) {
+ if (rs & 1)
+ xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
+ if (rs & 2)
+ xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
+ }
+
+ if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
+ S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
+ xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
+
+ return xmi;
+}
+
+/*
+ * Estimate the number of exchange operations and the number of file blocks
+ * in each file that will be affected by the exchange operation.
+ */
+int
+xfs_exchmaps_estimate(
+ struct xfs_exchmaps_req *req)
+{
+ struct xfs_exchmaps_intent *xmi;
+ struct xfs_bmbt_irec irec1, irec2;
+ struct xfs_exchmaps_adjacent adj = ADJACENT_INIT;
+ xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0;
+ int64_t d_nexts1, d_nexts2;
+ int bmap_flags;
+ int error;
+
+ ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));
+
+ bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
+ xmi = xfs_exchmaps_init_intent(req);
+
+ /*
+ * To guard against the possibility of overflowing the extent counters,
+ * we have to estimate an upper bound on the potential increase in that
+ * counter. We can split the mapping at each end of the range, and for
+ * each step of the exchange we can split the mapping that we're
+ * working on if the mappings do not align.
+ */
+ d_nexts1 = d_nexts2 = 3;
+
+ while (xmi_has_more_exchange_work(xmi)) {
+ /*
+ * Walk through the file ranges until we find something to
+ * exchange. Because we're simulating the exchange, pass in
+ * adj to capture skipped mappings for correct estimation of
+ * bmbt record merges.
+ */
+ error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
+ if (error)
+ goto out_free;
+ if (!xmi_has_more_exchange_work(xmi))
+ break;
+
+ /* Update accounting. */
+ if (xfs_bmap_is_real_extent(&irec1))
+ ip1_blocks += irec1.br_blockcount;
+ if (xfs_bmap_is_real_extent(&irec2))
+ ip2_blocks += irec2.br_blockcount;
+ req->nr_exchanges++;
+
+ /* Read the next mappings from both files. */
+ error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
+ if (error)
+ goto out_free;
+
+ error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
+ if (error)
+ goto out_free;
+
+ /* Update extent count deltas. */
+ d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
+ &adj.left1, &irec1, &irec2, &adj.right1);
+
+ d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
+ &adj.left2, &irec2, &irec1, &adj.right2);
+
+ /* Now pretend we exchanged the mappings. */
+ if (xmi_can_merge(&adj.left2, &irec1))
+ adj.left2.br_blockcount += irec1.br_blockcount;
+ else
+ memcpy(&adj.left2, &irec1, sizeof(irec1));
+
+ if (xmi_can_merge(&adj.left1, &irec2))
+ adj.left1.br_blockcount += irec2.br_blockcount;
+ else
+ memcpy(&adj.left1, &irec2, sizeof(irec2));
+
+ xmi_advance(xmi, &irec1);
+ }
+
+ /* Account for the blocks that are being exchanged. */
+ if (XFS_IS_REALTIME_INODE(req->ip1) &&
+ xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
+ req->ip1_rtbcount = ip1_blocks;
+ req->ip2_rtbcount = ip2_blocks;
+ } else {
+ req->ip1_bcount = ip1_blocks;
+ req->ip2_bcount = ip2_blocks;
+ }
+
+ /*
+ * Make sure that both forks have enough slack left in their extent
+ * counters that the exchange operation will not overflow.
+ */
+ trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
+ if (req->ip1 == req->ip2) {
+ error = xmi_ensure_delta_nextents(req, req->ip1,
+ d_nexts1 + d_nexts2);
+ } else {
+ error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
+ if (error)
+ goto out_free;
+ error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
+ }
+ if (error)
+ goto out_free;
+
+ trace_xfs_exchmaps_initial_estimate(req);
+ error = xfs_exchmaps_estimate_overhead(req);
+out_free:
+ kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
+ return error;
+}
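
A heavily abridged sketch of the calling sequence this estimate supports (kernel context, not compilable standalone; locking, quota handling, and error unwinding are elided, and the tr_write reservation type here is a placeholder):

	struct xfs_exchmaps_req	req = {
		.ip1		= ip1,
		.ip2		= ip2,
		.blockcount	= XFS_B_TO_FSB(mp, length),
	};
	struct xfs_trans	*tp;
	int			error;

	/* Requires both inodes to be ILOCKed; fills out req.resblks. */
	error = xfs_exchmaps_estimate(&req);
	if (error)
		return error;

	/* req.resblks was capped at UINT_MAX, so the narrowing cast is safe. */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks,
			0, 0, &tp);
	if (error)
		return error;

	/* ...join and ILOCK both inodes to the transaction... */

	xfs_exchange_mappings(tp, &req);
	error = xfs_trans_commit(tp);
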
+
+/* Set the reflink flag before an operation. */
+static inline void
+xfs_exchmaps_set_reflink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ trace_xfs_reflink_set_inode_flag(ip);
+
+ ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * If either file has shared blocks and we're exchanging data forks, we must
+ * flag the other file as having shared blocks so that we get the shared-block
+ * rmap functions if we need to fix up the rmaps.
+ */
+void
+xfs_exchmaps_ensure_reflink(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi)
+{
+ unsigned int rs = 0;
+
+ if (xfs_is_reflink_inode(xmi->xmi_ip1))
+ rs |= 1;
+ if (xfs_is_reflink_inode(xmi->xmi_ip2))
+ rs |= 2;
+
+ if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
+ xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);
+
+ if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
+ xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
+}
+
+/* Set the large extent count flag before an operation if needed. */
+static inline void
+xfs_exchmaps_ensure_large_extent_counts(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ if (xfs_inode_has_large_extent_counts(ip))
+ return;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Widen the extent counter fields of both inodes if necessary. */
+void
+xfs_exchmaps_upgrade_extent_counts(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi)
+{
+ if (!xfs_has_large_extent_counts(tp->t_mountp))
+ return;
+
+ xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
+ xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
+}
+
+/*
+ * Schedule the exchange of a range of mappings from one inode to another.
+ *
+ * The use of file mapping exchange log intent items ensures the operation can
+ * be resumed even if the system goes down. The caller must commit the
+ * transaction to start the work.
+ *
+ * The caller must ensure that the inodes are joined to the transaction and
+ * ILOCKed; they will still be joined to the transaction at exit.
+ */
+void
+xfs_exchange_mappings(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_req *req)
+{
+ struct xfs_exchmaps_intent *xmi;
+
+ BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);
+
+ xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
+ ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
+ if (req->flags & XFS_EXCHMAPS_SET_SIZES)
+ ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
+ ASSERT(xfs_has_exchange_range(tp->t_mountp));
+
+ if (req->blockcount == 0)
+ return;
+
+ xmi = xfs_exchmaps_init_intent(req);
+ xfs_exchmaps_defer_add(tp, xmi);
+ xfs_exchmaps_ensure_reflink(tp, xmi);
+ xfs_exchmaps_upgrade_extent_counts(tp, xmi);
+}
diff --git a/fs/xfs/libxfs/xfs_exchmaps.h b/fs/xfs/libxfs/xfs_exchmaps.h
new file mode 100644
index 000000000000..fa822dff202a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_exchmaps.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHMAPS_H__
+#define __XFS_EXCHMAPS_H__
+
+/* In-core deferred operation info about a file mapping exchange request. */
+struct xfs_exchmaps_intent {
+ /* List of other incore deferred work. */
+ struct list_head xmi_list;
+
+ /* Inodes participating in the operation. */
+ struct xfs_inode *xmi_ip1;
+ struct xfs_inode *xmi_ip2;
+
+ /* File offset range information. */
+ xfs_fileoff_t xmi_startoff1;
+ xfs_fileoff_t xmi_startoff2;
+ xfs_filblks_t xmi_blockcount;
+
+ /* Set these file sizes after the operation, unless negative. */
+ xfs_fsize_t xmi_isize1;
+ xfs_fsize_t xmi_isize2;
+
+ uint64_t xmi_flags; /* XFS_EXCHMAPS_* flags */
+};
+
+/* Try to convert inode2 from block to short format at the end, if possible. */
+#define __XFS_EXCHMAPS_INO2_SHORTFORM (1ULL << 63)
+
+#define XFS_EXCHMAPS_INTERNAL_FLAGS (__XFS_EXCHMAPS_INO2_SHORTFORM)
+
+/* flags that can be passed to xfs_exchmaps_{estimate,mappings} */
+#define XFS_EXCHMAPS_PARAMS (XFS_EXCHMAPS_ATTR_FORK | \
+ XFS_EXCHMAPS_SET_SIZES | \
+ XFS_EXCHMAPS_INO1_WRITTEN)
+
+static inline int
+xfs_exchmaps_whichfork(const struct xfs_exchmaps_intent *xmi)
+{
+ if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
+}
+
+/* Parameters for a mapping exchange request. */
+struct xfs_exchmaps_req {
+ /* Inodes participating in the operation. */
+ struct xfs_inode *ip1;
+ struct xfs_inode *ip2;
+
+ /* File offset range information. */
+ xfs_fileoff_t startoff1;
+ xfs_fileoff_t startoff2;
+ xfs_filblks_t blockcount;
+
+ /* XFS_EXCHMAPS_* operation flags */
+ uint64_t flags;
+
+ /*
+ * Fields below this line are filled out by xfs_exchmaps_estimate;
+ * callers should initialize this part of the struct to zero.
+ */
+
+ /*
+ * Data device blocks to be moved out of ip1, and free space needed to
+ * handle the bmbt changes.
+ */
+ xfs_filblks_t ip1_bcount;
+
+ /*
+ * Data device blocks to be moved out of ip2, and free space needed to
+ * handle the bmbt changes.
+ */
+ xfs_filblks_t ip2_bcount;
+
+ /* rt blocks to be moved out of ip1. */
+ xfs_filblks_t ip1_rtbcount;
+
+ /* rt blocks to be moved out of ip2. */
+ xfs_filblks_t ip2_rtbcount;
+
+ /* Free space needed to handle the bmbt changes */
+ unsigned long long resblks;
+
+ /* Number of exchanges needed to complete the operation */
+ unsigned long long nr_exchanges;
+};
+
+static inline int
+xfs_exchmaps_reqfork(const struct xfs_exchmaps_req *req)
+{
+ if (req->flags & XFS_EXCHMAPS_ATTR_FORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
+}
+
+int xfs_exchmaps_estimate_overhead(struct xfs_exchmaps_req *req);
+int xfs_exchmaps_estimate(struct xfs_exchmaps_req *req);
+
+extern struct kmem_cache *xfs_exchmaps_intent_cache;
+
+int __init xfs_exchmaps_intent_init_cache(void);
+void xfs_exchmaps_intent_destroy_cache(void);
+
+struct xfs_exchmaps_intent *xfs_exchmaps_init_intent(
+ const struct xfs_exchmaps_req *req);
+void xfs_exchmaps_ensure_reflink(struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi);
+void xfs_exchmaps_upgrade_extent_counts(struct xfs_trans *tp,
+ const struct xfs_exchmaps_intent *xmi);
+
+int xfs_exchmaps_finish_one(struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi);
+
+int xfs_exchmaps_check_forks(struct xfs_mount *mp,
+ const struct xfs_exchmaps_req *req);
+
+void xfs_exchange_mappings(struct xfs_trans *tp,
+ const struct xfs_exchmaps_req *req);
+
+#endif /* __XFS_EXCHMAPS_H__ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 2b2f9050fbfb..61f51becff4f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -367,19 +367,23 @@ xfs_sb_has_ro_compat_feature(
return (sbp->sb_features_ro_compat & feature) != 0;
}
-#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
-#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
-#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
-#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
-#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
-#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
+#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
+#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
+#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
+#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_ALL \
- (XFS_SB_FEAT_INCOMPAT_FTYPE| \
- XFS_SB_FEAT_INCOMPAT_SPINODES| \
- XFS_SB_FEAT_INCOMPAT_META_UUID| \
- XFS_SB_FEAT_INCOMPAT_BIGTIME| \
- XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
- XFS_SB_FEAT_INCOMPAT_NREXT64)
+ (XFS_SB_FEAT_INCOMPAT_FTYPE | \
+ XFS_SB_FEAT_INCOMPAT_SPINODES | \
+ XFS_SB_FEAT_INCOMPAT_META_UUID | \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME | \
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \
+ XFS_SB_FEAT_INCOMPAT_NREXT64 | \
+ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
+ XFS_SB_FEAT_INCOMPAT_PARENT)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -898,6 +902,12 @@ static inline uint xfs_dinode_size(int version)
#define XFS_MAXLINK ((1U << 31) - 1U)
/*
+ * Any file that hits the maximum ondisk link count should be pinned to avoid
+ * a use-after-free situation.
+ */
+#define XFS_NLINK_PINNED (~0U)
+
+/*
* Values for di_format
*
* This enum is used in string mapping in xfs_trace.h; please keep the
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index ca1b17d01437..97996cb79aaa 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,8 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
+#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
+#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
/*
* Minimum and maximum sizes need for growth checks.
@@ -409,6 +411,7 @@ struct xfs_bulkstat {
#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */
#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */
#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */
+#define XFS_BS_SICK_DIRTREE (1 << 8) /* directory tree structure */
/*
* Project quota id helpers (previously projid was 16bit only
@@ -632,7 +635,9 @@ typedef struct xfs_fsop_attrmulti_handlereq {
/*
* per machine unique filesystem identifier types.
*/
-typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
+typedef struct xfs_fsid {
+ __u32 val[2]; /* file system id type */
+} xfs_fsid_t;
typedef struct xfs_fid {
__u16 fid_len; /* length of remainder */
@@ -715,9 +720,19 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */
#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */
#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */
+#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 28
+#define XFS_SCRUB_TYPE_NR 29
+
+/*
+ * This special type code only applies to the vectored scrub implementation.
+ *
+ * If any of the previous scrub vectors recorded runtime errors or have
+ * sv_flags bits set that match the OFLAG bits in the barrier vector's
+ * sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace.
+ */
+#define XFS_SCRUB_TYPE_BARRIER (0xFFFFFFFF)
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
@@ -763,6 +778,29 @@ struct xfs_scrub_metadata {
XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
+/* Vectored scrub calls to reduce the number of kernel transitions. */
+
+struct xfs_scrub_vec {
+ __u32 sv_type; /* XFS_SCRUB_TYPE_* */
+ __u32 sv_flags; /* XFS_SCRUB_FLAGS_* */
+ __s32 sv_ret; /* 0 or a negative error code */
+ __u32 sv_reserved; /* must be zero */
+};
+
+/* Vectored metadata scrub control structure. */
+struct xfs_scrub_vec_head {
+ __u64 svh_ino; /* inode number. */
+ __u32 svh_gen; /* inode generation. */
+ __u32 svh_agno; /* ag number. */
+ __u32 svh_flags; /* XFS_SCRUB_VEC_FLAGS_* */
+	__u16 svh_rest_us;			/* microseconds to wait between vector items */
+ __u16 svh_nr; /* number of svh_vectors */
+ __u64 svh_reserved; /* must be zero */
+ __u64 svh_vectors; /* pointer to buffer of xfs_scrub_vec */
+};
+
+#define XFS_SCRUB_VEC_FLAGS_ALL (0)
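
A hedged userspace fragment showing how these structures are meant to be filled in for a vectored scrub call (XFS_IOC_SCRUBV_METADATA appears later in this header; the BMBTD/BMBTA type codes and the OFLAG_CORRUPT barrier mask are assumptions based on the existing scrub interface):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>	/* assumed to export the definitions above */

/* Scrub an inode's two bmap forks, stopping at the barrier on corruption. */
static int scrubv_inode(int fd, __u64 ino, __u32 gen)
{
	struct xfs_scrub_vec vecs[3];
	struct xfs_scrub_vec_head head;

	memset(vecs, 0, sizeof(vecs));
	vecs[0].sv_type = XFS_SCRUB_TYPE_BMBTD;
	vecs[1].sv_type = XFS_SCRUB_TYPE_BARRIER;
	vecs[1].sv_flags = XFS_SCRUB_OFLAG_CORRUPT;	/* stop if vec 0 was corrupt */
	vecs[2].sv_type = XFS_SCRUB_TYPE_BMBTA;

	memset(&head, 0, sizeof(head));
	head.svh_ino = ino;
	head.svh_gen = gen;
	head.svh_nr = 3;
	head.svh_vectors = (uintptr_t)vecs;

	return ioctl(fd, XFS_IOC_SCRUBV_METADATA, &head);
}
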
+
/*
* ioctl limits
*/
@@ -772,6 +810,118 @@ struct xfs_scrub_metadata {
# define XFS_XATTR_LIST_MAX 65536
#endif
+/*
+ * Exchange part of file1 with part of the file that this ioctl is being
+ * called against (which we'll call file2). Filesystems must be able to
+ * restart and complete the operation even after the system goes down.
+ */
+struct xfs_exchange_range {
+ __s32 file1_fd;
+ __u32 pad; /* must be zeroes */
+ __u64 file1_offset; /* file1 offset, bytes */
+ __u64 file2_offset; /* file2 offset, bytes */
+ __u64 length; /* bytes to exchange */
+
+ __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */
+};
+
+/*
+ * Exchange file data all the way to the ends of both files, and then exchange
+ * the file sizes. This flag can be used to replace a file's contents with a
+ * different amount of data. The length field is ignored.
+ */
+#define XFS_EXCHANGE_RANGE_TO_EOF (1ULL << 0)
+
+/* Flush all changes in file data and file metadata to disk before returning. */
+#define XFS_EXCHANGE_RANGE_DSYNC (1ULL << 1)
+
+/* Dry run; do all the parameter verification but do not change anything. */
+#define XFS_EXCHANGE_RANGE_DRY_RUN (1ULL << 2)
+
+/*
+ * Exchange only the parts of the two files where the file allocation units
+ * mapped to file1's range have been written to. This can accelerate
+ * scatter-gather atomic writes with a temp file if all writes are aligned to
+ * the file allocation unit.
+ */
+#define XFS_EXCHANGE_RANGE_FILE1_WRITTEN (1ULL << 3)
+
+#define XFS_EXCHANGE_RANGE_ALL_FLAGS (XFS_EXCHANGE_RANGE_TO_EOF | \
+ XFS_EXCHANGE_RANGE_DSYNC | \
+ XFS_EXCHANGE_RANGE_DRY_RUN | \
+ XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
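
A minimal userspace sketch of driving the new ioctl (assumes a uapi/libxfs header exporting the definitions above plus XFS_IOC_EXCHANGE_RANGE, which appears later in this header; error handling abbreviated):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>	/* assumed to export the definitions above */

int main(int argc, char **argv)
{
	struct xfs_exchange_range xr;
	int fd1, fd2;

	if (argc != 3)
		return 1;
	fd1 = open(argv[1], O_RDWR);
	fd2 = open(argv[2], O_RDWR);
	if (fd1 < 0 || fd2 < 0)
		return 1;

	memset(&xr, 0, sizeof(xr));
	xr.file1_fd = fd1;
	/* Exchange both files' contents out to EOF, then swap the sizes. */
	xr.flags = XFS_EXCHANGE_RANGE_TO_EOF;

	/* The ioctl is issued against file2. */
	if (ioctl(fd2, XFS_IOC_EXCHANGE_RANGE, &xr) < 0) {
		perror("XFS_IOC_EXCHANGE_RANGE");
		return 1;
	}
	return 0;
}
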
+
+/* Iterating parent pointers of files. */
+
+/* target was the root directory */
+#define XFS_GETPARENTS_OFLAG_ROOT (1U << 0)
+
+/* Cursor is done iterating pptrs */
+#define XFS_GETPARENTS_OFLAG_DONE (1U << 1)
+
+#define XFS_GETPARENTS_OFLAGS_ALL (XFS_GETPARENTS_OFLAG_ROOT | \
+ XFS_GETPARENTS_OFLAG_DONE)
+
+#define XFS_GETPARENTS_IFLAGS_ALL (0)
+
+struct xfs_getparents_rec {
+ struct xfs_handle gpr_parent; /* Handle to parent */
+ __u32 gpr_reclen; /* Length of entire record */
+ __u32 gpr_reserved; /* zero */
+ char gpr_name[]; /* Null-terminated filename */
+};
+
+/* Iterate through this file's directory parent pointers */
+struct xfs_getparents {
+ /*
+ * Structure to track progress in iterating the parent pointers.
+ * Must be initialized to zeroes before the first ioctl call, and
+ * not touched by callers after that.
+ */
+ struct xfs_attrlist_cursor gp_cursor;
+
+ /* Input flags: XFS_GETPARENTS_IFLAG* */
+ __u16 gp_iflags;
+
+ /* Output flags: XFS_GETPARENTS_OFLAG* */
+ __u16 gp_oflags;
+
+ /* Size of the gp_buffer in bytes */
+ __u32 gp_bufsize;
+
+ /* Must be set to zero */
+ __u64 gp_reserved;
+
+ /* Pointer to a buffer in which to place xfs_getparents_rec */
+ __u64 gp_buffer;
+};
+
+static inline struct xfs_getparents_rec *
+xfs_getparents_first_rec(struct xfs_getparents *gp)
+{
+ return (struct xfs_getparents_rec *)(uintptr_t)gp->gp_buffer;
+}
+
+static inline struct xfs_getparents_rec *
+xfs_getparents_next_rec(struct xfs_getparents *gp,
+ struct xfs_getparents_rec *gpr)
+{
+ void *next = ((void *)gpr + gpr->gpr_reclen);
+ void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize);
+
+ if (next >= end)
+ return NULL;
+
+ return next;
+}
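
The two helpers above define the intended walk over the record buffer; a hedged userspace sketch of the iteration (the zero-reclen end-of-records check and the DONE-flag loop condition are assumptions about the ioctl's conventions):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>	/* assumed to export the definitions above */

static void dump_parents(int fd)
{
	struct xfs_getparents gp;
	struct xfs_getparents_rec *gpr;
	static uint64_t buf[8192];	/* 64k, 8-byte aligned */

	/* The cursor embedded in gp must start out zeroed. */
	memset(&gp, 0, sizeof(gp));
	gp.gp_buffer = (uintptr_t)buf;
	gp.gp_bufsize = sizeof(buf);

	do {
		if (ioctl(fd, XFS_IOC_GETPARENTS, &gp) < 0) {
			perror("XFS_IOC_GETPARENTS");
			return;
		}
		for (gpr = xfs_getparents_first_rec(&gp);
		     gpr != NULL;
		     gpr = xfs_getparents_next_rec(&gp, gpr)) {
			if (gpr->gpr_reclen == 0)
				break;	/* assumed end-of-records marker */
			printf("parent name '%s'\n", gpr->gpr_name);
		}
	} while (!(gp.gp_oflags & XFS_GETPARENTS_OFLAG_DONE));
}

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	dump_parents(fd);
	return 0;
}
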
+
+/* Iterate through this file handle's directory parent pointers. */
+struct xfs_getparents_by_handle {
+ /* Handle to file whose parents we want. */
+ struct xfs_handle gph_handle;
+
+ struct xfs_getparents gph_request;
+};
/*
* ioctl commands that are used by Linux filesystems
@@ -808,6 +958,9 @@ struct xfs_scrub_metadata {
/* XFS_IOC_GETFSMAP ------ hoisted 59 */
#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry)
+#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents)
+#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
+#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head)
/*
* ioctl commands that replace IRIX syssgi()'s
@@ -843,6 +996,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
+#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 3c64b5f9bd68..b0edb4288e59 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -95,6 +95,7 @@ struct xfs_da_args;
/* Don't propagate sick status to ag health summary during inactivation */
#define XFS_SICK_INO_FORGET (1 << 12)
+#define XFS_SICK_INO_DIRTREE (1 << 13) /* directory tree structure */
/* Primary evidence of health problems in a given group. */
#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
@@ -125,7 +126,8 @@ struct xfs_da_args;
XFS_SICK_INO_DIR | \
XFS_SICK_INO_XATTR | \
XFS_SICK_INO_SYMLINK | \
- XFS_SICK_INO_PARENT)
+ XFS_SICK_INO_PARENT | \
+ XFS_SICK_INO_DIRTREE)
#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \
XFS_SICK_INO_BMBTA_ZAPPED | \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index e5ac3e5430c4..14c81f227c5b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1058,6 +1058,33 @@ xfs_inobt_first_free_inode(
}
/*
+ * If this AG has corrupt inodes, check if allocating this inode would fail
+ * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
+ * somewhere else.
+ */
+static int
+xfs_dialloc_check_ino(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_ino_t ino)
+{
+ struct xfs_imap imap;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_imap(pag, tp, ino, &imap, 0);
+ if (error)
+ return -EAGAIN;
+
+ error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
+ if (error)
+ return -EAGAIN;
+
+ xfs_trans_brelse(tp, bp);
+ return 0;
+}
+
+/*
* Allocate an inode using the inobt-only algorithm.
*/
STATIC int
@@ -1309,6 +1336,13 @@ alloc_inode:
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+
+ if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
+ error = xfs_dialloc_check_ino(pag, tp, ino);
+ if (error)
+ goto error0;
+ }
+
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
error = xfs_inobt_update(cur, &rec);
@@ -1584,6 +1618,12 @@ xfs_dialloc_ag(
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
+ error = xfs_dialloc_check_ino(pag, tp, ino);
+ if (error)
+ goto error_cur;
+ }
+
/*
* Modify or remove the finobt record.
*/
@@ -1699,7 +1739,7 @@ xfs_dialloc_good_ag(
return false;
if (!xfs_perag_initialised_agi(pag)) {
- error = xfs_ialloc_read_agi(pag, tp, NULL);
+ error = xfs_ialloc_read_agi(pag, tp, 0, NULL);
if (error)
return false;
}
@@ -1768,7 +1808,7 @@ xfs_dialloc_try_ag(
* Then read in the AGI buffer and recheck with the AGI buffer
* lock held.
*/
- error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
+ error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp);
if (error)
return error;
@@ -2286,7 +2326,7 @@ xfs_difree(
/*
* Get the allocation group header.
*/
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error) {
xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
__func__, error);
@@ -2332,7 +2372,7 @@ xfs_imap_lookup(
int error;
int i;
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
@@ -2675,6 +2715,7 @@ int
xfs_read_agi(
struct xfs_perag *pag,
struct xfs_trans *tp,
+ xfs_buf_flags_t flags,
struct xfs_buf **agibpp)
{
struct xfs_mount *mp = pag->pag_mount;
@@ -2684,7 +2725,7 @@ xfs_read_agi(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+ XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
if (error)
@@ -2704,6 +2745,7 @@ int
xfs_ialloc_read_agi(
struct xfs_perag *pag,
struct xfs_trans *tp,
+ int flags,
struct xfs_buf **agibpp)
{
struct xfs_buf *agibp;
@@ -2712,7 +2754,9 @@ xfs_ialloc_read_agi(
trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp,
+ (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+ &agibp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index f1412183bb44..b549627e3a61 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -63,10 +63,11 @@ xfs_ialloc_log_agi(
struct xfs_buf *bp, /* allocation group header buffer */
uint32_t fields); /* bitmask of fields to log */
-int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, xfs_buf_flags_t flags,
struct xfs_buf **agibpp);
int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
- struct xfs_buf **agibpp);
+ int flags, struct xfs_buf **agibpp);
+#define XFS_IALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */
/*
* Lookup a record by ino in the btree given by cur.
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index cc661fca6ff5..42e9fd47f6c7 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -745,7 +745,7 @@ xfs_finobt_count_blocks(
struct xfs_btree_cur *cur;
int error;
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error)
return error;
@@ -768,7 +768,7 @@ xfs_finobt_read_blocks(
struct xfs_agi *agi;
int error;
- error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d0dcce462bf4..d79002343d0b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -491,6 +491,14 @@ xfs_dinode_verify(
return __this_address;
}
+ if (dip->di_version > 1) {
+ if (dip->di_onlink)
+ return __this_address;
+ } else {
+ if (dip->di_nlink)
+ return __this_address;
+ }
+
/* don't allow invalid i_size */
di_size = be64_to_cpu(dip->di_size);
if (di_size & (1ULL << 63))
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 7d660a973909..9d11ae015909 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -765,53 +765,46 @@ xfs_ifork_verify_local_attr(
return 0;
}
+/*
+ * Check if the inode fork supports adding nr_to_add more extents.
+ *
+ * If it doesn't but we can upgrade it to large extent counters, do the upgrade.
+ * If we can't upgrade or are already using big counters but still can't fit the
+ * additional extents, return -EFBIG.
+ */
int
-xfs_iext_count_may_overflow(
+xfs_iext_count_extend(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork,
- int nr_to_add)
+ uint nr_to_add)
{
+ struct xfs_mount *mp = ip->i_mount;
+ bool has_large =
+ xfs_inode_has_large_extent_counts(ip);
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
- uint64_t max_exts;
uint64_t nr_exts;
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
if (whichfork == XFS_COW_FORK)
return 0;
- max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
- whichfork);
-
- if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
- max_exts = 10;
-
+ /* no point in upgrading if if_nextents overflows */
nr_exts = ifp->if_nextents + nr_to_add;
- if (nr_exts < ifp->if_nextents || nr_exts > max_exts)
+ if (nr_exts < ifp->if_nextents)
return -EFBIG;
- return 0;
-}
-
-/*
- * Upgrade this inode's extent counter fields to be able to handle a potential
- * increase in the extent count by nr_to_add. Normally this is the same
- * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
- */
-int
-xfs_iext_count_upgrade(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- uint nr_to_add)
-{
- ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
-
- if (!xfs_has_large_extent_counts(ip->i_mount) ||
- xfs_inode_has_large_extent_counts(ip) ||
- XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+ nr_exts > 10)
return -EFBIG;
- ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
+ if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) {
+ if (has_large || !xfs_has_large_extent_counts(mp))
+ return -EFBIG;
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
return 0;
}
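
Caller-side, the conversion this enables looks like the following fragment (names as in this patch; not a standalone program):

	/* Before: separate overflow check and counter upgrade. */
	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr);
	if (error == -EFBIG)
		error = xfs_iext_count_upgrade(tp, ip, nr);
	if (error)
		goto out;

	/* After: one call that checks and upgrades as needed. */
	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, nr);
	if (error)
		goto out;
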
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index bd53eb951b65..2373d12fd474 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -256,10 +256,8 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip);
int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
-int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
- int nr_to_add);
-int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
- uint nr_to_add);
+int xfs_iext_count_extend(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, uint nr_to_add);
bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork);
/* returns true if the fork has extents but they are not read in yet. */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 16872972e1e9..3e6682ed656b 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -115,10 +115,13 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_BUD_FORMAT 26
#define XLOG_REG_TYPE_ATTRI_FORMAT 27
#define XLOG_REG_TYPE_ATTRD_FORMAT 28
-#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_NAME 29
#define XLOG_REG_TYPE_ATTR_VALUE 30
-#define XLOG_REG_TYPE_MAX 30
-
+#define XLOG_REG_TYPE_XMI_FORMAT 31
+#define XLOG_REG_TYPE_XMD_FORMAT 32
+#define XLOG_REG_TYPE_ATTR_NEWNAME 33
+#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
+#define XLOG_REG_TYPE_MAX 34
/*
* Flags to log operation header
@@ -243,6 +246,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_BUD 0x1245
#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
+#define XFS_LI_XMI 0x1248 /* mapping exchange intent */
+#define XFS_LI_XMD 0x1249 /* mapping exchange done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -260,7 +265,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_BUI, "XFS_LI_BUI" }, \
{ XFS_LI_BUD, "XFS_LI_BUD" }, \
{ XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
- { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \
+ { XFS_LI_XMI, "XFS_LI_XMI" }, \
+ { XFS_LI_XMD, "XFS_LI_XMD" }
/*
* Inode Log Item Format definitions.
@@ -879,6 +886,61 @@ struct xfs_bud_log_format {
};
/*
+ * XMI/XMD (file mapping exchange) log format definitions
+ */
+
+/* This is the structure used to lay out a file mapping exchange log item. */
+struct xfs_xmi_log_format {
+ uint16_t xmi_type; /* xmi log item type */
+ uint16_t xmi_size; /* size of this item */
+ uint32_t __pad; /* must be zero */
+ uint64_t xmi_id; /* xmi identifier */
+
+ uint64_t xmi_inode1; /* inumber of first file */
+ uint64_t xmi_inode2; /* inumber of second file */
+ uint32_t xmi_igen1; /* generation of first file */
+ uint32_t xmi_igen2; /* generation of second file */
+ uint64_t xmi_startoff1; /* block offset into file1 */
+ uint64_t xmi_startoff2; /* block offset into file2 */
+ uint64_t xmi_blockcount; /* number of blocks */
+ uint64_t xmi_flags; /* XFS_EXCHMAPS_* */
+ uint64_t xmi_isize1; /* intended file1 size */
+ uint64_t xmi_isize2; /* intended file2 size */
+};
+
+/* Exchange mappings between extended attribute forks instead of data forks. */
+#define XFS_EXCHMAPS_ATTR_FORK (1ULL << 0)
+
+/* Set the file sizes when finished. */
+#define XFS_EXCHMAPS_SET_SIZES (1ULL << 1)
+
+/*
+ * Exchange the mappings of the two files only if the file allocation units
+ * mapped to file1's range have been written.
+ */
+#define XFS_EXCHMAPS_INO1_WRITTEN (1ULL << 2)
+
+/* Clear the reflink flag from inode1 after the operation. */
+#define XFS_EXCHMAPS_CLEAR_INO1_REFLINK (1ULL << 3)
+
+/* Clear the reflink flag from inode2 after the operation. */
+#define XFS_EXCHMAPS_CLEAR_INO2_REFLINK (1ULL << 4)
+
+#define XFS_EXCHMAPS_LOGGED_FLAGS (XFS_EXCHMAPS_ATTR_FORK | \
+ XFS_EXCHMAPS_SET_SIZES | \
+ XFS_EXCHMAPS_INO1_WRITTEN | \
+ XFS_EXCHMAPS_CLEAR_INO1_REFLINK | \
+ XFS_EXCHMAPS_CLEAR_INO2_REFLINK)
+
+/* This is the structure used to lay out a file mapping exchange done log item. */
+struct xfs_xmd_log_format {
+ uint16_t xmd_type; /* xmd log item type */
+ uint16_t xmd_size; /* size of this item */
+ uint32_t __pad;
+ uint64_t xmd_xmi_id; /* id of corresponding xmi */
+};
+
+/*
* Dquot Log format definitions.
*
* The first two fields must be the type and size fitting into
@@ -966,6 +1028,9 @@ struct xfs_icreate_log {
#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_PPTR_SET 4 /* Set parent pointer */
+#define XFS_ATTRI_OP_FLAGS_PPTR_REMOVE 5 /* Remove parent pointer */
+#define XFS_ATTRI_OP_FLAGS_PPTR_REPLACE 6 /* Replace parent pointer */
#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
/*
@@ -974,6 +1039,7 @@ struct xfs_icreate_log {
*/
#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
XFS_ATTR_SECURE | \
+ XFS_ATTR_PARENT | \
XFS_ATTR_INCOMPLETE)
/*
@@ -983,11 +1049,22 @@ struct xfs_icreate_log {
struct xfs_attri_log_format {
uint16_t alfi_type; /* attri log item type */
uint16_t alfi_size; /* size of this item */
- uint32_t __pad; /* pad to 64 bit aligned */
+ uint32_t alfi_igen; /* generation of alfi_ino for pptr ops */
uint64_t alfi_id; /* attri identifier */
uint64_t alfi_ino; /* the inode for this attr operation */
uint32_t alfi_op_flags; /* marks the op as a set or remove */
- uint32_t alfi_name_len; /* attr name length */
+ union {
+ uint32_t alfi_name_len; /* attr name length */
+ struct {
+ /*
+ * For PPTR_REPLACE, these are the lengths of the old
+ * and new attr names. The new and old values must
+ * have the same length.
+ */
+ uint16_t alfi_old_name_len;
+ uint16_t alfi_new_name_len;
+ };
+ };
uint32_t alfi_value_len; /* attr value length */
uint32_t alfi_attr_filter;/* attr filter flags */
};
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 9fe7a9564bca..521d327e4c89 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -75,6 +75,8 @@ extern const struct xlog_recover_item_ops xlog_cui_item_ops;
extern const struct xlog_recover_item_ops xlog_cud_item_ops;
extern const struct xlog_recover_item_ops xlog_attri_item_ops;
extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmi_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmd_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
@@ -121,6 +123,8 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_inode **ipp);
+int xlog_recover_iget_handle(struct xfs_mount *mp, xfs_ino_t ino, uint32_t gen,
+ struct xfs_inode **ipp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
int xlog_alloc_buf_cancel_table(struct xlog *log);
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 9975b93a7412..d3bd6a86c8fe 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -17,6 +17,34 @@
#include "xfs_trace.h"
/*
+ * Shortly after enabling the large extents count feature in 2023, longstanding
+ * bugs were found in the code that computes the minimum log size. Luckily,
+ * the bugs resulted in over-estimates of that size, so there's no impact to
+ * existing users. However, we don't want to reduce the minimum log size
+ * because that can create the situation where a newer mkfs writes a new
+ * filesystem that an older kernel won't mount.
+ *
+ * Several years prior, we also discovered that the transaction reservations
+ * for rmap and reflink operations were unnecessarily large. That was fixed,
+ * but the minimum log size computation was left alone to avoid the
+ * compatibility problems noted above. Fix that too.
+ *
+ * Therefore, we may only correct the computation starting with filesystem
+ * features that didn't exist in 2023. In other words, only turn this on if
+ * the filesystem has parent pointers.
+ *
+ * This function can be called before the XFS_HAS_* flags have been set up,
+ * (e.g. mkfs) so we must check the ondisk superblock.
+ */
+static inline bool
+xfs_want_minlogsize_fixes(
+ struct xfs_sb *sb)
+{
+ return xfs_sb_is_v5(sb) &&
+ xfs_sb_has_incompat_feature(sb, XFS_SB_FEAT_INCOMPAT_PARENT);
+}
+
+/*
* Calculate the maximum length in bytes that would be required for a local
* attribute value as large attributes out of line are not logged.
*/
@@ -31,6 +59,15 @@ xfs_log_calc_max_attrsetm_res(
MAXNAMELEN - 1;
nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
nblks += XFS_B_TO_FSB(mp, size);
+
+ /*
+ * If the feature set is new enough, correct a unit conversion error in
+ * the xattr transaction reservation code that resulted in oversized
+ * minimum log size computations.
+ */
+ if (xfs_want_minlogsize_fixes(&mp->m_sb))
+ size = XFS_B_TO_FSB(mp, size);
+
nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
return M_RES(mp)->tr_attrsetm.tr_logres +
@@ -49,6 +86,15 @@ xfs_log_calc_trans_resv_for_minlogblocks(
unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
/*
+ * If the feature set is new enough, drop the oversized minimum log
+ * size computation introduced by the original reflink code.
+ */
+ if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
+ xfs_trans_resv_calc(mp, resv);
+ return;
+ }
+
+ /*
* In the early days of rmap+reflink, we always set the rmap maxlevels
* to 9 even if the AG was small enough that it would never grow to
* that height. Transaction reservation sizes influence the minimum
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 81885a6a028e..e8cdd77d03fa 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -119,6 +119,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12);
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
@@ -155,6 +156,11 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
+ /* parent pointer ioctls */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_by_handle, 64);
+
/*
* The v5 superblock format extended several v4 header structures with
* additional data. While new fields are only accessible on v5
diff --git a/fs/xfs/libxfs/xfs_parent.c b/fs/xfs/libxfs/xfs_parent.c
new file mode 100644
index 000000000000..69366c44a701
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_parent.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022-2024 Oracle.
+ * All rights reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_da_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_sf.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log.h"
+#include "xfs_xattr.h"
+#include "xfs_parent.h"
+#include "xfs_trans_space.h"
+#include "xfs_attr_item.h"
+#include "xfs_health.h"
+
+struct kmem_cache *xfs_parent_args_cache;
+
+/*
+ * Parent pointer attribute handling.
+ *
+ * Because the attribute name is a filename component, it will never be longer
+ * than 255 bytes and must not contain nulls or slashes. These are roughly the
+ * same constraints that apply to attribute names.
+ *
+ * The attribute value must always be a struct xfs_parent_rec. This means the
+ * attribute will never be in remote format because 12 bytes is nowhere near
+ * xfs_attr_leaf_entsize_local_max() (~75% of block size).
+ *
+ * Creating a new parent attribute will always create a new attribute - there
+ * should never, ever be an existing attribute in the tree for a new inode.
+ * ENOSPC behavior is problematic - creating the inode without the parent
+ * pointer is effectively a corruption, so we allow parent attribute creation
+ * to dip into the reserve block pool to avoid unexpected ENOSPC errors from
+ * occurring.
+ */
+
+/* Return true if parent pointer attr name is valid. */
+bool
+xfs_parent_namecheck(
+ unsigned int attr_flags,
+ const void *name,
+ size_t length)
+{
+ /*
+ * Parent pointers always use logged operations, so there should never
+ * be incomplete xattrs.
+ */
+ if (attr_flags & XFS_ATTR_INCOMPLETE)
+ return false;
+
+ return xfs_dir2_namecheck(name, length);
+}
+
+/* Return true if parent pointer attr value is valid. */
+bool
+xfs_parent_valuecheck(
+ struct xfs_mount *mp,
+ const void *value,
+ size_t valuelen)
+{
+ const struct xfs_parent_rec *rec = value;
+
+ if (!xfs_has_parent(mp))
+ return false;
+
+ /* The xattr value must be a parent record. */
+ if (valuelen != sizeof(struct xfs_parent_rec))
+ return false;
+
+ /* The parent record must be local. */
+ if (value == NULL)
+ return false;
+
+ /* The parent inumber must be valid. */
+ if (!xfs_verify_dir_ino(mp, be64_to_cpu(rec->p_ino)))
+ return false;
+
+ return true;
+}
+
+/* Compute the attribute name hash for a parent pointer. */
+xfs_dahash_t
+xfs_parent_hashval(
+ struct xfs_mount *mp,
+ const uint8_t *name,
+ int namelen,
+ xfs_ino_t parent_ino)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+
+ /*
+ * Use the same dirent name hash as would be used on the directory, but
+ * mix in the parent inode number to avoid collisions on hardlinked
+ * files with identical names but different parents.
+ */
+ return xfs_dir2_hashname(mp, &xname) ^
+ upper_32_bits(parent_ino) ^ lower_32_bits(parent_ino);
+}
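
The mixing step is an XOR fold of the 64-bit parent inumber into the 32-bit name hash; a toy standalone illustration (the rotate-and-xor name hash is a stand-in patterned after xfs_da_hashname, since the real xfs_dir2_hashname needs a mount):

#include <stdint.h>
#include <stdio.h>

/* Stand-in dirent name hash: hash = name[i] ^ rol32(hash, 7). */
static uint32_t toy_namehash(const char *name)
{
	uint32_t hash = 0;

	for (; *name; name++)
		hash = (uint8_t)*name ^ ((hash << 7) | (hash >> 25));
	return hash;
}

int main(void)
{
	uint64_t parent_ino = 0x123456789abcULL;	/* hypothetical inumber */
	uint32_t hash;

	/* Fold upper and lower 32 bits of the parent inumber into the hash. */
	hash = toy_namehash("foo") ^
	       (uint32_t)(parent_ino >> 32) ^ (uint32_t)parent_ino;
	printf("parent pointer hash: 0x%x\n", hash);
	return 0;
}
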
+
+/* Compute the attribute name hash from the xattr components. */
+xfs_dahash_t
+xfs_parent_hashattr(
+ struct xfs_mount *mp,
+ const uint8_t *name,
+ int namelen,
+ const void *value,
+ int valuelen)
+{
+ const struct xfs_parent_rec *rec = value;
+
+ /* Requires a local attr value in xfs_parent_rec format */
+ if (valuelen != sizeof(struct xfs_parent_rec)) {
+ ASSERT(valuelen == sizeof(struct xfs_parent_rec));
+ return 0;
+ }
+
+ if (!value) {
+ ASSERT(value != NULL);
+ return 0;
+ }
+
+ return xfs_parent_hashval(mp, name, namelen, be64_to_cpu(rec->p_ino));
+}
+
+/*
+ * Initialize the parent pointer arguments structure. Caller must have zeroed
+ * the contents of @args. @tp is only required for updates.
+ */
+static void
+xfs_parent_da_args_init(
+ struct xfs_da_args *args,
+ struct xfs_trans *tp,
+ struct xfs_parent_rec *rec,
+ struct xfs_inode *child,
+ xfs_ino_t owner,
+ const struct xfs_name *parent_name)
+{
+ args->geo = child->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->attr_filter = XFS_ATTR_PARENT;
+ args->op_flags = XFS_DA_OP_LOGGED | XFS_DA_OP_OKNOENT;
+ args->trans = tp;
+ args->dp = child;
+ args->owner = owner;
+ args->name = parent_name->name;
+ args->namelen = parent_name->len;
+ args->value = rec;
+ args->valuelen = sizeof(struct xfs_parent_rec);
+ xfs_attr_sethash(args);
+}
+
+/* Make sure the incore state is ready for a parent pointer query/update. */
+static inline int
+xfs_parent_iread_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *child)
+{
+ /* Parent pointers require that the attr fork must exist. */
+ if (XFS_IS_CORRUPT(child->i_mount, !xfs_inode_has_attr_fork(child))) {
+ xfs_inode_mark_sick(child, XFS_SICK_INO_PARENT);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_iread_extents(tp, child, XFS_ATTR_FORK);
+}
+
+/* Add a parent pointer to reflect a dirent addition. */
+int
+xfs_parent_addname(
+ struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp,
+ const struct xfs_name *parent_name,
+ struct xfs_inode *child)
+{
+ int error;
+
+ error = xfs_parent_iread_extents(tp, child);
+ if (error)
+ return error;
+
+ xfs_inode_to_parent_rec(&ppargs->rec, dp);
+ xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child,
+ child->i_ino, parent_name);
+ xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_SET);
+ return 0;
+}
+
+/* Remove a parent pointer to reflect a dirent removal. */
+int
+xfs_parent_removename(
+ struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp,
+ const struct xfs_name *parent_name,
+ struct xfs_inode *child)
+{
+ int error;
+
+ error = xfs_parent_iread_extents(tp, child);
+ if (error)
+ return error;
+
+ xfs_inode_to_parent_rec(&ppargs->rec, dp);
+ xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child,
+ child->i_ino, parent_name);
+ xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REMOVE);
+ return 0;
+}
+
+/* Replace one parent pointer with another to reflect a rename. */
+int
+xfs_parent_replacename(
+ struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *old_dp,
+ const struct xfs_name *old_name,
+ struct xfs_inode *new_dp,
+ const struct xfs_name *new_name,
+ struct xfs_inode *child)
+{
+ int error;
+
+ error = xfs_parent_iread_extents(tp, child);
+ if (error)
+ return error;
+
+ xfs_inode_to_parent_rec(&ppargs->rec, old_dp);
+ xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child,
+ child->i_ino, old_name);
+
+ xfs_inode_to_parent_rec(&ppargs->new_rec, new_dp);
+ ppargs->args.new_name = new_name->name;
+ ppargs->args.new_namelen = new_name->len;
+ ppargs->args.new_value = &ppargs->new_rec;
+ ppargs->args.new_valuelen = sizeof(struct xfs_parent_rec);
+ xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REPLACE);
+ return 0;
+}
+
+/*
+ * Extract parent pointer information from any parent pointer xattr into
+ * @parent_ino/gen. The last two parameters can be NULL pointers.
+ *
+ * Returns 0 and fills out the parameters if the xattr is a valid parent
+ * pointer, or -EFSCORRUPTED for garbage.
+ */
+int
+xfs_parent_from_attr(
+ struct xfs_mount *mp,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ xfs_ino_t *parent_ino,
+ uint32_t *parent_gen)
+{
+ const struct xfs_parent_rec *rec = value;
+
+ ASSERT(attr_flags & XFS_ATTR_PARENT);
+
+ if (!xfs_parent_namecheck(attr_flags, name, namelen))
+ return -EFSCORRUPTED;
+ if (!xfs_parent_valuecheck(mp, value, valuelen))
+ return -EFSCORRUPTED;
+
+ if (parent_ino)
+ *parent_ino = be64_to_cpu(rec->p_ino);
+ if (parent_gen)
+ *parent_gen = be32_to_cpu(rec->p_gen);
+ return 0;
+}
+
+/*
+ * Look up a parent pointer record (@parent_name -> @pptr) of @ip.
+ *
+ * Caller must hold at least ILOCK_SHARED. The scratchpad need not be
+ * initialized.
+ *
+ * Returns 0 if the pointer is found, -ENOATTR if there is no match, or a
+ * negative errno.
+ */
+int
+xfs_parent_lookup(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ const struct xfs_name *parent_name,
+ struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch)
+{
+ memset(scratch, 0, sizeof(struct xfs_da_args));
+ xfs_parent_da_args_init(scratch, tp, pptr, ip, ip->i_ino, parent_name);
+ return xfs_attr_get_ilocked(scratch);
+}
+
+/* Sanity-check a parent pointer before we try to perform repairs. */
+static inline bool
+xfs_parent_sanity_check(
+ struct xfs_mount *mp,
+ const struct xfs_name *parent_name,
+ const struct xfs_parent_rec *pptr)
+{
+ if (!xfs_parent_namecheck(XFS_ATTR_PARENT, parent_name->name,
+ parent_name->len))
+ return false;
+
+ if (!xfs_parent_valuecheck(mp, pptr, sizeof(*pptr)))
+ return false;
+
+ return true;
+}
+
+/*
+ * Attach the parent pointer (@parent_name -> @pptr) to @ip immediately.
+ * Caller must not have a transaction or hold the ILOCK. This is for
+ * specialized repair functions only. The scratchpad need not be initialized.
+ */
+int
+xfs_parent_set(
+ struct xfs_inode *ip,
+ xfs_ino_t owner,
+ const struct xfs_name *parent_name,
+ struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch)
+{
+ if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ memset(scratch, 0, sizeof(struct xfs_da_args));
+ xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name);
+ return xfs_attr_set(scratch, XFS_ATTRUPDATE_CREATE, false);
+}
+
+/*
+ * Remove the parent pointer (@parent_name -> @pptr) from @ip immediately.
+ * Caller must not have a transaction or hold the ILOCK. This is for
+ * specialized repair functions only. The scratchpad need not be initialized.
+ */
+int
+xfs_parent_unset(
+ struct xfs_inode *ip,
+ xfs_ino_t owner,
+ const struct xfs_name *parent_name,
+ struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch)
+{
+ if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ memset(scratch, 0, sizeof(struct xfs_da_args));
+ xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name);
+ return xfs_attr_set(scratch, XFS_ATTRUPDATE_REMOVE, false);
+}
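For the repair-only helpers above, a hedged sketch of the expected call shape follows; the allocation strategy, helper name, and surrounding repair context are assumptions for the example, not code from this patch:

/*
 * Sketch: re-create a missing parent pointer during repair, with no
 * transaction held and a heap-allocated scratch xfs_da_args.
 */
static int
example_repair_add_pptr(
	struct xfs_inode	*ip,
	struct xfs_inode	*dp,
	const struct xfs_name	*name)
{
	struct xfs_parent_rec	rec;
	struct xfs_da_args	*scratch;
	int			error;

	scratch = kmalloc(sizeof(*scratch), GFP_KERNEL);
	if (!scratch)
		return -ENOMEM;

	/* Point the record at the parent directory... */
	xfs_inode_to_parent_rec(&rec, dp);

	/* ...and attach it immediately. */
	error = xfs_parent_set(ip, ip->i_ino, name, &rec, scratch);
	kfree(scratch);
	return error;
}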
diff --git a/fs/xfs/libxfs/xfs_parent.h b/fs/xfs/libxfs/xfs_parent.h
new file mode 100644
index 000000000000..b8036527cdc7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_parent.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022-2024 Oracle.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_PARENT_H__
+#define __XFS_PARENT_H__
+
+/* Metadata validators */
+bool xfs_parent_namecheck(unsigned int attr_flags, const void *name,
+ size_t length);
+bool xfs_parent_valuecheck(struct xfs_mount *mp, const void *value,
+ size_t valuelen);
+
+xfs_dahash_t xfs_parent_hashval(struct xfs_mount *mp, const uint8_t *name,
+ int namelen, xfs_ino_t parent_ino);
+xfs_dahash_t xfs_parent_hashattr(struct xfs_mount *mp, const uint8_t *name,
+ int namelen, const void *value, int valuelen);
+
+/* Initialize an xfs_parent_rec to be stored as a parent pointer attr value. */
+static inline void
+xfs_parent_rec_init(
+ struct xfs_parent_rec *rec,
+ xfs_ino_t ino,
+ uint32_t gen)
+{
+ rec->p_ino = cpu_to_be64(ino);
+ rec->p_gen = cpu_to_be32(gen);
+}
+
+/* Initialize an xfs_parent_rec from the parent directory inode. */
+static inline void
+xfs_inode_to_parent_rec(
+ struct xfs_parent_rec *rec,
+ const struct xfs_inode *dp)
+{
+ xfs_parent_rec_init(rec, dp->i_ino, VFS_IC(dp)->i_generation);
+}
+
+extern struct kmem_cache *xfs_parent_args_cache;
+
+/*
+ * Parent pointer information needed to pass around the deferred xattr update
+ * machinery.
+ */
+struct xfs_parent_args {
+ struct xfs_parent_rec rec;
+ struct xfs_parent_rec new_rec;
+ struct xfs_da_args args;
+};
+
+/*
+ * Start a parent pointer update by allocating the context object needed to
+ * track the deferred xattr update.
+ */
+static inline int
+xfs_parent_start(
+ struct xfs_mount *mp,
+ struct xfs_parent_args **ppargsp)
+{
+ if (!xfs_has_parent(mp)) {
+ *ppargsp = NULL;
+ return 0;
+ }
+
+ *ppargsp = kmem_cache_zalloc(xfs_parent_args_cache, GFP_KERNEL);
+ if (!*ppargsp)
+ return -ENOMEM;
+ return 0;
+}
+
+/* Finish a parent pointer update by freeing the context object. */
+static inline void
+xfs_parent_finish(
+ struct xfs_mount *mp,
+ struct xfs_parent_args *ppargs)
+{
+ if (ppargs)
+ kmem_cache_free(xfs_parent_args_cache, ppargs);
+}
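Putting the pieces together, a directory operation brackets its parent pointer work with these helpers. The following is the intended call pattern only (a sketch; transaction construction and the dirent update are elided and assumed):

/*
 * Illustrative call pattern for a link operation with parent pointers.
 */
static int
example_link_with_pptr(
	struct xfs_trans	*tp,
	struct xfs_inode	*dp,
	const struct xfs_name	*name,
	struct xfs_inode	*child)
{
	struct xfs_parent_args	*ppargs;
	int			error;

	/* ppargs comes back NULL if the parent feature is disabled. */
	error = xfs_parent_start(dp->i_mount, &ppargs);
	if (error)
		return error;

	/* ... create the directory entry under @tp here ... */

	if (ppargs)
		error = xfs_parent_addname(tp, ppargs, dp, name, child);

	/*
	 * The deferred xattr update references ppargs, so real callers
	 * commit @tp before freeing the context.
	 */
	xfs_parent_finish(dp->i_mount, ppargs);
	return error;
}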
+
+int xfs_parent_addname(struct xfs_trans *tp, struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp, const struct xfs_name *parent_name,
+ struct xfs_inode *child);
+int xfs_parent_removename(struct xfs_trans *tp, struct xfs_parent_args *ppargs,
+ struct xfs_inode *dp, const struct xfs_name *parent_name,
+ struct xfs_inode *child);
+int xfs_parent_replacename(struct xfs_trans *tp,
+ struct xfs_parent_args *ppargs,
+ struct xfs_inode *old_dp, const struct xfs_name *old_name,
+ struct xfs_inode *new_dp, const struct xfs_name *new_name,
+ struct xfs_inode *child);
+
+int xfs_parent_from_attr(struct xfs_mount *mp, unsigned int attr_flags,
+ const unsigned char *name, unsigned int namelen,
+ const void *value, unsigned int valuelen,
+ xfs_ino_t *parent_ino, uint32_t *parent_gen);
+
+/* Repair functions */
+int xfs_parent_lookup(struct xfs_trans *tp, struct xfs_inode *ip,
+ const struct xfs_name *name, struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch);
+int xfs_parent_set(struct xfs_inode *ip, xfs_ino_t owner,
+ const struct xfs_name *name, struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch);
+int xfs_parent_unset(struct xfs_inode *ip, xfs_ino_t owner,
+ const struct xfs_name *name, struct xfs_parent_rec *pptr,
+ struct xfs_da_args *scratch);
+
+#endif /* __XFS_PARENT_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f246d6dbf4ec..386b672c5058 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1168,3 +1168,60 @@ xfs_rtsummary_wordcount(
blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
}
+
+/*
+ * Lock both realtime free space metadata inodes for a freespace update. If a
+ * transaction is given, the inodes will be joined to the transaction and the
+ * ILOCKs will be released on transaction commit.
+ */
+void
+xfs_rtbitmap_lock(
+ struct xfs_trans *tp,
+ struct xfs_mount *mp)
+{
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+ if (tp)
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ if (tp)
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+}
+
+/* Unlock both realtime free space metadata inodes after a freespace update. */
+void
+xfs_rtbitmap_unlock(
+ struct xfs_mount *mp)
+{
+ xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+}
+
+/*
+ * Lock the realtime free space metadata inodes for a freespace scan. Callers
+ * must walk metadata blocks in order of increasing file offset.
+ */
+void
+xfs_rtbitmap_lock_shared(
+ struct xfs_mount *mp,
+ unsigned int rbmlock_flags)
+{
+ if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+
+ if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+}
+
+/* Unlock the realtime free space metadata inodes after a freespace scan. */
+void
+xfs_rtbitmap_unlock_shared(
+ struct xfs_mount *mp,
+ unsigned int rbmlock_flags)
+{
+ if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
+ xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+
+ if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+}
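A short sketch of the shared-lock scan pattern these helpers enable (the scan body and function name are assumptions; only the locking calls come from this patch):

/*
 * Sketch: walk the realtime bitmap file without touching the summary.
 */
static void
example_rtbitmap_scan(
	struct xfs_mount	*mp)
{
	xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);

	/* ... read bitmap blocks in order of increasing file offset ... */

	xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
}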
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 152a66750af5..6186585f2c37 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -360,6 +360,19 @@ xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
unsigned int rsumlevels, xfs_extlen_t rbmblocks);
unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+
+void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp);
+void xfs_rtbitmap_unlock(struct xfs_mount *mp);
+
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RBMLOCK_BITMAP (1U << 0)
+/* Lock the rt summary inode in shared mode */
+#define XFS_RBMLOCK_SUMMARY (1U << 1)
+
+void xfs_rtbitmap_lock_shared(struct xfs_mount *mp,
+ unsigned int rbmlock_flags);
+void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
+ unsigned int rbmlock_flags);
#else /* CONFIG_XFS_RT */
# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
@@ -378,6 +391,10 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
# define xfs_rtbitmap_wordcount(mp, r) (0)
# define xfs_rtsummary_blockcount(mp, l, b) (0)
# define xfs_rtsummary_wordcount(mp, l, b) (0)
+# define xfs_rtbitmap_lock(tp, mp) do { } while (0)
+# define xfs_rtbitmap_unlock(mp) do { } while (0)
+# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0)
+# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0)
#endif /* CONFIG_XFS_RT */
#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 73a4b895de67..09e4bf949bf8 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -26,6 +26,7 @@
#include "xfs_health.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
+#include "xfs_exchrange.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -175,6 +176,10 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_NEEDSREPAIR;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
features |= XFS_FEAT_NREXT64;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)
+ features |= XFS_FEAT_EXCHANGE_RANGE;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
+ features |= XFS_FEAT_PARENT;
return features;
}
@@ -1251,6 +1256,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
if (xfs_has_inobtcounts(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
+ if (xfs_has_parent(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT;
if (xfs_has_sector(mp)) {
geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
geo->logsectsize = sbp->sb_logsectsize;
@@ -1259,6 +1266,8 @@ xfs_fs_geometry(
}
if (xfs_has_large_extent_counts(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
+ if (xfs_has_exchange_range(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index dfd61fa8332e..34f104ed372c 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -124,7 +124,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_TRANS_RES_FDBLKS (1u << 6)
/* Transaction contains an intent done log item */
#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
-
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
@@ -136,7 +135,10 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
* for free space from AG 0. If the correct transaction reservations have been
* made then this algorithm will eventually find all the space it needs.
*/
-#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */
+#define XFS_TRANS_LOWMODE (1u << 8)
+
+/* Transaction has locked the rtbitmap and rtsum inodes */
+#define XFS_TRANS_RTBITMAP_LOCKED (1u << 9)
/*
* Field values for xfs_trans_mod_sb.
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index ffb1317a9212..f228127a88ff 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -169,7 +169,8 @@ xfs_symlink_local_to_remote(
struct xfs_trans *tp,
struct xfs_buf *bp,
struct xfs_inode *ip,
- struct xfs_ifork *ifp)
+ struct xfs_ifork *ifp,
+ void *priv)
{
struct xfs_mount *mp = ip->i_mount;
char *buf;
@@ -310,6 +311,7 @@ int
xfs_symlink_write_target(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ xfs_ino_t owner,
const char *target_path,
int pathlen,
xfs_fsblock_t fs_blocks,
@@ -364,8 +366,7 @@ xfs_symlink_write_target(
byte_cnt = min(byte_cnt, pathlen);
buf = bp->b_addr;
- buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt,
- bp);
+ buf += xfs_symlink_hdr_set(mp, owner, offset, byte_cnt, bp);
memcpy(buf, cur_chunk, byte_cnt);
@@ -380,3 +381,50 @@ xfs_symlink_write_target(
ASSERT(pathlen == 0);
return 0;
}
+
+/* Remove all the blocks from a symlink and invalidate buffers. */
+int
+xfs_symlink_remote_truncate(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *bp;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int done = 0;
+ int i;
+ int error;
+
+ /* Read mappings and invalidate buffers. */
+ error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0);
+ if (error)
+ return error;
+
+ for (i = 0; i < nmaps; i++) {
+ if (!xfs_bmap_is_real_extent(&mval[i]))
+ break;
+
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
+ &bp);
+ if (error)
+ return error;
+
+ xfs_trans_binval(tp, bp);
+ }
+
+ /* Unmap the remote blocks. */
+ error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done);
+ if (error)
+ return error;
+ if (!done) {
+ ASSERT(done);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h
index a63bd38ae4fa..c1672fe1f17b 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.h
+++ b/fs/xfs/libxfs/xfs_symlink_remote.h
@@ -16,11 +16,13 @@ int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
uint32_t size, struct xfs_buf *bp);
void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_inode *ip, struct xfs_ifork *ifp);
+ struct xfs_inode *ip, struct xfs_ifork *ifp,
+ void *priv);
xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
int xfs_symlink_remote_read(struct xfs_inode *ip, char *link);
int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip,
- const char *target_path, int pathlen, xfs_fsblock_t fs_blocks,
- uint resblks);
+ xfs_ino_t owner, const char *target_path, int pathlen,
+ xfs_fsblock_t fs_blocks, uint resblks);
+int xfs_symlink_remote_truncate(struct xfs_trans *tp, struct xfs_inode *ip);
#endif /* __XFS_SYMLINK_REMOTE_H */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6cd45e8c118d..6dbe6e7251e7 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -20,6 +20,9 @@
#include "xfs_qm.h"
#include "xfs_trans_space.h"
#include "xfs_rtbitmap.h"
+#include "xfs_attr_item.h"
+#include "xfs_log.h"
+#include "xfs_da_format.h"
#define _ALLOC true
#define _FREE false
@@ -422,29 +425,110 @@ xfs_calc_itruncate_reservation_minlogsize(
return xfs_calc_itruncate_reservation(mp, true);
}
+static inline unsigned int xfs_calc_pptr_link_overhead(void)
+{
+ return sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1);
+}
+
+static inline unsigned int xfs_calc_pptr_unlink_overhead(void)
+{
+ return sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1);
+}
+
+static inline unsigned int xfs_calc_pptr_replace_overhead(void)
+{
+ return sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1) +
+ xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) +
+ xlog_calc_iovec_len(MAXNAMELEN - 1);
+}
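To make the relationship between the three helpers concrete (a back-of-the-envelope sketch; exact byte counts depend on xlog_calc_iovec_len rounding):

/*
 * A link or unlink intent logs one xfs_attri_log_format header plus one
 * iovec for the xfs_parent_rec value and one for a worst-case name, so:
 *
 *	replace_overhead == link_overhead + unlink_overhead
 *				- sizeof(struct xfs_attri_log_format)
 *
 * because a replace logs a single intent header but two name/value pairs.
 */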
+
/*
* In renaming files we can modify:
* the five inodes involved: 5 * inode size
* the two directory btrees: 2 * (max depth + v2) * dir block size
* the two directory bmap btrees: 2 * max depth * block size
* And the bmap_finish transaction can free dir and bmap blocks (two sets
- * of bmap blocks) giving:
+ * of bmap blocks) giving (t2):
* the agf for the ags in which the blocks live: 3 * sector size
* the agfl for the ags in which the blocks live: 3 * sector size
* the superblock for the free block count: sector size
* the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ * If parent pointers are enabled (t3), then each transaction in the chain
+ * must be capable of setting or removing the extended attribute
+ * containing the parent information. It must also be able to handle
+ * the three xattr intent items that track the progress of the parent
+ * pointer update.
*/
STATIC uint
xfs_calc_rename_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- max((xfs_calc_inode_res(mp, 5) +
- xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
- XFS_FSB_TO_B(mp, 1))));
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ t1 = xfs_calc_inode_res(mp, 5) +
+ xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1));
+
+ t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
+ XFS_FSB_TO_B(mp, 1));
+
+ if (xfs_has_parent(mp)) {
+ unsigned int rename_overhead, exchange_overhead;
+
+ t3 = max(resp->tr_attrsetm.tr_logres,
+ resp->tr_attrrm.tr_logres);
+
+ /*
+ * For a standard rename, the three xattr intent log items
+ * are (1) replacing the pptr for the source file; (2)
+ * removing the pptr on the dest file; and (3) adding a
+ * pptr for the whiteout file in the src dir.
+ *
+ * For a RENAME_EXCHANGE, there are two xattr intent
+ * items to replace the pptr for both src and dest
+ * files. Link counts don't change and there is no
+ * whiteout.
+ *
+ * In the worst case we can end up relogging all log
+ * intent items to allow the log tail to move ahead, so
+ * they become overhead added to each transaction in a
+ * processing chain.
+ */
+ rename_overhead = xfs_calc_pptr_replace_overhead() +
+ xfs_calc_pptr_unlink_overhead() +
+ xfs_calc_pptr_link_overhead();
+ exchange_overhead = 2 * xfs_calc_pptr_replace_overhead();
+
+ overhead += max(rename_overhead, exchange_overhead);
+ }
+
+ return overhead + max3(t1, t2, t3);
+}
+
+static inline unsigned int
+xfs_rename_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ /* One for the rename, one more for freeing blocks */
+ unsigned int ret = XFS_RENAME_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to remove or add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += max(resp->tr_attrsetm.tr_logcount,
+ resp->tr_attrrm.tr_logcount);
+
+ return ret;
}
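An illustrative note on why the count matters (not text from the patch):

/*
 * Sketch of the resulting shape: a permanent reservation provides
 * roughly tr_logres * tr_logcount bytes of log space across transaction
 * rolls, so adding the attr log counts here is what lets a rename carry
 * its parent pointer updates through the whole chain without taking a
 * new reservation.
 */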
/*
@@ -461,6 +545,23 @@ xfs_calc_iunlink_remove_reservation(
2 * M_IGEO(mp)->inode_cluster_size;
}
+static inline unsigned int
+xfs_link_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_LINK_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
+
/*
* For creating a link to an inode:
* the parent directory inode: inode size
@@ -477,14 +578,23 @@ STATIC uint
xfs_calc_link_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- xfs_calc_iunlink_remove_reservation(mp) +
- max((xfs_calc_inode_res(mp, 2) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ overhead += xfs_calc_iunlink_remove_reservation(mp);
+ t1 = xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+
+ if (xfs_has_parent(mp)) {
+ t3 = resp->tr_attrsetm.tr_logres;
+ overhead += xfs_calc_pptr_link_overhead();
+ }
+
+ return overhead + max3(t1, t2, t3);
}
/*
@@ -499,6 +609,23 @@ xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
M_IGEO(mp)->inode_cluster_size;
}
+static inline unsigned int
+xfs_remove_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_REMOVE_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrrm.tr_logcount;
+
+ return ret;
+}
+
/*
* For removing a directory entry we can modify:
* the parent directory inode: inode size
@@ -515,14 +642,24 @@ STATIC uint
xfs_calc_remove_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- xfs_calc_iunlink_add_reservation(mp) +
- max((xfs_calc_inode_res(mp, 2) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
- XFS_FSB_TO_B(mp, 1))));
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ overhead += xfs_calc_iunlink_add_reservation(mp);
+
+ t1 = xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
+ XFS_FSB_TO_B(mp, 1));
+
+ if (xfs_has_parent(mp)) {
+ t3 = resp->tr_attrrm.tr_logres;
+ overhead += xfs_calc_pptr_unlink_overhead();
+ }
+
+ return overhead + max3(t1, t2, t3);
}
/*
@@ -571,12 +708,40 @@ xfs_calc_icreate_resv_alloc(
xfs_calc_finobt_res(mp);
}
+static inline unsigned int
+xfs_icreate_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_CREATE_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
+
STATIC uint
-xfs_calc_icreate_reservation(xfs_mount_t *mp)
+xfs_calc_icreate_reservation(
+ struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- max(xfs_calc_icreate_resv_alloc(mp),
- xfs_calc_create_resv_modify(mp));
+ struct xfs_trans_resv *resp = M_RES(mp);
+ unsigned int overhead = XFS_DQUOT_LOGRES(mp);
+ unsigned int t1, t2, t3 = 0;
+
+ t1 = xfs_calc_icreate_resv_alloc(mp);
+ t2 = xfs_calc_create_resv_modify(mp);
+
+ if (xfs_has_parent(mp)) {
+ t3 = resp->tr_attrsetm.tr_logres;
+ overhead += xfs_calc_pptr_link_overhead();
+ }
+
+ return overhead + max3(t1, t2, t3);
}
STATIC uint
@@ -589,6 +754,23 @@ xfs_calc_create_tmpfile_reservation(
return res + xfs_calc_iunlink_add_reservation(mp);
}
+static inline unsigned int
+xfs_mkdir_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_MKDIR_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
+
/*
* Making a new directory is the same as creating a new file.
*/
@@ -599,6 +781,22 @@ xfs_calc_mkdir_reservation(
return xfs_calc_icreate_reservation(mp);
}
+static inline unsigned int
+xfs_symlink_log_count(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ unsigned int ret = XFS_SYMLINK_LOG_COUNT;
+
+ /*
+ * Pre-reserve enough log reservation to handle the transaction
+ * rolling needed to add one parent pointer.
+ */
+ if (xfs_has_parent(mp))
+ ret += resp->tr_attrsetm.tr_logcount;
+
+ return ret;
+}
/*
* Making a new symlink is the same as creating a new file, but
@@ -911,54 +1109,76 @@ xfs_calc_sb_reservation(
return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
}
-void
-xfs_trans_resv_calc(
+/*
+ * Namespace reservations.
+ *
+ * These get tricky when parent pointers are enabled as we have attribute
+ * modifications occurring from within these transactions. Rather than confuse
+ * each of these reservation calculations with the conditional attribute
+ * reservations, add them here in a clear and concise manner. This requires that
+ * the attribute reservations have already been calculated.
+ *
+ * Note that we only include the static attribute reservation here; the runtime
+ * reservation will have to be modified by the size of the attributes being
+ * added/removed/modified. See the comments on the attribute reservation
+ * calculations for more details.
+ */
+STATIC void
+xfs_calc_namespace_reservations(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
- int logcount_adj = 0;
-
- /*
- * The following transactions are logged in physical format and
- * require a permanent reservation on space.
- */
- resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
- resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
- resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
- resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+ ASSERT(resp->tr_attrsetm.tr_logres > 0);
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
- resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+ resp->tr_rename.tr_logcount = xfs_rename_log_count(mp, resp);
resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
- resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+ resp->tr_link.tr_logcount = xfs_link_log_count(mp, resp);
resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
- resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+ resp->tr_remove.tr_logcount = xfs_remove_log_count(mp, resp);
resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
- resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+ resp->tr_symlink.tr_logcount = xfs_symlink_log_count(mp, resp);
resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp);
- resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+ resp->tr_create.tr_logcount = xfs_icreate_log_count(mp, resp);
resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+ resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+ resp->tr_mkdir.tr_logcount = xfs_mkdir_log_count(mp, resp);
+ resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+}
+
+void
+xfs_trans_resv_calc(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ int logcount_adj = 0;
+
+ /*
+ * The following transactions are logged in physical format and
+ * require a permanent reservation on space.
+ */
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
resp->tr_create_tmpfile.tr_logres =
xfs_calc_create_tmpfile_reservation(mp);
resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
- resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
- resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
@@ -988,6 +1208,8 @@ xfs_trans_resv_calc(
resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+ xfs_calc_namespace_reservations(mp, resp);
+
/*
* The following transactions are logged in logical format with
* a default log count.
diff --git a/fs/xfs/libxfs/xfs_trans_space.c b/fs/xfs/libxfs/xfs_trans_space.c
new file mode 100644
index 000000000000..b9dc3752f702
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_space.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_da_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+
+/* Calculate the disk space required to add a parent pointer. */
+unsigned int
+xfs_parent_calc_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ /*
+ * Parent pointers are always the first attr in an attr tree, and never
+	 * larger than a block.
+ */
+ return XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) +
+ XFS_NEXTENTADD_SPACE_RES(mp, namelen, XFS_ATTR_FORK);
+}
+
+unsigned int
+xfs_create_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int ret;
+
+ ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen);
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_mkdir_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ return xfs_create_space_res(mp, namelen);
+}
+
+unsigned int
+xfs_link_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int ret;
+
+ ret = XFS_DIRENTER_SPACE_RES(mp, namelen);
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_symlink_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen,
+ unsigned int fsblocks)
+{
+ unsigned int ret;
+
+ ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen) +
+ fsblocks;
+
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_remove_space_res(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int ret = XFS_DIRREMOVE_SPACE_RES(mp);
+
+ if (xfs_has_parent(mp))
+ ret += xfs_parent_calc_space_res(mp, namelen);
+
+ return ret;
+}
+
+unsigned int
+xfs_rename_space_res(
+ struct xfs_mount *mp,
+ unsigned int src_namelen,
+ bool target_exists,
+ unsigned int target_namelen,
+ bool has_whiteout)
+{
+ unsigned int ret;
+
+ ret = XFS_DIRREMOVE_SPACE_RES(mp) +
+ XFS_DIRENTER_SPACE_RES(mp, target_namelen);
+
+ if (xfs_has_parent(mp)) {
+ if (has_whiteout)
+ ret += xfs_parent_calc_space_res(mp, src_namelen);
+ ret += 2 * xfs_parent_calc_space_res(mp, target_namelen);
+ }
+
+ if (target_exists)
+ ret += xfs_parent_calc_space_res(mp, target_namelen);
+
+ return ret;
+}
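A hedged sketch of how a caller would feed this into transaction allocation (the call-site shape, the has_whiteout choice, and the function name are assumptions; only the space calculation comes from this file):

/*
 * Sketch: reserve blocks for a rename that also creates a whiteout.
 */
static int
example_alloc_rename_trans(
	struct xfs_mount	*mp,
	struct xfs_name		*src_name,
	struct xfs_name		*target_name,
	bool			target_exists,
	struct xfs_trans	**tpp)
{
	unsigned int		spaceres;

	spaceres = xfs_rename_space_res(mp, src_name->len, target_exists,
			target_name->len, true /* has_whiteout */);

	return xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres,
			0, 0, tpp);
}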
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 87b31c69a773..1155ff2d37e2 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -10,6 +10,10 @@
* Components of space reservations.
*/
+/* Worst case number of bmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \
+ (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0]))
+
/* Worst case number of rmaps that can be held in a block. */
#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
@@ -76,31 +80,32 @@
/* This macro is not used - see inline code in xfs_attr_set */
#define XFS_ATTRSET_SPACE_RES(mp, v) \
(XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
-#define XFS_CREATE_SPACE_RES(mp,nl) \
- (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
(2 * (mp)->m_alloc_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
-#define XFS_LINK_SPACE_RES(mp,nl) \
- XFS_DIRENTER_SPACE_RES(mp,nl)
-#define XFS_MKDIR_SPACE_RES(mp,nl) \
- (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
#define XFS_QM_DQALLOC_SPACE_RES(mp) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
XFS_DQUOT_CLUSTER_SIZE_FSB)
#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
XFS_IALLOC_SPACE_RES(mp)
-#define XFS_REMOVE_SPACE_RES(mp) \
- XFS_DIRREMOVE_SPACE_RES(mp)
-#define XFS_RENAME_SPACE_RES(mp,nl) \
- (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
-#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
- (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
#define XFS_IFREE_SPACE_RES(mp) \
(xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0)
+unsigned int xfs_parent_calc_space_res(struct xfs_mount *mp,
+ unsigned int namelen);
+
+unsigned int xfs_create_space_res(struct xfs_mount *mp, unsigned int namelen);
+unsigned int xfs_mkdir_space_res(struct xfs_mount *mp, unsigned int namelen);
+unsigned int xfs_link_space_res(struct xfs_mount *mp, unsigned int namelen);
+unsigned int xfs_symlink_space_res(struct xfs_mount *mp, unsigned int namelen,
+ unsigned int fsblocks);
+unsigned int xfs_remove_space_res(struct xfs_mount *mp, unsigned int namelen);
+
+unsigned int xfs_rename_space_res(struct xfs_mount *mp,
+ unsigned int src_namelen, bool target_exists,
+ unsigned int target_namelen, bool has_whiteout);
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index e954f07679dd..f8e5b67128d2 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -15,6 +15,7 @@
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_inode.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -165,8 +166,7 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
/* Check sb_versionnum bits that are set at mkfs time. */
- vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS |
- XFS_SB_VERSION_NUMBITS |
+ vernum_mask = cpu_to_be16(XFS_SB_VERSION_NUMBITS |
XFS_SB_VERSION_ALIGNBIT |
XFS_SB_VERSION_DALIGNBIT |
XFS_SB_VERSION_SHAREDBIT |
@@ -865,6 +865,43 @@ xchk_agi_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Check the unlinked buckets for links to bad inodes. We hold the AGI, so
+ * there cannot be any threads updating unlinked list pointers in this AG.
+ */
+STATIC void
+xchk_iunlink(
+ struct xfs_scrub *sc,
+ struct xfs_agi *agi)
+{
+ unsigned int i;
+ struct xfs_inode *ip;
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ xfs_agino_t agino = be32_to_cpu(agi->agi_unlinked[i]);
+
+ while (agino != NULLAGINO) {
+ if (agino % XFS_AGI_UNLINKED_BUCKETS != i) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ ip = xfs_iunlink_lookup(sc->sa.pag, agino);
+ if (!ip) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ if (!xfs_inode_on_unlinked_list(ip)) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ agino = ip->i_next_unlinked;
+ }
+ }
+}
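The invariant being enforced is plain modular hashing: with XFS_AGI_UNLINKED_BUCKETS fixed at 64, an unlinked inode with agino 131 may only appear on bucket 131 % 64 == 3 (a worked illustration; the numbers are hypothetical). Any chain entry that hashes to a different bucket, any agino that cannot be resolved to an incore inode, and any incore inode that does not believe it is on an unlinked list all mark the AGI buffer corrupt.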
+
/* Scrub the AGI. */
int
xchk_agi(
@@ -949,6 +986,8 @@ xchk_agi(
if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ xchk_iunlink(sc, agi);
+
xchk_agi_xref(sc);
out:
return error;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 427054b65b23..0dbc484b182f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -21,13 +21,18 @@
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ag.h"
+#include "xfs_inode.h"
+#include "xfs_iunlink_item.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
+#include "scrub/agino_bitmap.h"
#include "scrub/reap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
/* Superblock */
@@ -796,15 +801,57 @@ enum {
XREP_AGI_MAX
};
+#define XREP_AGI_LOOKUP_BATCH 32
+
+struct xrep_agi {
+ struct xfs_scrub *sc;
+
+ /* AGI buffer, tracked separately */
+ struct xfs_buf *agi_bp;
+
+ /* context for finding btree roots */
+ struct xrep_find_ag_btree fab[XREP_AGI_MAX];
+
+ /* old AGI contents in case we have to revert */
+ struct xfs_agi old_agi;
+
+ /* bitmap of which inodes are unlinked */
+ struct xagino_bitmap iunlink_bmp;
+
+ /* heads of the unlinked inode bucket lists */
+ xfs_agino_t iunlink_heads[XFS_AGI_UNLINKED_BUCKETS];
+
+ /* scratchpad for batched lookups of the radix tree */
+ struct xfs_inode *lookup_batch[XREP_AGI_LOOKUP_BATCH];
+
+ /* Map of ino -> next_ino for unlinked inode processing. */
+ struct xfarray *iunlink_next;
+
+ /* Map of ino -> prev_ino for unlinked inode processing. */
+ struct xfarray *iunlink_prev;
+};
+
+static void
+xrep_agi_buf_cleanup(
+ void *buf)
+{
+ struct xrep_agi *ragi = buf;
+
+ xfarray_destroy(ragi->iunlink_prev);
+ xfarray_destroy(ragi->iunlink_next);
+ xagino_bitmap_destroy(&ragi->iunlink_bmp);
+}
+
/*
* Given the inode btree roots described by *fab, find the roots, check them
* for sanity, and pass the root data back out via *fab.
*/
STATIC int
xrep_agi_find_btrees(
- struct xfs_scrub *sc,
- struct xrep_find_ag_btree *fab)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xrep_find_ag_btree *fab = ragi->fab;
struct xfs_buf *agf_bp;
struct xfs_mount *mp = sc->mp;
int error;
@@ -837,10 +884,11 @@ xrep_agi_find_btrees(
*/
STATIC void
xrep_agi_init_header(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp,
- struct xfs_agi *old_agi)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
+ struct xfs_agi *old_agi = &ragi->old_agi;
struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_perag *pag = sc->sa.pag;
struct xfs_mount *mp = sc->mp;
@@ -856,10 +904,6 @@ xrep_agi_init_header(
if (xfs_has_crc(mp))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
- /* We don't know how to fix the unlinked list yet. */
- memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked,
- sizeof(agi->agi_unlinked));
-
/* Mark the incore AGF data stale until we're done fixing things. */
ASSERT(xfs_perag_initialised_agi(pag));
clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
@@ -868,10 +912,12 @@ xrep_agi_init_header(
/* Set btree root information in an AGI. */
STATIC void
xrep_agi_set_roots(
- struct xfs_scrub *sc,
- struct xfs_agi *agi,
- struct xrep_find_ag_btree *fab)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_agi *agi = ragi->agi_bp->b_addr;
+ struct xrep_find_ag_btree *fab = ragi->fab;
+
agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
@@ -884,9 +930,10 @@ xrep_agi_set_roots(
/* Update the AGI counters. */
STATIC int
xrep_agi_calc_from_btrees(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
struct xfs_btree_cur *cur;
struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
@@ -928,12 +975,721 @@ err:
return error;
}
+/*
+ * Record a forwards unlinked chain pointer from agino -> next_agino in our
+ * staging information.
+ */
+static inline int
+xrep_iunlink_store_next(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino,
+ xfs_agino_t next_agino)
+{
+ ASSERT(next_agino != 0);
+
+ return xfarray_store(ragi->iunlink_next, agino, &next_agino);
+}
+
+/*
+ * Record a backwards unlinked chain pointer from prev_ino <- agino in our
+ * staging information.
+ */
+static inline int
+xrep_iunlink_store_prev(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino,
+ xfs_agino_t prev_agino)
+{
+ ASSERT(prev_agino != 0);
+
+ return xfarray_store(ragi->iunlink_prev, agino, &prev_agino);
+}
+
+/*
+ * Given an @agino, look up the next inode in the iunlink bucket. Returns
+ * NULLAGINO if we're at the end of the chain, 0 if @agino should be in memory
+ * but is not, or a per-AG inode number.
+ */
+static inline xfs_agino_t
+xrep_iunlink_next(
+ struct xfs_scrub *sc,
+ xfs_agino_t agino)
+{
+ struct xfs_inode *ip;
+
+ ip = xfs_iunlink_lookup(sc->sa.pag, agino);
+ if (!ip)
+ return 0;
+
+ return ip->i_next_unlinked;
+}
+
+/*
+ * Load the inode @agino into memory, set its i_prev_unlinked, and drop the
+ * inode so it can be inactivated. Returns NULLAGINO if we're at the end of
+ * the chain or if we should stop walking the chain due to corruption; or a
+ * per-AG inode number.
+ */
+STATIC xfs_agino_t
+xrep_iunlink_reload_next(
+ struct xrep_agi *ragi,
+ xfs_agino_t prev_agino,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_inode *ip;
+ xfs_ino_t ino;
+ xfs_agino_t ret = NULLAGINO;
+ int error;
+
+ ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino);
+ error = xchk_iget(ragi->sc, ino, &ip);
+ if (error)
+ return ret;
+
+ trace_xrep_iunlink_reload_next(ip, prev_agino);
+
+ /* If this is a linked inode, stop processing the chain. */
+ if (VFS_I(ip)->i_nlink != 0) {
+ xrep_iunlink_store_next(ragi, agino, NULLAGINO);
+ goto rele;
+ }
+
+ ip->i_prev_unlinked = prev_agino;
+ ret = ip->i_next_unlinked;
+
+ /*
+ * Drop the inode reference that we just took. We hold the AGI, so
+ * this inode cannot move off the unlinked list and hence cannot be
+ * reclaimed.
+ */
+rele:
+ xchk_irele(sc, ip);
+ return ret;
+}
+
+/*
+ * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that
+ * still existed at mount time. This can happen if iunlink processing fails
+ * during log recovery.
+ */
+STATIC int
+xrep_iunlink_walk_ondisk_bucket(
+ struct xrep_agi *ragi,
+ unsigned int bucket)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agino_t prev_agino = NULLAGINO;
+ xfs_agino_t next_agino;
+ int error = 0;
+
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (next_agino != NULLAGINO) {
+ xfs_agino_t agino = next_agino;
+
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket,
+ prev_agino, agino);
+
+ if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS)
+ break;
+
+ next_agino = xrep_iunlink_next(sc, agino);
+ if (!next_agino)
+ next_agino = xrep_iunlink_reload_next(ragi, prev_agino,
+ agino);
+
+ prev_agino = agino;
+ }
+
+ return 0;
+}
+
+/* Decide if this is an unlinked inode in this AG. */
+STATIC bool
+xrep_iunlink_igrab(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ return false;
+
+ if (!xfs_inode_on_unlinked_list(ip))
+ return false;
+
+ return true;
+}
+
+/*
+ * Mark the given inode in the lookup batch in our unlinked inode bitmap, and
+ * remember if this inode is the start of the unlinked chain.
+ */
+STATIC int
+xrep_iunlink_visit(
+ struct xrep_agi *ragi,
+ unsigned int batch_idx)
+{
+ struct xfs_mount *mp = ragi->sc->mp;
+ struct xfs_inode *ip = ragi->lookup_batch[batch_idx];
+ xfs_agino_t agino;
+ unsigned int bucket;
+ int error;
+
+ ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno);
+ ASSERT(xfs_inode_on_unlinked_list(ip));
+
+ agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+
+ trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket,
+ ragi->iunlink_heads[bucket], ip);
+
+ error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
+ if (error)
+ return error;
+
+ if (ip->i_prev_unlinked == NULLAGINO) {
+ if (ragi->iunlink_heads[bucket] == NULLAGINO)
+ ragi->iunlink_heads[bucket] = agino;
+ }
+
+ return 0;
+}
+
+/*
+ * Find all incore unlinked inodes so that we can rebuild the unlinked buckets.
+ * We hold the AGI so there should not be any modifications to the unlinked
+ * list.
+ */
+STATIC int
+xrep_iunlink_mark_incore(
+ struct xrep_agi *ragi)
+{
+ struct xfs_perag *pag = ragi->sc->sa.pag;
+ struct xfs_mount *mp = pag->pag_mount;
+ uint32_t first_index = 0;
+ bool done = false;
+ unsigned int nr_found = 0;
+
+ do {
+ unsigned int i;
+ int error = 0;
+
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ rcu_read_lock();
+
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)&ragi->lookup_batch, first_index,
+ XREP_AGI_LOOKUP_BATCH);
+ if (!nr_found) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = ragi->lookup_batch[i];
+
+ if (done || !xrep_iunlink_igrab(pag, ip))
+ ragi->lookup_batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that led
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = true;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!ragi->lookup_batch[i])
+ continue;
+ error = xrep_iunlink_visit(ragi, i);
+ if (error)
+ return error;
+ }
+ } while (!done);
+
+ return 0;
+}
+
+/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */
+STATIC int
+xrep_iunlink_mark_ondisk_rec(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xrep_agi *ragi = priv;
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agino_t agino;
+ unsigned int i;
+ int error = 0;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INODES_PER_CHUNK;
+ i++, agino++) {
+ struct xfs_inode *ip;
+ unsigned int len = 1;
+
+ /* Skip free inodes */
+ if (XFS_INOBT_MASK(i) & irec.ir_free)
+ continue;
+ /* Skip inodes we've seen before */
+ if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len))
+ continue;
+
+ /*
+ * Skip incore inodes; these were already picked up by
+ * the _mark_incore step.
+ */
+ rcu_read_lock();
+ ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino);
+ rcu_read_unlock();
+ if (ip)
+ continue;
+
+ /*
+ * Try to look up this inode. If we can't get it, just move
+ * on because we haven't actually scrubbed the inobt or the
+ * inodes yet.
+ */
+ error = xchk_iget(ragi->sc,
+ XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno,
+ agino),
+ &ip);
+ if (error)
+ continue;
+
+ trace_xrep_iunlink_reload_ondisk(ip);
+
+ if (VFS_I(ip)->i_nlink == 0)
+ error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Find ondisk inodes that are unlinked and not in cache, and mark them in
+ * iunlink_bmp. We haven't checked the inobt yet, so we don't error out if
+ * the btree is corrupt.
+ */
+STATIC void
+xrep_iunlink_mark_ondisk(
+ struct xrep_agi *ragi)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
+ error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
+ xfs_btree_del_cursor(cur, error);
+}
+
+/*
+ * Walk an iunlink bucket's inode list. For each inode that should be on this
+ * chain, clear its entry in iunlink_bmp because it's ok and we don't need
+ * to touch it further.
+ */
+STATIC int
+xrep_iunlink_resolve_bucket(
+ struct xrep_agi *ragi,
+ unsigned int bucket)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_inode *ip;
+ xfs_agino_t prev_agino = NULLAGINO;
+ xfs_agino_t next_agino = ragi->iunlink_heads[bucket];
+ int error = 0;
+
+ while (next_agino != NULLAGINO) {
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ /* Find the next inode in the chain. */
+ ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
+ if (!ip) {
+ /* Inode not incore? Terminate the chain. */
+ trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+
+ next_agino = NULLAGINO;
+ break;
+ }
+
+ if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
+ /*
+ * Inode is in the wrong bucket. Advance the list,
+ * but pretend we didn't see this inode.
+ */
+ trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+
+ next_agino = ip->i_next_unlinked;
+ continue;
+ }
+
+ if (!xfs_inode_on_unlinked_list(ip)) {
+ /*
+ * Incore inode doesn't think this inode is on an
+ * unlinked list. This is probably because we reloaded
+ * it from disk. Advance the list, but pretend we
+ * didn't see this inode; we'll fix that later.
+ */
+ trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+ next_agino = ip->i_next_unlinked;
+ continue;
+ }
+
+ trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
+ next_agino);
+
+ /*
+ * Otherwise, this inode's unlinked pointers are ok. Clear it
+ * from the unlinked bitmap since we're done with it, and make
+ * sure the chain is still correct.
+ */
+ error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
+ if (error)
+ return error;
+
+ /* Remember the previous inode's next pointer. */
+ if (prev_agino != NULLAGINO) {
+ error = xrep_iunlink_store_next(ragi, prev_agino,
+ next_agino);
+ if (error)
+ return error;
+ }
+
+ /* Remember this inode's previous pointer. */
+ error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
+ if (error)
+ return error;
+
+ /* Advance the list and remember this inode. */
+ prev_agino = next_agino;
+ next_agino = ip->i_next_unlinked;
+ }
+
+ /* Update the previous inode's next pointer. */
+ if (prev_agino != NULLAGINO) {
+ error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Reinsert this unlinked inode into the head of the staged bucket list. */
+STATIC int
+xrep_iunlink_add_to_bucket(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino)
+{
+ xfs_agino_t current_head;
+ unsigned int bucket;
+ int error;
+
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+
+ /* Point this inode at the current head of the bucket list. */
+ current_head = ragi->iunlink_heads[bucket];
+
+ trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
+ current_head);
+
+ error = xrep_iunlink_store_next(ragi, agino, current_head);
+ if (error)
+ return error;
+
+ /* Remember the head inode's previous pointer. */
+ if (current_head != NULLAGINO) {
+ error = xrep_iunlink_store_prev(ragi, current_head, agino);
+ if (error)
+ return error;
+ }
+
+ ragi->iunlink_heads[bucket] = agino;
+ return 0;
+}
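As a worked illustration (the numbers are hypothetical): if bucket 3 currently heads at agino 67 and lost inode 131 is reinserted, the staging records next[131] = 67 and prev[67] = 131, and iunlink_heads[3] becomes 131. The ondisk AGI and the incore pointers are only updated later, in the commit step.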
+
+/* Reinsert unlinked inodes into the staged iunlink buckets. */
+STATIC int
+xrep_iunlink_add_lost_inodes(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_agi *ragi = priv;
+ int error;
+
+ for (; len > 0; start++, len--) {
+ error = xrep_iunlink_add_to_bucket(ragi, start);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Figure out the iunlink bucket values and find inodes that need to be
+ * reinserted into the list.
+ */
+STATIC int
+xrep_iunlink_rebuild_buckets(
+ struct xrep_agi *ragi)
+{
+ unsigned int i;
+ int error;
+
+ /*
+ * Walk the ondisk AGI unlinked list to find inodes that are on the
+ * list but aren't in memory. This can happen if a past log recovery
+ * tried to clear the iunlinked list but failed. Our scan rebuilds the
+ * unlinked list using incore inodes, so we must load and link them
+ * properly.
+ */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Record all the incore unlinked inodes in iunlink_bmp that we didn't
+ * find by walking the ondisk iunlink buckets. This shouldn't happen,
+ * but we can't risk forgetting an inode somewhere.
+ */
+ error = xrep_iunlink_mark_incore(ragi);
+ if (error)
+ return error;
+
+ /*
+	 * If there are ondisk inodes that are unlinked and have not been loaded
+ * into cache, record them in iunlink_bmp.
+ */
+ xrep_iunlink_mark_ondisk(ragi);
+
+ /*
+ * Walk each iunlink bucket to (re)construct as much of the incore list
+ * as would be correct. For each inode that survives this step, mark
+ * it clear in iunlink_bmp; we're done with those inodes.
+ */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ error = xrep_iunlink_resolve_bucket(ragi, i);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Any unlinked inodes that we didn't find through the bucket list
+	 * walk (or were ignored by the walk) must be inserted into the bucket
+ * list. Stage this in memory for now.
+ */
+ return xagino_bitmap_walk(&ragi->iunlink_bmp,
+ xrep_iunlink_add_lost_inodes, ragi);
+}
+
+/* Update i_next_unlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_next(
+ struct xrep_agi *ragi,
+ xfarray_idx_t idx,
+ xfs_agino_t next_agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_inode *ip;
+ xfarray_idx_t agino = idx - 1;
+ bool want_rele = false;
+ int error = 0;
+
+ ip = xfs_iunlink_lookup(pag, agino);
+ if (!ip) {
+ xfs_ino_t ino;
+ xfs_agino_t prev_agino;
+
+ /*
+ * No inode exists in cache. Load it off the disk so that we
+ * can reinsert it into the incore unlinked list.
+ */
+ ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ error = xchk_iget(sc, ino, &ip);
+ if (error)
+ return -EFSCORRUPTED;
+
+ want_rele = true;
+
+ /* Set the backward pointer since this just came off disk. */
+ error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_prev(ip, prev_agino);
+ ip->i_prev_unlinked = prev_agino;
+ }
+
+ /* Update the forward pointer. */
+ if (ip->i_next_unlinked != next_agino) {
+ error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_next(ip, next_agino);
+ ip->i_next_unlinked = next_agino;
+ }
+
+out_rele:
+ /*
+ * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+ * and the inode cannot be reclaimed. However, if we used iget to load
+ * a missing inode, we must irele it here.
+ */
+ if (want_rele)
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Update i_prev_unlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_prev(
+ struct xrep_agi *ragi,
+ xfarray_idx_t idx,
+ xfs_agino_t prev_agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_inode *ip;
+ xfarray_idx_t agino = idx - 1;
+ bool want_rele = false;
+ int error = 0;
+
+ ASSERT(prev_agino != 0);
+
+ ip = xfs_iunlink_lookup(pag, agino);
+ if (!ip) {
+ xfs_ino_t ino;
+ xfs_agino_t next_agino;
+
+ /*
+ * No inode exists in cache. Load it off the disk so that we
+ * can reinsert it into the incore unlinked list.
+ */
+ ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ error = xchk_iget(sc, ino, &ip);
+ if (error)
+ return -EFSCORRUPTED;
+
+ want_rele = true;
+
+ /* Set the forward pointer since this just came off disk. */
+		error = xfarray_load(ragi->iunlink_next, agino, &next_agino);
+ if (error)
+ goto out_rele;
+
+ error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_next(ip, next_agino);
+ ip->i_next_unlinked = next_agino;
+ }
+
+ /* Update the backward pointer. */
+ if (ip->i_prev_unlinked != prev_agino) {
+ trace_xrep_iunlink_relink_prev(ip, prev_agino);
+ ip->i_prev_unlinked = prev_agino;
+ }
+
+out_rele:
+ /*
+ * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+ * and the inode cannot be reclaimed. However, if we used iget to load
+ * a missing inode, we must irele it here.
+ */
+ if (want_rele)
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Log all the iunlink updates we need to finish regenerating the AGI. */
+STATIC int
+xrep_iunlink_commit(
+ struct xrep_agi *ragi)
+{
+ struct xfs_agi *agi = ragi->agi_bp->b_addr;
+ xfarray_idx_t idx = XFARRAY_CURSOR_INIT;
+ xfs_agino_t agino;
+ unsigned int i;
+ int error;
+
+ /* Fix all the forward links */
+ while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
+ error = xrep_iunlink_relink_next(ragi, idx, agino);
+ if (error)
+ return error;
+ }
+
+ /* Fix all the back links */
+ idx = XFARRAY_CURSOR_INIT;
+ while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
+ error = xrep_iunlink_relink_prev(ragi, idx, agino);
+ if (error)
+ return error;
+ }
+
+ /* Copy the staged iunlink buckets to the new AGI. */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
+ be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
+ ragi->iunlink_heads[i]);
+
+ agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
+ }
+
+ return 0;
+}
+
/* Trigger reinitialization of the in-core data. */
STATIC int
xrep_agi_commit_new(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
struct xfs_perag *pag;
struct xfs_agi *agi = agi_bp->b_addr;
@@ -956,33 +1712,58 @@ xrep_agi_commit_new(
/* Repair the AGI. */
int
xrep_agi(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc)
{
- struct xrep_find_ag_btree fab[XREP_AGI_MAX] = {
- [XREP_AGI_INOBT] = {
- .rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
- },
- [XREP_AGI_FINOBT] = {
- .rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_finobt_buf_ops,
- .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
- },
- [XREP_AGI_END] = {
- .buf_ops = NULL
- },
- };
- struct xfs_agi old_agi;
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *agi_bp;
- struct xfs_agi *agi;
- int error;
+ struct xrep_agi *ragi;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ unsigned int i;
+ int error;
/* We require the rmapbt to rebuild anything. */
if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
+ sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+ ragi = sc->buf;
+ ragi->sc = sc;
+
+ ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ };
+ ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ };
+ ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){
+ .buf_ops = NULL,
+ };
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+ ragi->iunlink_heads[i] = NULLAGINO;
+
+ xagino_bitmap_init(&ragi->iunlink_bmp);
+ sc->buf_cleanup = xrep_agi_buf_cleanup;
+
+ descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers");
+ error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
+ &ragi->iunlink_next);
+ kfree(descr);
+ if (error)
+ return error;
+
+ descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers");
+ error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
+ &ragi->iunlink_prev);
+ kfree(descr);
+ if (error)
+ return error;
+
/*
* Make sure we have the AGI buffer, as scrub might have decided it
* was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
@@ -990,14 +1771,17 @@ xrep_agi(
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL);
+ XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
if (error)
return error;
- agi_bp->b_ops = &xfs_agi_buf_ops;
- agi = agi_bp->b_addr;
+ ragi->agi_bp->b_ops = &xfs_agi_buf_ops;
/* Find the AGI btree roots. */
- error = xrep_agi_find_btrees(sc, fab);
+ error = xrep_agi_find_btrees(ragi);
+ if (error)
+ return error;
+
+ error = xrep_iunlink_rebuild_buckets(ragi);
if (error)
return error;
@@ -1006,18 +1790,21 @@ xrep_agi(
return error;
/* Start rewriting the header and implant the btrees we found. */
- xrep_agi_init_header(sc, agi_bp, &old_agi);
- xrep_agi_set_roots(sc, agi, fab);
- error = xrep_agi_calc_from_btrees(sc, agi_bp);
+ xrep_agi_init_header(ragi);
+ xrep_agi_set_roots(ragi);
+ error = xrep_agi_calc_from_btrees(ragi);
+ if (error)
+ goto out_revert;
+ error = xrep_iunlink_commit(ragi);
if (error)
goto out_revert;
/* Reinitialize in-core state. */
- return xrep_agi_commit_new(sc, agi_bp);
+ return xrep_agi_commit_new(ragi);
out_revert:
/* Mark the incore AGI state stale and revert the AGI. */
clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
- memcpy(agi, &old_agi, sizeof(old_agi));
+ memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
return error;
}
diff --git a/fs/xfs/scrub/agino_bitmap.h b/fs/xfs/scrub/agino_bitmap.h
new file mode 100644
index 000000000000..56d7db5f1699
--- /dev/null
+++ b/fs/xfs/scrub/agino_bitmap.h
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_AGINO_BITMAP_H__
+#define __XFS_SCRUB_AGINO_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_agino_t */
+
+struct xagino_bitmap {
+ struct xbitmap32 aginobitmap;
+};
+
+static inline void xagino_bitmap_init(struct xagino_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->aginobitmap);
+}
+
+static inline void xagino_bitmap_destroy(struct xagino_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->aginobitmap);
+}
+
+static inline int xagino_bitmap_clear(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int len)
+{
+ return xbitmap32_clear(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_set(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int len)
+{
+ return xbitmap32_set(&bitmap->aginobitmap, agino, len);
+}
+
+static inline bool xagino_bitmap_test(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int *len)
+{
+ return xbitmap32_test(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_walk(struct xagino_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->aginobitmap, fn, priv);
+}
+
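+/*
+ * Minimal usage sketch (hypothetical caller; record_agino_fn and priv are
+ * stand-ins, error handling elided):
+ *
+ * struct xagino_bitmap bmp;
+ * unsigned int len;
+ *
+ * xagino_bitmap_init(&bmp);
+ * xagino_bitmap_set(&bmp, agino, 1);
+ * if (xagino_bitmap_test(&bmp, agino, &len))
+ * xagino_bitmap_walk(&bmp, record_agino_fn, priv);
+ * xagino_bitmap_destroy(&bmp);
+ *
+ * The wrappers add no behavior over xbitmap32; they only keep callers from
+ * mixing xfs_agino_t keys into bitmaps of other 32-bit quantities.
+ */
+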
+#endif /* __XFS_SCRUB_AGINO_BITMAP_H__ */
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index d421b253923e..30295898cc8a 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -778,7 +778,7 @@ xrep_abt_build_new_trees(
error = xrep_bnobt_sort_records(ra);
if (error)
- return error;
+ goto err_levels;
/* Load the free space by block number tree. */
ra->array_cur = XFARRAY_CURSOR_INIT;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 83c7feb38714..708334f9b2bd 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -10,16 +10,20 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr_sf.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/attr.h"
+#include "scrub/listxattr.h"
+#include "scrub/repair.h"
/* Free the buffers linked from the xattr buffer. */
static void
@@ -35,6 +39,8 @@ xchk_xattr_buf_cleanup(
kvfree(ab->value);
ab->value = NULL;
ab->value_sz = 0;
+ kvfree(ab->name);
+ ab->name = NULL;
}
/*
@@ -65,7 +71,7 @@ xchk_xattr_want_freemap(
* reallocating the buffer if necessary. Buffer contents are not preserved
* across a reallocation.
*/
-static int
+int
xchk_setup_xattr_buf(
struct xfs_scrub *sc,
size_t value_size)
@@ -95,6 +101,12 @@ xchk_setup_xattr_buf(
return -ENOMEM;
}
+ if (xchk_could_repair(sc)) {
+ ab->name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS);
+ if (!ab->name)
+ return -ENOMEM;
+ }
+
resize_value:
if (ab->value_sz >= value_size)
return 0;
@@ -121,6 +133,12 @@ xchk_setup_xattr(
{
int error;
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_xattr(sc);
+ if (error)
+ return error;
+ }
+
/*
* We failed to get memory while checking attrs, so this time try to
* get all the memory we're ever going to need. Allocate the buffer
@@ -137,106 +155,105 @@ xchk_setup_xattr(
/* Extended Attributes */
-struct xchk_xattr {
- struct xfs_attr_list_context context;
- struct xfs_scrub *sc;
-};
-
/*
* Check that an extended attribute key can be looked up by hash.
*
- * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked)
- * to call this function for every attribute key in an inode. Once
- * we're here, we load the attribute value to see if any errors happen,
- * or if we get more or less data than we expected.
+ * We use the extended attribute walk helper to call this function for every
+ * attribute key in an inode. Once we're here, we load the attribute value to
+ * see if any errors happen, or if we get more or less data than we expected.
*/
-static void
-xchk_xattr_listent(
- struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
+static int
+xchk_xattr_actor(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
{
struct xfs_da_args args = {
- .op_flags = XFS_DA_OP_NOTIME,
- .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK,
- .geo = context->dp->i_mount->m_attr_geo,
+ .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .geo = sc->mp->m_attr_geo,
.whichfork = XFS_ATTR_FORK,
- .dp = context->dp,
+ .dp = ip,
.name = name,
.namelen = namelen,
- .hashval = xfs_da_hashname(name, namelen),
- .trans = context->tp,
+ .trans = sc->tp,
.valuelen = valuelen,
+ .owner = ip->i_ino,
};
struct xchk_xattr_buf *ab;
- struct xchk_xattr *sx;
int error = 0;
- sx = container_of(context, struct xchk_xattr, context);
- ab = sx->sc->buf;
+ ab = sc->buf;
- if (xchk_should_terminate(sx->sc, &error)) {
- context->seen_enough = error;
- return;
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (attr_flags & ~XFS_ATTR_ONDISK_MASK) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
- if (flags & XFS_ATTR_INCOMPLETE) {
+ if (attr_flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
- xchk_ino_set_preen(sx->sc, context->dp->i_ino);
- return;
+ xchk_ino_set_preen(sc, ip->i_ino);
+ return 0;
}
- /* Only one namespace bit allowed. */
- if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) {
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
- goto fail_xref;
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(attr_flags, name, namelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
- /* Does this name make sense? */
- if (!xfs_attr_namecheck(name, namelen)) {
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
- goto fail_xref;
+ /* Check parent pointer record. */
+ if ((attr_flags & XFS_ATTR_PARENT) &&
+ !xfs_parent_valuecheck(sc->mp, value, valuelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
/*
- * Local xattr values are stored in the attr leaf block, so we don't
- * need to retrieve the value from a remote block to detect corruption
- * problems.
+ * Try to allocate enough memory to extract the attr value. If that
+ * doesn't work, return -EDEADLOCK as a signal to try again with a
+ * maximally sized buffer.
*/
- if (flags & XFS_ATTR_LOCAL)
- goto fail_xref;
+ error = xchk_setup_xattr_buf(sc, valuelen);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ return error;
/*
- * Try to allocate enough memory to extrat the attr value. If that
- * doesn't work, we overload the seen_enough variable to convey
- * the error message back to the main scrub function.
+ * Parent pointers are matched on attr name and value, so we must
+ * supply the xfs_parent_rec here when confirming that the dabtree
+ * indexing works correctly.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen);
- if (error == -ENOMEM)
- error = -EDEADLOCK;
- if (error) {
- context->seen_enough = error;
- return;
- }
+ if (attr_flags & XFS_ATTR_PARENT)
+ memcpy(ab->value, value, valuelen);
args.value = ab->value;
+ /*
+ * Get the attr value to ensure that lookup can find this attribute
+ * through the dabtree indexing and that remote value retrieval also
+ * works correctly.
+ */
+ xfs_attr_sethash(&args);
error = xfs_attr_get_ilocked(&args);
/* ENODATA means the hash lookup failed and the attr is bad */
if (error == -ENODATA)
error = -EFSCORRUPTED;
- if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
+ if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, args.blkno,
&error))
- goto fail_xref;
+ return error;
if (args.valuelen != valuelen)
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
- args.blkno);
-fail_xref:
- if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- context->seen_enough = 1;
- return;
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+
+ return 0;
}
/*
@@ -246,7 +263,7 @@ fail_xref:
* Within a char, the lowest bit of the char represents the byte with
* the smallest address
*/
-STATIC bool
+bool
xchk_xattr_set_map(
struct xfs_scrub *sc,
unsigned long *map,
@@ -403,6 +420,17 @@ xchk_xattr_block(
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+ /*
+ * Empty xattr leaf blocks mapped at block 0 are probably a byproduct
+ * of a race between setxattr and a log shutdown. Anywhere else in the
+ * attr fork is a corruption.
+ */
+ if (leafhdr.count == 0) {
+ if (blk->blkno == 0)
+ xchk_da_set_preen(ds, level);
+ else
+ xchk_da_set_corrupt(ds, level);
+ }
if (leafhdr.usedbytes > mp->m_attr_geo->blksize)
xchk_da_set_corrupt(ds, level);
if (leafhdr.firstused > mp->m_attr_geo->blksize)
@@ -411,6 +439,8 @@ xchk_xattr_block(
xchk_da_set_corrupt(ds, level);
if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize))
xchk_da_set_corrupt(ds, level);
+ if (leafhdr.holes)
+ xchk_da_set_preen(ds, level);
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
@@ -463,7 +493,6 @@ xchk_xattr_rec(
xfs_dahash_t hash;
int nameidx;
int hdrsize;
- unsigned int badflags;
int error;
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -493,10 +522,15 @@ xchk_xattr_rec(
/* Retrieve the entry and check it. */
hash = be32_to_cpu(ent->hashval);
- badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
- XFS_ATTR_INCOMPLETE);
- if ((ent->flags & badflags) != 0)
+ if (ent->flags & ~XFS_ATTR_ONDISK_MASK) {
+ xchk_da_set_corrupt(ds, level);
+ return 0;
+ }
+ if (!xfs_attr_check_namespace(ent->flags)) {
xchk_da_set_corrupt(ds, level);
+ return 0;
+ }
+
if (ent->flags & XFS_ATTR_LOCAL) {
lentry = (struct xfs_attr_leaf_name_local *)
(((char *)bp->b_addr) + nameidx);
@@ -504,7 +538,10 @@ xchk_xattr_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen);
+ calc_hash = xfs_attr_hashval(mp, ent->flags, lentry->nameval,
+ lentry->namelen,
+ lentry->nameval + lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
} else {
rentry = (struct xfs_attr_leaf_name_remote *)
(((char *)bp->b_addr) + nameidx);
@@ -512,7 +549,13 @@ xchk_xattr_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- calc_hash = xfs_da_hashname(rentry->name, rentry->namelen);
+ if (ent->flags & XFS_ATTR_PARENT) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_attr_hashval(mp, ent->flags, rentry->name,
+ rentry->namelen, NULL,
+ be32_to_cpu(rentry->valuelen));
}
if (calc_hash != hash)
xchk_da_set_corrupt(ds, level);
@@ -556,6 +599,15 @@ xchk_xattr_check_sf(
break;
}
+ /*
+ * Shortform entries do not set LOCAL or INCOMPLETE, so the
+ * only valid flag bits here are for namespaces.
+ */
+ if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ break;
+ }
+
if (!xchk_xattr_set_map(sc, ab->usedmap,
(char *)sfe - (char *)sf,
sizeof(struct xfs_attr_sf_entry))) {
@@ -588,16 +640,6 @@ int
xchk_xattr(
struct xfs_scrub *sc)
{
- struct xchk_xattr sx = {
- .sc = sc,
- .context = {
- .dp = sc->ip,
- .tp = sc->tp,
- .resynch = 1,
- .put_listent = xchk_xattr_listent,
- .allow_incomplete = true,
- },
- };
xfs_dablk_t last_checked = -1U;
int error = 0;
@@ -626,12 +668,6 @@ xchk_xattr(
/*
* Look up every xattr in this file by name and hash.
*
- * Use the backend implementation of xfs_attr_list to call
- * xchk_xattr_listent on every attribute key in this inode.
- * In other words, we use the same iterator/callback mechanism
- * that listattr uses to scrub extended attributes, though in our
- * _listent function, we check the value of the attribute.
- *
* The VFS only locks i_rwsem when modifying attrs, so keep all
* three locks held because that's the only way to ensure we're
* the only thread poking into the da btree. We traverse the da
@@ -639,13 +675,9 @@ xchk_xattr(
* iteration, which doesn't really follow the usual buffer
* locking order.
*/
- error = xfs_attr_list_ilocked(&sx.context);
+ error = xchk_xattr_walk(sc, sc->ip, xchk_xattr_actor, NULL, NULL);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
return error;
- /* Did our listent function try to return any errors? */
- if (sx.context.seen_enough < 0)
- return sx.context.seen_enough;
-
return 0;
}
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 48fd9402c432..7db58af56646 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -16,9 +16,16 @@ struct xchk_xattr_buf {
/* Bitmap of free space in xattr leaf blocks. */
unsigned long *freemap;
+ /* Memory buffer used to hold salvaged xattr names. */
+ unsigned char *name;
+
/* Memory buffer used to extract xattr values. */
void *value;
size_t value_sz;
};
+bool xchk_xattr_set_map(struct xfs_scrub *sc, unsigned long *map,
+ unsigned int start, unsigned int len);
+int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size);
+
#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
new file mode 100644
index 000000000000..c7eb94069caf
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.c
@@ -0,0 +1,1663 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_acl.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr.h"
+#include "scrub/reap.h"
+#include "scrub/attr_repair.h"
+
+/*
+ * Extended Attribute Repair
+ * =========================
+ *
+ * We repair extended attributes by reading the attr leaf blocks looking for
+ * attributes entries that look salvageable (name passes verifiers, value can
+ * be retrieved, etc). Each extended attribute worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * file to constrain memory use. Batching the construction of the temporary
+ * extended attribute structure in this fashion reduces lock cycling of the
+ * file being repaired and the temporary file.
+ *
+ * When salvaging completes, the remaining stashed attributes are replayed to
+ * the temporary file. An atomic file contents exchange is used to commit the
+ * new xattr blocks to the file being repaired. This will disrupt attrmulti
+ * cursors.
+ */
+
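+/*
+ * Rough shape of that pipeline, as a sketch rather than the exact call
+ * graph:
+ *
+ * for each attr block in the broken fork:
+ * stash salvageable keys in xattr_records, names/values in xattr_blobs
+ * if the stash exceeds XREP_XATTR_MAX_STASH_BYTES, replay it into
+ * the tempfile via xfs_attr_set
+ * replay whatever remains stashed into the tempfile
+ * exchange the attr forks of the tempfile and the broken file
+ * reap the old attr blocks, which now belong to the tempfile
+ */
+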
+struct xrep_xattr_key {
+ /* Cookie for retrieval of the xattr name. */
+ xfblob_cookie name_cookie;
+
+ /* Cookie for retrieval of the xattr value. */
+ xfblob_cookie value_cookie;
+
+ /* XFS_ATTR_* flags */
+ int flags;
+
+ /* Length of the value and name. */
+ uint32_t valuelen;
+ uint16_t namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
+ */
+#define XREP_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_xattr {
+ struct xfs_scrub *sc;
+
+ /* Information for exchanging attr fork mappings at the end. */
+ struct xrep_tempexch tx;
+
+ /* xattr keys */
+ struct xfarray *xattr_records;
+
+ /* xattr values */
+ struct xfblob *xattr_blobs;
+
+ /* Number of attributes that we are salvaging. */
+ unsigned long long attrs_found;
+
+ /* Can we flush stashed attrs to the tempfile? */
+ bool can_flush;
+
+ /* Did the live update fail, and hence the repair is now out of date? */
+ bool live_update_aborted;
+
+ /* Lock protecting parent pointer updates */
+ struct mutex lock;
+
+ /* Fixed-size array of xrep_xattr_pptr structures. */
+ struct xfarray *pptr_recs;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* Hook to capture parent pointer updates. */
+ struct xfs_dir_hook dhook;
+
+ /* Scratch buffer for capturing parent pointers. */
+ struct xfs_da_args pptr_args;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+};
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_XATTR_PPTR_ADD (1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_XATTR_PPTR_REMOVE (2)
+
+/* A stashed parent pointer update. */
+struct xrep_xattr_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+
+ /* XREP_XATTR_PPTR_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/* Set up to recreate the extended attributes. */
+int
+xrep_setup_xattr(
+ struct xfs_scrub *sc)
+{
+ if (xfs_has_parent(sc->mp))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ return xrep_tempfile_create(sc, S_IFREG);
+}
+
+/*
+ * Decide if we want to salvage this attribute. We don't bother with
+ * incomplete or oversized keys or values. The @value parameter can be null
+ * for remote attrs.
+ */
+STATIC bool
+xrep_xattr_want_salvage(
+ struct xrep_xattr *rx,
+ unsigned int attr_flags,
+ const void *name,
+ int namelen,
+ const void *value,
+ int valuelen)
+{
+ if (attr_flags & XFS_ATTR_INCOMPLETE)
+ return false;
+ if (namelen > XATTR_NAME_MAX || namelen <= 0)
+ return false;
+ if (!xfs_attr_namecheck(attr_flags, name, namelen))
+ return false;
+ if (valuelen > XATTR_SIZE_MAX || valuelen < 0)
+ return false;
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_parent_valuecheck(rx->sc->mp, value, valuelen);
+
+ return true;
+}
+
+/* Allocate an in-core record to hold xattrs while we rebuild the xattr data. */
+STATIC int
+xrep_xattr_salvage_key(
+ struct xrep_xattr *rx,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ unsigned char *value,
+ int valuelen)
+{
+ struct xrep_xattr_key key = {
+ .valuelen = valuelen,
+ .flags = flags & XFS_ATTR_NSP_ONDISK_MASK,
+ };
+ unsigned int i = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(rx->sc, &error))
+ return error;
+
+ /*
+ * Truncate the name to the first character that would trip namecheck.
+ * If we no longer have a name after that, ignore this attribute.
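+ * (E.g., raw name bytes "motd\0junk" with namelen 9 salvage as "motd"
+ * with key.namelen 4, while a name whose first byte is NUL is skipped.)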
+ */
+ if (flags & XFS_ATTR_PARENT) {
+ key.namelen = namelen;
+
+ trace_xrep_xattr_salvage_pptr(rx->sc->ip, flags, name,
+ key.namelen, value, valuelen);
+ } else {
+ while (i < namelen && name[i] != 0)
+ i++;
+ if (i == 0)
+ return 0;
+ key.namelen = i;
+
+ trace_xrep_xattr_salvage_rec(rx->sc->ip, flags, name,
+ key.namelen, valuelen);
+ }
+
+ error = xfblob_store(rx->xattr_blobs, &key.name_cookie, name,
+ key.namelen);
+ if (error)
+ return error;
+
+ error = xfblob_store(rx->xattr_blobs, &key.value_cookie, value,
+ key.valuelen);
+ if (error)
+ return error;
+
+ error = xfarray_append(rx->xattr_records, &key);
+ if (error)
+ return error;
+
+ rx->attrs_found++;
+ return 0;
+}
+
+/*
+ * Record a shortform extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_sf_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_sf_hdr *hdr,
+ struct xfs_attr_sf_entry *sfe)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xchk_xattr_buf *ab = sc->buf;
+ unsigned char *name = sfe->nameval;
+ unsigned char *value = &sfe->nameval[sfe->namelen];
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)name - (char *)hdr,
+ sfe->namelen))
+ return 0;
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)value - (char *)hdr,
+ sfe->valuelen))
+ return 0;
+
+ if (!xrep_xattr_want_salvage(rx, sfe->flags, sfe->nameval,
+ sfe->namelen, value, sfe->valuelen))
+ return 0;
+
+ return xrep_xattr_salvage_key(rx, sfe->flags, sfe->nameval,
+ sfe->namelen, value, sfe->valuelen);
+}
+
+/*
+ * Record a local format extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_local_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_leaf_entry *ent,
+ unsigned int nameidx,
+ const char *buf_end,
+ struct xfs_attr_leaf_name_local *lentry)
+{
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ unsigned char *value;
+ unsigned int valuelen;
+ unsigned int namesize;
+
+ /*
+ * Decode the leaf local entry format. If something seems wrong, we
+ * junk the attribute.
+ */
+ value = &lentry->nameval[lentry->namelen];
+ valuelen = be16_to_cpu(lentry->valuelen);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen, valuelen);
+ if ((char *)lentry + namesize > buf_end)
+ return 0;
+ if (!xrep_xattr_want_salvage(rx, ent->flags, lentry->nameval,
+ lentry->namelen, value, valuelen))
+ return 0;
+ if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize))
+ return 0;
+
+ /* Try to save this attribute. */
+ return xrep_xattr_salvage_key(rx, ent->flags, lentry->nameval,
+ lentry->namelen, value, valuelen);
+}
+
+/*
+ * Record a remote format extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_remote_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_leaf_entry *ent,
+ unsigned int nameidx,
+ const char *buf_end,
+ struct xfs_attr_leaf_name_remote *rentry,
+ unsigned int ent_idx,
+ struct xfs_buf *leaf_bp)
+{
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ struct xfs_da_args args = {
+ .trans = rx->sc->tp,
+ .dp = rx->sc->ip,
+ .index = ent_idx,
+ .geo = rx->sc->mp->m_attr_geo,
+ .owner = rx->sc->ip->i_ino,
+ .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .namelen = rentry->namelen,
+ .name = rentry->name,
+ .value = ab->value,
+ .valuelen = be32_to_cpu(rentry->valuelen),
+ };
+ unsigned int namesize;
+ int error;
+
+ /*
+ * Decode the leaf remote entry format. If something seems wrong, we
+ * junk the attribute. Note that we should never find a zero-length
+ * remote attribute value.
+ */
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ if ((char *)rentry + namesize > buf_end)
+ return 0;
+ if (args.valuelen == 0 ||
+ !xrep_xattr_want_salvage(rx, ent->flags, rentry->name,
+ rentry->namelen, NULL, args.valuelen))
+ return 0;
+ if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize))
+ return 0;
+
+ /*
+ * Enlarge the buffer (if needed) to hold the value that we're trying
+ * to salvage from the old extended attribute data.
+ */
+ error = xchk_setup_xattr_buf(rx->sc, args.valuelen);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ return error;
+
+ /* Look up the remote value and stash it for reconstruction. */
+ error = xfs_attr3_leaf_getvalue(leaf_bp, &args);
+ if (error || args.rmtblkno == 0)
+ goto err_free;
+
+ error = xfs_attr_rmtval_get(&args);
+ if (error)
+ goto err_free;
+
+ /* Try to save this attribute. */
+ error = xrep_xattr_salvage_key(rx, ent->flags, rentry->name,
+ rentry->namelen, ab->value, args.valuelen);
+err_free:
+ /* remote value was garbage, junk it */
+ if (error == -EFSBADCRC || error == -EFSCORRUPTED)
+ error = 0;
+ return error;
+}
+
+/* Extract every xattr key that we can from this attr fork block. */
+STATIC int
+xrep_xattr_recover_leaf(
+ struct xrep_xattr *rx,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ struct xfs_attr_leaf_entry *ent;
+ struct xfs_attr_leaf_entry *entries;
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ char *buf_end;
+ size_t off;
+ unsigned int nameidx;
+ unsigned int hdrsize;
+ int i;
+ int error = 0;
+
+ bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize);
+
+ /* Check the leaf header */
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+ xchk_xattr_set_map(sc, ab->usedmap, 0, hdrsize);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Skip key if it conflicts with something else? */
+ off = (char *)ent - (char *)leaf;
+ if (!xchk_xattr_set_map(sc, ab->usedmap, off,
+ sizeof(xfs_attr_leaf_entry_t)))
+ continue;
+
+ /* Check the name information. */
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr.firstused ||
+ nameidx >= mp->m_attr_geo->blksize)
+ continue;
+
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, i);
+ error = xrep_xattr_salvage_local_attr(rx, ent, nameidx,
+ buf_end, lentry);
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, i);
+ error = xrep_xattr_salvage_remote_attr(rx, ent, nameidx,
+ buf_end, rentry, i, bp);
+ }
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Try to recover shortform attrs. */
+STATIC int
+xrep_xattr_recover_sf(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xchk_xattr_buf *ab = sc->buf;
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_attr_sf_entry *next;
+ struct xfs_ifork *ifp;
+ unsigned char *end;
+ int i;
+ int error = 0;
+
+ ifp = xfs_ifork_ptr(rx->sc->ip, XFS_ATTR_FORK);
+ hdr = ifp->if_data;
+
+ bitmap_zero(ab->usedmap, ifp->if_bytes);
+ end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+ xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*hdr));
+
+ sfe = xfs_attr_sf_firstentry(hdr);
+ if ((unsigned char *)sfe > end)
+ return 0;
+
+ for (i = 0; i < hdr->count; i++) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ next = xfs_attr_sf_nextentry(sfe);
+ if ((unsigned char *)next > end)
+ break;
+
+ if (xchk_xattr_set_map(sc, ab->usedmap,
+ (char *)sfe - (char *)hdr,
+ sizeof(struct xfs_attr_sf_entry))) {
+ /*
+ * No conflicts with the sf entry; let's save this
+ * attribute.
+ */
+ error = xrep_xattr_salvage_sf_attr(rx, hdr, sfe);
+ if (error)
+ return error;
+ }
+
+ sfe = next;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to return a buffer of xattr data for a given physical extent.
+ *
+ * Because the buffer cache get function complains if it finds a buffer
+ * matching the block number but not matching the length, we must be careful to
+ * look for incore buffers (up to the maximum length of a remote value) that
+ * could be hiding anywhere in the physical range. If we find an incore
+ * buffer, we can pass that to the caller. Optionally, read a single block and
+ * pass that back.
+ *
+ * Note the subtlety that remote attr value blocks for which there is no incore
+ * buffer will be returned to the caller one block at a time. These buffers
+ * will not have any ops attached and must be staled to prevent aliasing with
+ * multiblock buffers once we drop the ILOCK.
+ */
+STATIC int
+xrep_xattr_find_buf(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t max_len,
+ bool can_read,
+ struct xfs_buf **bpp)
+{
+ struct xrep_bufscan scan = {
+ .daddr = XFS_FSB_TO_DADDR(mp, fsbno),
+ .max_sectors = xrep_bufscan_max_sectors(mp, max_len),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
+ *bpp = bp;
+ return 0;
+ }
+
+ if (!can_read) {
+ *bpp = NULL;
+ return 0;
+ }
+
+ return xfs_buf_read(mp->m_ddev_targp, scan.daddr, XFS_FSB_TO_BB(mp, 1),
+ XBF_TRYLOCK, bpp, NULL);
+}
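+
+/*
+ * Example of the aliasing hazard, with made-up numbers: a 64k remote value
+ * cached as a single 16-block buffer at daddr D. Asking the cache for one
+ * block at D would find a buffer with a mismatched length and complain, so
+ * the bufscan above hunts for any live buffer in the range first and only
+ * then falls back to reading a single block.
+ */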
+
+/*
+ * Deal with a buffer that we found during our walk of the attr fork.
+ *
+ * Attribute leaf and node blocks are simple -- they're a single block, so we
+ * can walk them one at a time and we never have to worry about discontiguous
+ * multiblock buffers like we do for directories.
+ *
+ * Unfortunately, remote attr blocks add a lot of complexity here. Each disk
+ * block is totally self contained, in the sense that the v5 header provides no
+ * indication that there could be more data in the next block. The incore
+ * buffers can span multiple blocks, though they never cross extent records.
+ * However, they don't necessarily start or end on an extent record boundary.
+ * Therefore, we need a special buffer find function to walk the buffer cache
+ * for us.
+ *
+ * The caller must hold the ILOCK on the file being repaired. We use
+ * XBF_TRYLOCK here to skip any locked buffer on the assumption that we don't
+ * own the block and don't want to hang the system on a potentially garbage
+ * buffer.
+ */
+STATIC int
+xrep_xattr_recover_block(
+ struct xrep_xattr *rx,
+ xfs_dablk_t dabno,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t max_len,
+ xfs_extlen_t *actual_len)
+{
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xrep_xattr_find_buf(rx->sc->mp, fsbno, max_len, true, &bp);
+ if (error)
+ return error;
+ info = bp->b_addr;
+ *actual_len = XFS_BB_TO_FSB(rx->sc->mp, bp->b_length);
+
+ trace_xrep_xattr_recover_leafblock(rx->sc->ip, dabno,
+ be16_to_cpu(info->magic));
+
+ /*
+ * If the buffer has the right magic number for an attr leaf block and
+ * passes a structure check (we don't care about checksums), salvage
+ * as much as we can from the block.
+ */
+ if (info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) &&
+ xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops) &&
+ xfs_attr3_leaf_header_check(bp, rx->sc->ip->i_ino) == NULL)
+ error = xrep_xattr_recover_leaf(rx, bp);
+
+ /*
+ * If the buffer didn't already have buffer ops set, it was read in by
+ * the _find_buf function and could very well be /part/ of a multiblock
+ * remote block. Mark it stale so that it doesn't hang around in
+ * memory to cause problems.
+ */
+ if (bp->b_ops == NULL)
+ xfs_buf_stale(bp);
+
+ xfs_buf_relse(bp);
+ return error;
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_xattr_insert_rec(
+ struct xrep_xattr *rx,
+ const struct xrep_xattr_key *key)
+{
+ struct xfs_da_args args = {
+ .dp = rx->sc->tempip,
+ .attr_filter = key->flags,
+ .namelen = key->namelen,
+ .valuelen = key->valuelen,
+ .owner = rx->sc->ip->i_ino,
+ .geo = rx->sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ };
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ int error;
+
+ /*
+ * Grab pointers to the scrub buffer so that we can use them to insert
+ * attrs into the temp file.
+ */
+ args.name = ab->name;
+ args.value = ab->value;
+
+ /*
+ * The name buffer is allocated with XATTR_NAME_MAX + 1 bytes; keep the
+ * last byte zeroed so the loaded name is always null terminated.
+ */
+ ab->name[XATTR_NAME_MAX] = 0;
+
+ error = xfblob_load(rx->xattr_blobs, key->name_cookie, ab->name,
+ key->namelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rx->xattr_blobs, key->name_cookie);
+ if (error)
+ return error;
+
+ error = xfblob_load(rx->xattr_blobs, key->value_cookie, args.value,
+ key->valuelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rx->xattr_blobs, key->value_cookie);
+ if (error)
+ return error;
+
+ ab->name[key->namelen] = 0;
+
+ if (key->flags & XFS_ATTR_PARENT) {
+ trace_xrep_xattr_insert_pptr(rx->sc->tempip, key->flags,
+ ab->name, key->namelen, ab->value,
+ key->valuelen);
+ args.op_flags |= XFS_DA_OP_LOGGED;
+ } else {
+ trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags,
+ ab->name, key->namelen, key->valuelen);
+ }
+
+ /*
+ * xfs_attr_set creates and commits its own transaction. If the attr
+ * already exists, we'll just drop it during the rebuild.
+ */
+ xfs_attr_sethash(&args);
+ error = xfs_attr_set(&args, XFS_ATTRUPDATE_CREATE, false);
+ if (error == -EEXIST)
+ error = 0;
+
+ return error;
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file. This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_xattr_flush_stashed(
+ struct xrep_xattr *rx)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and a scrub transaction
+ * that we use during xattr salvaging to avoid livelocking if there
+ * are cycles in the xattr structures. We hold ILOCK_EXCL on both
+ * the inode being repaired and the temporary file, though neither is
+ * ijoined to the scrub transaction.
+ *
+ * To constrain kernel memory use, we occasionally flush salvaged
+ * xattrs from the xfarray and xfblob structures into the temporary
+ * file in preparation for exchanging the xattr structures at the end.
+ * Updating the temporary file requires a transaction, so we commit the
+ * scrub transaction and drop the two ILOCKs so that xfs_attr_set can
+ * allocate whatever transaction it wants.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from modifying the damaged xattr data while we
+ * repair it.
+ */
+ error = xrep_trans_commit(rx->sc);
+ if (error)
+ return error;
+ xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify xattrs. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rx->sc);
+ if (error)
+ return error;
+
+ /* Add all the salvaged attrs to the temporary file. */
+ foreach_xfarray_idx(rx->xattr_records, array_cur) {
+ struct xrep_xattr_key key;
+
+ error = xfarray_load(rx->xattr_records, array_cur, &key);
+ if (error)
+ return error;
+
+ error = xrep_xattr_insert_rec(rx, &key);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rx->xattr_records);
+ xfblob_truncate(rx->xattr_blobs);
+
+ xrep_tempfile_iounlock(rx->sc);
+
+ /* Recreate the salvage transaction and relock the inode. */
+ error = xchk_trans_alloc(rx->sc, 0);
+ if (error)
+ return error;
+ xchk_ilock(rx->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much xattr data in memory. */
+static inline bool
+xrep_xattr_want_flush_stashed(
+ struct xrep_xattr *rx)
+{
+ unsigned long long bytes;
+
+ if (!rx->can_flush)
+ return false;
+
+ bytes = xfarray_bytes(rx->xattr_records) +
+ xfblob_bytes(rx->xattr_blobs);
+ return bytes > XREP_XATTR_MAX_STASH_BYTES;
+}
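+
+/*
+ * With 4k pages the threshold above works out to 32k of stashed records
+ * and blobs; the exact figure only bounds memory use during the scan.
+ */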
+
+/*
+ * Did we observe rename changing parent pointer xattrs while we were flushing
+ * salvaged attrs?
+ */
+static inline bool
+xrep_xattr_saw_pptr_conflict(
+ struct xrep_xattr *rx)
+{
+ bool ret;
+
+ ASSERT(rx->can_flush);
+
+ if (!xfs_has_parent(rx->sc->mp))
+ return false;
+
+ xfs_assert_ilocked(rx->sc->ip, XFS_ILOCK_EXCL);
+
+ mutex_lock(&rx->lock);
+ ret = xfarray_bytes(rx->pptr_recs) > 0;
+ mutex_unlock(&rx->lock);
+
+ return ret;
+}
+
+/*
+ * Reset the entire repair state back to initial conditions, now that we've
+ * detected a parent pointer update to the attr structure while we were
+ * flushing salvaged attrs. See the locking notes in dir_repair.c for more
+ * information on why this is all necessary.
+ */
+STATIC int
+xrep_xattr_full_reset(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_ifork *ifp = &sc->tempip->i_af;
+ int error;
+
+ trace_xrep_xattr_full_reset(sc->ip, sc->tempip);
+
+ /* The temporary file's data fork had better not be in btree format. */
+ if (sc->tempip->i_df.if_format == XFS_DINODE_FMT_BTREE) {
+ ASSERT(0);
+ return -EIO;
+ }
+
+ /*
+ * We begin in transaction context with sc->ip ILOCKed but not joined
+ * to the transaction. To reset to the initial state, we must hold
+ * sc->ip's ILOCK to prevent rename from updating parent pointer
+ * information and the tempfile's ILOCK to clear its contents.
+ */
+ xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+ xrep_tempfile_ilock_both(sc);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /*
+ * Free all the blocks of the attr fork of the temp file, and reset
+ * it back to local format.
+ */
+ if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ ASSERT(ifp->if_bytes == 0);
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_idata_realloc(sc->tempip, sizeof(*hdr), XFS_ATTR_FORK);
+ }
+
+ /* Reinitialize the attr fork to an empty shortform structure. */
+ hdr = ifp->if_data;
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+ /*
+ * Roll this transaction to commit our reset ondisk. The tempfile
+ * should no longer be joined to the transaction, so we drop its ILOCK.
+ * This should leave us in transaction context with sc->ip ILOCKed but
+ * not joined to the transaction.
+ */
+ error = xrep_roll_trans(sc);
+ if (error)
+ return error;
+ xrep_tempfile_iunlock(sc);
+
+ /*
+ * Erase any accumulated parent pointer updates now that we've erased
+ * the tempfile's attr fork. We're resetting the entire repair state
+ * back to where we were initially, except now we won't flush salvaged
+ * xattrs until the very end.
+ */
+ mutex_lock(&rx->lock);
+ xfarray_truncate(rx->pptr_recs);
+ xfblob_truncate(rx->pptr_names);
+ mutex_unlock(&rx->lock);
+
+ rx->can_flush = false;
+ rx->attrs_found = 0;
+
+ ASSERT(xfarray_bytes(rx->xattr_records) == 0);
+ ASSERT(xfblob_bytes(rx->xattr_blobs) == 0);
+ return 0;
+}
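+
+/*
+ * Timeline sketch of the reset path, assuming a single racing rename: the
+ * flush drops the ILOCK so xfs_attr_set can run; rename slips in and the
+ * dirent hook stashes a parent pointer update; the scan sees a nonempty
+ * pptr_recs, wipes the tempfile, and restarts with can_flush clear so
+ * that salvaging completes before the next (and final) flush.
+ */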
+
+/* Extract as many attribute keys and values as we can. */
+STATIC int
+xrep_xattr_recover(
+ struct xrep_xattr *rx)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_da_geometry *geo = sc->mp->m_attr_geo;
+ xfs_fileoff_t offset;
+ xfs_extlen_t len;
+ xfs_dablk_t dabno;
+ int nmap;
+ int error;
+
+restart:
+ /*
+ * Iterate each xattr leaf block in the attr fork to scan them for any
+ * attributes that we might salvage.
+ */
+ for (offset = 0;
+ offset < XFS_MAX_FILEOFF;
+ offset = got.br_startoff + got.br_blockcount) {
+ nmap = 1;
+ error = xfs_bmapi_read(sc->ip, offset, XFS_MAX_FILEOFF - offset,
+ &got, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ if (nmap != 1)
+ return -EFSCORRUPTED;
+ if (!xfs_bmap_is_written_extent(&got))
+ continue;
+
+ for (dabno = round_up(got.br_startoff, geo->fsbcount);
+ dabno < got.br_startoff + got.br_blockcount;
+ dabno += len) {
+ xfs_fileoff_t curr_offset = dabno - got.br_startoff;
+ xfs_extlen_t maxlen;
+
+ if (xchk_should_terminate(rx->sc, &error))
+ return error;
+
+ maxlen = min_t(xfs_filblks_t, INT_MAX,
+ got.br_blockcount - curr_offset);
+ error = xrep_xattr_recover_block(rx, dabno,
+ curr_offset + got.br_startblock,
+ maxlen, &len);
+ if (error)
+ return error;
+
+ if (xrep_xattr_want_flush_stashed(rx)) {
+ error = xrep_xattr_flush_stashed(rx);
+ if (error)
+ return error;
+
+ if (xrep_xattr_saw_pptr_conflict(rx)) {
+ error = xrep_xattr_full_reset(rx);
+ if (error)
+ return error;
+
+ goto restart;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
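+
+/*
+ * Shape of the dabno loop above, assuming geo->fsbcount == 1 and a written
+ * mapping covering da blocks 3-7: the first probe lands on dabno 3; if a
+ * four-block incore remote buffer starts there, len comes back as 4 and
+ * the next probe happens at dabno 7.
+ */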
+
+/*
+ * Reset the extended attribute fork to a state where we can start re-adding
+ * the salvaged attributes.
+ */
+STATIC int
+xrep_xattr_fork_remove(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
+
+ /*
+ * If the data fork is in btree format, we can't change di_forkoff
+ * because we could run afoul of the rule that the data fork isn't
+ * supposed to be in btree format if there's enough space in the fork
+ * that it could have used extents format. Instead, reinitialize the
+ * attr fork to have a shortform structure with zero attributes.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) {
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ hdr = xfs_idata_realloc(ip, (int)sizeof(*hdr) - ifp->if_bytes,
+ XFS_ATTR_FORK);
+ hdr->count = 0;
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(sc->tp, ip,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ return 0;
+ }
+
+ /* If we still have attr fork extents, something's wrong. */
+ if (ifp->if_nextents != 0) {
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec irec;
+ unsigned int i = 0;
+
+ xfs_emerg(sc->mp,
+ "inode 0x%llx attr fork still has %llu attr extents, format %d?!",
+ ip->i_ino, ifp->if_nextents, ifp->if_format);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ xfs_err(sc->mp,
+ "[%u]: startoff %llu startblock %llu blockcount %llu state %u",
+ i++, irec.br_startoff,
+ irec.br_startblock, irec.br_blockcount,
+ irec.br_state);
+ }
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_attr_fork_remove(ip, sc->tp);
+ return 0;
+}
+
+/*
+ * Free all the attribute fork blocks of the file being repaired and delete the
+ * fork. The caller must ILOCK the scrub file and join it to the transaction.
+ * This function returns with the inode joined to a clean transaction.
+ */
+int
+xrep_xattr_reset_fork(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ trace_xrep_xattr_reset_fork(sc->ip, sc->ip);
+
+ /* Unmap all the attr blocks. */
+ if (xfs_ifork_has_extents(&sc->ip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ }
+
+ error = xrep_xattr_fork_remove(sc, sc->ip);
+ if (error)
+ return error;
+
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
+/*
+ * Free all the attribute fork blocks of the temporary file and delete the attr
+ * fork. The caller must ILOCK the tempfile and join it to the transaction.
+ * This function returns with the inode joined to a clean scrub transaction.
+ */
+int
+xrep_xattr_reset_tempfile_fork(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ trace_xrep_xattr_reset_fork(sc->ip, sc->tempip);
+
+ /*
+ * Wipe out the attr fork of the temp file so that regular inode
+ * inactivation won't trip over the corrupt attr fork.
+ */
+ if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ }
+
+ return xrep_xattr_fork_remove(sc, sc->tempip);
+}
+
+/*
+ * Find all the extended attributes for this inode by scraping them out of the
+ * attribute key blocks by hand, and flushing them into the temp file.
+ * When we're done, free the staging memory before exchanging the xattr
+ * structures to reduce memory usage.
+ */
+STATIC int
+xrep_xattr_salvage_attributes(
+ struct xrep_xattr *rx)
+{
+ struct xfs_inode *ip = rx->sc->ip;
+ int error;
+
+ /* Short format xattrs are easy! */
+ if (rx->sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+ error = xrep_xattr_recover_sf(rx);
+ if (error)
+ return error;
+
+ return xrep_xattr_flush_stashed(rx);
+ }
+
+ /*
+ * For non-inline xattr structures, the salvage function scans the
+ * buffer cache looking for potential attr leaf blocks. The scan
+ * requires the ability to lock any buffer found and runs independently
+ * of any transaction <-> buffer item <-> buffer linkage. Therefore,
+ * roll the transaction to ensure there are no buffers joined. We hold
+ * the ILOCK independently of the transaction.
+ */
+ error = xfs_trans_roll(&rx->sc->tp);
+ if (error)
+ return error;
+
+ error = xfs_iread_extents(rx->sc->tp, ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ error = xrep_xattr_recover(rx);
+ if (error)
+ return error;
+
+ return xrep_xattr_flush_stashed(rx);
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file. The caller
+ * must hold the tempfile's IOLOCK, must not hold any ILOCKs, and must not be in
+ * transaction context.
+ */
+STATIC int
+xrep_xattr_replay_pptr_update(
+ struct xrep_xattr *rx,
+ const struct xfs_name *xname,
+ struct xrep_xattr_pptr *pptr)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ switch (pptr->action) {
+ case XREP_XATTR_PPTR_ADD:
+ /* Create parent pointer. */
+ trace_xrep_xattr_replay_parentadd(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ error = xfs_parent_set(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rx->pptr_args);
+ ASSERT(error != -EEXIST);
+ return error;
+ case XREP_XATTR_PPTR_REMOVE:
+ /* Remove parent pointer. */
+ trace_xrep_xattr_replay_parentremove(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ error = xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rx->pptr_args);
+ ASSERT(error != -ENOATTR);
+ return error;
+ }
+
+ ASSERT(0);
+ return -EIO;
+}
+
+/*
+ * Flush stashed parent pointer updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the xattr rebuild, since
+ * files can have a lot of hardlinks and the fs can be busy.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile
+ * IOLOCK.
+ */
+STATIC int
+xrep_xattr_replay_pptr_updates(
+ struct xrep_xattr *rx)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ mutex_lock(&rx->lock);
+ foreach_xfarray_idx(rx->pptr_recs, array_cur) {
+ struct xrep_xattr_pptr pptr;
+
+ error = xfarray_load(rx->pptr_recs, array_cur, &pptr);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rx->pptr_names, pptr.name_cookie,
+ &rx->xname, pptr.namelen);
+ if (error)
+ goto out_unlock;
+ mutex_unlock(&rx->lock);
+
+ error = xrep_xattr_replay_pptr_update(rx, &rx->xname, &pptr);
+ if (error)
+ return error;
+
+ mutex_lock(&rx->lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rx->pptr_recs);
+ xfblob_truncate(rx->pptr_names);
+ mutex_unlock(&rx->lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rx->lock);
+ return error;
+}
+
+/*
+ * Remember that we want to create a parent pointer in the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_xattr_stash_parentadd(
+ struct xrep_xattr *rx,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_xattr_pptr pptr = {
+ .action = XREP_XATTR_PPTR_ADD,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_xattr_stash_parentadd(rx->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Remember that we want to remove a parent pointer from the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_xattr_stash_parentremove(
+ struct xrep_xattr *rx,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_xattr_pptr pptr = {
+ .action = XREP_XATTR_PPTR_REMOVE,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_xattr_stash_parentremove(rx->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Capture dirent updates being made by other threads. We will have to replay
+ * the parent pointer updates before exchanging attr forks.
+ */
+STATIC int
+xrep_xattr_live_dirent_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_xattr *rx;
+ struct xfs_scrub *sc;
+ int error;
+
+ rx = container_of(nb, struct xrep_xattr, dhook.dirent_hook.nb);
+ sc = rx->sc;
+
+ /*
+ * This thread updated a dirent that points to the file that we're
+ * repairing, so stash the update for replay against the temporary
+ * file.
+ */
+ if (p->ip->i_ino != sc->ip->i_ino)
+ return NOTIFY_DONE;
+
+ mutex_lock(&rx->lock);
+ if (p->delta > 0)
+ error = xrep_xattr_stash_parentadd(rx, p->name, p->dp);
+ else
+ error = xrep_xattr_stash_parentremove(rx, p->name, p->dp);
+ if (error)
+ rx->live_update_aborted = true;
+ mutex_unlock(&rx->lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Prepare both inodes' attribute forks for an exchange. Promote the tempfile
+ * from short format to leaf format, and if the file being repaired has a short
+ * format attr fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_xattr_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the tempfile's attributes are in shortform format, convert that
+ * to a single leaf extent so that we can use the atomic mapping
+ * exchange.
+ */
+ if (temp_local) {
+ struct xfs_da_args args = {
+ .dp = sc->tempip,
+ .geo = sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .trans = sc->tp,
+ .total = 1,
+ .owner = sc->ip->i_ino,
+ };
+
+ error = xfs_attr_shortform_to_leaf(&args);
+ if (error)
+ return error;
+
+ /*
+ * Roll the deferred log items to get us back to a clean
+ * transaction.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform attribute fork, convert
+ * that to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ }
+
+ return 0;
+}
+
+/* Exchange the temporary file's attribute fork with the one being repaired. */
+int
+xrep_xattr_swap(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ bool ip_local, temp_local;
+ int error = 0;
+
+ ip_local = sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If both files have a local format attr fork and the rebuilt
+ * xattr data would fit in the repaired file's attr fork, just copy
+ * the contents from the tempfile and declare ourselves done.
+ */
+ if (ip_local && temp_local) {
+ int forkoff;
+ int newsize;
+
+ newsize = xfs_attr_sf_totsize(sc->tempip);
+ forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize);
+ if (forkoff > 0) {
+ sc->ip->i_forkoff = forkoff;
+ xrep_tempfile_copyout_local(sc, XFS_ATTR_FORK);
+ return 0;
+ }
+ }
+
+ /* Otherwise, make sure both attr forks are in block-mapping mode. */
+ error = xrep_xattr_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new extended attribute structure.
+ */
+STATIC int
+xrep_xattr_finalize_tempfile(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible xattr updates.
+ * Replay all queued parent pointer updates into the tempfile before
+ * exchanging the contents, even if that means dropping the ILOCKs and
+ * the transaction.
+ */
+ do {
+ error = xrep_xattr_replay_pptr_updates(rx);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rx->pptr_recs) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
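+
+/*
+ * The loop above converges because each pass replays every parent pointer
+ * update stashed so far and then retakes the ILOCKs; a pass that finds
+ * pptr_recs empty once the locks are held proves that no rename raced
+ * with the replay, so the attr fork exchange can proceed safely.
+ */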
+
+/*
+ * Exchange the new extended attribute data (which we created in the tempfile)
+ * with the file being repaired.
+ */
+STATIC int
+xrep_xattr_rebuild_tree(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ /*
+ * If we didn't find any attributes to salvage, repair the file by
+ * zapping its attr fork.
+ */
+ if (rx->attrs_found == 0) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ error = xrep_xattr_reset_fork(sc);
+ if (error)
+ return error;
+
+ goto forget_acls;
+ }
+
+ trace_xrep_xattr_rebuild_tree(sc->ip, sc->tempip);
+
+ /*
+ * Commit the repair transaction and drop the ILOCKs so that we can use
+ * the atomic file content exchange helper functions to compute the
+ * correct resource reservations.
+ *
+ * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent xattr
+ * modifications, but there's nothing to prevent userspace from reading
+ * the attributes until we're ready for the exchange operation. Reads
+ * will return -EIO without shutting down the fs, so we're ok with
+ * that.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK on the temporary file so that we can run xattr
+ * operations with the same locks held as we would for a normal file.
+ * We still hold sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rx->sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed parent pointer updates to the temp file. After this
+ * point, we're ready to exchange attr fork mappings.
+ */
+ error = xrep_xattr_finalize_tempfile(rx);
+ if (error)
+ return error;
+
+ /*
+ * Exchange the blocks mapped by the tempfile's attr fork with the file
+ * being repaired. The old attr blocks will then be attached to the
+ * tempfile, so reap its attr fork.
+ */
+ error = xrep_xattr_swap(sc, &rx->tx);
+ if (error)
+ return error;
+
+ error = xrep_xattr_reset_tempfile_fork(sc);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target file.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+
+forget_acls:
+ /* Invalidate cached ACLs now that we've reloaded all the xattrs. */
+ xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_FILE);
+ xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_DEFAULT);
+ return 0;
+}
+
+/* Tear down all the incore scan stuff we created. */
+STATIC void
+xrep_xattr_teardown(
+ struct xrep_xattr *rx)
+{
+ if (xfs_has_parent(rx->sc->mp))
+ xfs_dir_hook_del(rx->sc->mp, &rx->dhook);
+ if (rx->pptr_names)
+ xfblob_destroy(rx->pptr_names);
+ if (rx->pptr_recs)
+ xfarray_destroy(rx->pptr_recs);
+ xfblob_destroy(rx->xattr_blobs);
+ xfarray_destroy(rx->xattr_records);
+ mutex_destroy(&rx->lock);
+ kfree(rx);
+}
+
+/* Set up the filesystem scan so we can regenerate extended attributes. */
+STATIC int
+xrep_xattr_setup_scan(
+ struct xfs_scrub *sc,
+ struct xrep_xattr **rxp)
+{
+ struct xrep_xattr *rx;
+ char *descr;
+ int max_len;
+ int error;
+
+ rx = kzalloc(sizeof(struct xrep_xattr), XCHK_GFP_FLAGS);
+ if (!rx)
+ return -ENOMEM;
+ rx->sc = sc;
+ rx->can_flush = true;
+ rx->xname.name = rx->namebuf;
+
+ mutex_init(&rx->lock);
+
+ /*
+ * Allocate enough memory to handle loading local attr values from the
+ * xfblob data while flushing stashed attrs to the temporary file.
+ * We only realloc the buffer when salvaging remote attr values.
+ */
+ max_len = xfs_attr_leaf_entsize_local_max(sc->mp->m_attr_geo->blksize);
+ error = xchk_setup_xattr_buf(rx->sc, max_len);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ goto out_rx;
+
+ /* Set up some staging for salvaged attribute keys and values */
+ descr = xchk_xfile_ino_descr(sc, "xattr keys");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_xattr_key),
+ &rx->xattr_records);
+ kfree(descr);
+ if (error)
+ goto out_rx;
+
+ descr = xchk_xfile_ino_descr(sc, "xattr names");
+ error = xfblob_create(descr, &rx->xattr_blobs);
+ kfree(descr);
+ if (error)
+ goto out_keys;
+
+ if (xfs_has_parent(sc->mp)) {
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+
+ descr = xchk_xfile_ino_descr(sc,
+ "xattr retained parent pointer entries");
+ error = xfarray_create(descr, 0,
+ sizeof(struct xrep_xattr_pptr),
+ &rx->pptr_recs);
+ kfree(descr);
+ if (error)
+ goto out_values;
+
+ descr = xchk_xfile_ino_descr(sc,
+ "xattr retained parent pointer names");
+ error = xfblob_create(descr, &rx->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_pprecs;
+
+ xfs_dir_hook_setup(&rx->dhook, xrep_xattr_live_dirent_update);
+ error = xfs_dir_hook_add(sc->mp, &rx->dhook);
+ if (error)
+ goto out_ppnames;
+ }
+
+ *rxp = rx;
+ return 0;
+out_ppnames:
+ xfblob_destroy(rx->pptr_names);
+out_pprecs:
+ xfarray_destroy(rx->pptr_recs);
+out_values:
+ xfblob_destroy(rx->xattr_blobs);
+out_keys:
+ xfarray_destroy(rx->xattr_records);
+out_rx:
+ mutex_destroy(&rx->lock);
+ kfree(rx);
+ return error;
+}
+
+/*
+ * Repair the extended attribute metadata.
+ *
+ * XXX: Remote attribute value buffers encompass the entire (up to 64k) buffer.
+ * The buffer cache in XFS can't handle aliased multiblock buffers, so this
+ * might misbehave if the attr fork is crosslinked with other filesystem
+ * metadata.
+ */
+int
+xrep_xattr(
+ struct xfs_scrub *sc)
+{
+ struct xrep_xattr *rx = NULL;
+ int error;
+
+ if (!xfs_inode_hasattr(sc->ip))
+ return -ENOENT;
+
+ /* The rmapbt is required to reap the old attr fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ error = xrep_xattr_setup_scan(sc, &rx);
+ if (error)
+ return error;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xrep_xattr_salvage_attributes(rx);
+ if (error)
+ goto out_scan;
+
+ if (rx->live_update_aborted) {
+ error = -EIO;
+ goto out_scan;
+ }
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_scan;
+
+ error = xrep_xattr_rebuild_tree(rx);
+ if (error)
+ goto out_scan;
+
+out_scan:
+ xrep_xattr_teardown(rx);
+ return error;
+}
diff --git a/fs/xfs/scrub/attr_repair.h b/fs/xfs/scrub/attr_repair.h
new file mode 100644
index 000000000000..979729bd4a5f
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ATTR_REPAIR_H__
+#define __XFS_SCRUB_ATTR_REPAIR_H__
+
+struct xrep_tempexch;
+
+int xrep_xattr_swap(struct xfs_scrub *sc, struct xrep_tempexch *tx);
+int xrep_xattr_reset_fork(struct xfs_scrub *sc);
+int xrep_xattr_reset_tempfile_fork(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_ATTR_REPAIR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 0cb8d43912a8..7ba35a7a7920 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -40,22 +40,23 @@ struct xbitmap64_node {
* These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
* forward-declare them anyway for clarity.
*/
-static inline void
+static inline __maybe_unused void
xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline void
+static inline __maybe_unused void
xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline struct xbitmap64_node *
+static inline __maybe_unused struct xbitmap64_node *
xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start,
uint64_t last);
-static inline struct xbitmap64_node *
+static inline __maybe_unused struct xbitmap64_node *
xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start,
uint64_t last);
INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap64_tree)
+ __bn_subtree_last, START, LAST, static inline __maybe_unused,
+ xbitmap64_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap64_extent(bn, bitmap) \
@@ -314,22 +315,23 @@ struct xbitmap32_node {
* These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
* forward-declare them anyway for clarity.
*/
-static inline void
+static inline __maybe_unused void
xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root);
-static inline void
+static inline __maybe_unused void
xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root);
-static inline struct xbitmap32_node *
+static inline __maybe_unused struct xbitmap32_node *
xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start,
uint32_t last);
-static inline struct xbitmap32_node *
+static inline __maybe_unused struct xbitmap32_node *
xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start,
uint32_t last);
INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap32_tree)
+ __bn_subtree_last, START, LAST, static inline __maybe_unused,
+ xbitmap32_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap32_extent(bn, bitmap) \
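
A note on the annotation threaded through INTERVAL_TREE_DEFINE above: __maybe_unused suppresses unused-function warnings for definitions that a macro stamps out as a family, of which a given translation unit may call only a subset. A standalone sketch of the idiom (the DEFINE_MINMAX macro is invented for illustration; the kernel's __maybe_unused expands to the same compiler attribute):

#include <stdio.h>

/* What the kernel's __maybe_unused boils down to on GCC/Clang. */
#define __maybe_unused __attribute__((__unused__))

/* Stamp out a helper family; callers may use only part of it. */
#define DEFINE_MINMAX(type)						\
static __maybe_unused type type##_min(type a, type b)			\
{									\
	return a < b ? a : b;						\
}									\
static __maybe_unused type type##_max(type a, type b)			\
{									\
	return a > b ? a : b;						\
}

DEFINE_MINMAX(int)

int main(void)
{
	/* int_max() is never called; -Wunused-function stays quiet. */
	printf("%d\n", int_min(3, 5));
	return 0;
}
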
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 47a20cf5205f..1ad8ec63a7f4 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -31,6 +31,8 @@
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -445,7 +447,7 @@ xchk_perag_read_headers(
{
int error;
- error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
@@ -781,7 +783,7 @@ xchk_iget(
{
ASSERT(sc->tp != NULL);
- return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
+ return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
}
/*
@@ -827,13 +829,13 @@ again:
* in the iget cache miss path.
*/
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
xfs_perag_put(pag);
if (error)
return error;
- error = xfs_iget(mp, tp, inum,
- XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
+ error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
+ ipp);
if (error == -EAGAIN) {
/*
* The inode may be in core but temporarily unavailable and may
@@ -1060,12 +1062,6 @@ xchk_irele(
spin_lock(&VFS_I(ip)->i_lock);
VFS_I(ip)->i_state &= ~I_DONTCACHE;
spin_unlock(&VFS_I(ip)->i_lock);
- } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
- /*
- * If this is the last reference to the inode and the caller
- * permits it, set DONTCACHE to avoid thrashing.
- */
- d_mark_dontcache(VFS_I(ip));
}
xfs_irele(ip);
@@ -1202,27 +1198,12 @@ xchk_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
- __u32 smtype = sc->sm->sm_type;
- unsigned int sick_mask = sc->sick_mask;
+ struct xfs_scrub_subord *sub;
int error;
- sc->sm->sm_type = scrub_type;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- break;
- }
-
- sc->sick_mask = sick_mask;
- sc->sm->sm_type = smtype;
+ sub = xchk_scrub_create_subord(sc, scrub_type);
+ error = sub->sc.ops->scrub(&sub->sc);
+ xchk_scrub_free_subord(sub);
return error;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 89f7bbec887e..3d5f1f6b4b7b 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -6,31 +6,6 @@
#ifndef __XFS_SCRUB_COMMON_H__
#define __XFS_SCRUB_COMMON_H__
-/*
- * We /could/ terminate a scrub/repair operation early. If we're not
- * in a good place to continue (fatal signal, etc.) then bail out.
- * Note that we're careful not to make any judgements about *error.
- */
-static inline bool
-xchk_should_terminate(
- struct xfs_scrub *sc,
- int *error)
-{
- /*
- * If preemption is disabled, we need to yield to the scheduler every
- * few seconds so that we don't run afoul of the soft lockup watchdog
- * or RCU stall detector.
- */
- cond_resched();
-
- if (fatal_signal_pending(current)) {
- if (*error == 0)
- *error = -EINTR;
- return true;
- }
- return false;
-}
-
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
int xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
@@ -92,6 +67,7 @@ int xchk_setup_directory(struct xfs_scrub *sc);
int xchk_setup_xattr(struct xfs_scrub *sc);
int xchk_setup_symlink(struct xfs_scrub *sc);
int xchk_setup_parent(struct xfs_scrub *sc);
+int xchk_setup_dirtree(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_setup_rtbitmap(struct xfs_scrub *sc);
int xchk_setup_rtsummary(struct xfs_scrub *sc);
@@ -212,6 +188,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
}
bool xchk_dir_looks_zapped(struct xfs_inode *dp);
+bool xchk_pptr_looks_zapped(struct xfs_inode *ip);
#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
diff --git a/fs/xfs/scrub/dab_bitmap.h b/fs/xfs/scrub/dab_bitmap.h
new file mode 100644
index 000000000000..0c6e3aad4395
--- /dev/null
+++ b/fs/xfs/scrub/dab_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_DAB_BITMAP_H__
+#define __XFS_SCRUB_DAB_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_dablk_t */
+
+struct xdab_bitmap {
+ struct xbitmap32 dabitmap;
+};
+
+static inline void xdab_bitmap_init(struct xdab_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->dabitmap);
+}
+
+static inline void xdab_bitmap_destroy(struct xdab_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->dabitmap);
+}
+
+static inline int xdab_bitmap_set(struct xdab_bitmap *bitmap,
+ xfs_dablk_t dabno, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->dabitmap, dabno, len);
+}
+
+static inline bool xdab_bitmap_test(struct xdab_bitmap *bitmap,
+ xfs_dablk_t dabno, xfs_extlen_t *len)
+{
+ return xbitmap32_test(&bitmap->dabitmap, dabno, len);
+}
+
+#endif /* __XFS_SCRUB_DAB_BITMAP_H__ */
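
The header above is an instance of a zero-cost type-safety idiom: wrap a generic structure in a one-member struct plus trivial inline forwarders, so the compiler rejects mixing values from different block-number domains while emitting identical code. A userspace sketch of the same idea with two invented integer domains:

#include <stdio.h>

/* Distinct one-member wrappers: the compiler can tell them apart. */
typedef struct { unsigned int v; } dablk_t;
typedef struct { unsigned int v; } agblock_t;

/* Forwarder that accepts only directory-block numbers. */
static inline void print_dablk(dablk_t b)
{
	printf("dablk %u\n", b.v);
}

int main(void)
{
	dablk_t d = { 7 };
	agblock_t a = { 42 };

	print_dablk(d);
	/* print_dablk(a); would not compile: wrong domain. */
	(void)a;
	return 0;
}
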
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 82b150d3b8b7..056de4819f86 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -78,6 +78,22 @@ xchk_da_set_corrupt(
__return_address);
}
+/* Flag a da btree node in need of optimization. */
+void
+xchk_da_set_preen(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_scrub *sc = ds->sc;
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xchk_fblock_preen(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ __return_address);
+}
+
+/* Find an entry at a certain level in a da btree. */
static struct xfs_da_node_entry *
xchk_da_btree_node_entry(
struct xchk_da_btree *ds,
@@ -320,6 +336,7 @@ xchk_da_btree_block(
struct xfs_da3_blkinfo *hdr3;
struct xfs_da_args *dargs = &ds->dargs;
struct xfs_inode *ip = ds->dargs.dp;
+ xfs_failaddr_t fa;
xfs_ino_t owner;
int *pmaxrecs;
struct xfs_da3_icnode_hdr nodehdr;
@@ -442,6 +459,12 @@ xchk_da_btree_block(
goto out_freebp;
}
+ fa = xfs_da3_header_check(blk->bp, dargs->owner);
+ if (fa) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+
/*
* If we've been handed a block that is below the dabtree root, does
* its hashval match what the parent block expected to see?
@@ -494,6 +517,7 @@ xchk_da_btree(
ds->dargs.whichfork = whichfork;
ds->dargs.trans = sc->tp;
ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->dargs.owner = sc->ip->i_ino;
ds->state = xfs_da_state_alloc(&ds->dargs);
ds->sc = sc;
ds->private = private;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index 4f8c2138a1ec..de291e3b77dd 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -35,6 +35,8 @@ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error);
/* Check for da btree corruption. */
void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level);
+void xchk_da_set_preen(struct xchk_da_btree *ds, int level);
+
int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp);
int xchk_da_btree(struct xfs_scrub *sc, int whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 076a310b8eb0..bf9199e8df63 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -16,22 +16,70 @@
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_health.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/readdir.h"
#include "scrub/health.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
/* Set us up to scrub directories. */
int
xchk_setup_directory(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_directory(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
/* Directories */
+/* Deferred directory entry that we saved for later. */
+struct xchk_dirent {
+ /* Cookie for retrieval of the dirent name. */
+ xfblob_cookie name_cookie;
+
+ /* Child inode number. */
+ xfs_ino_t ino;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+};
+
+struct xchk_dir {
+ struct xfs_scrub *sc;
+
+ /* information for parent pointer validation. */
+ struct xfs_parent_rec pptr_rec;
+ struct xfs_da_args pptr_args;
+
+ /* Fixed-size array of xchk_dirent structures. */
+ struct xfarray *dir_entries;
+
+ /* Blobs containing dirent names. */
+ struct xfblob *dir_names;
+
+ /* If we've cycled the ILOCK, we must revalidate deferred dirents. */
+ bool need_revalidate;
+
+ /* Name buffer for dirent revalidation. */
+ struct xfs_name xname;
+ uint8_t namebuf[MAXNAMELEN];
+};
+
/* Scrub a directory entry. */
/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */
@@ -55,6 +103,108 @@ xchk_dir_check_ftype(
}
/*
+ * Try to lock a child file for checking parent pointers. Returns the inode
+ * lock flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_dir_lock_child(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af))
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
+
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+}
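
The lock ladder above never sleeps: every acquisition is a trylock, and any failure backs out whatever was already taken so the caller can defer the check rather than deadlock against a thread locking the same files in the opposite order. A compact userspace sketch of the trylock-or-back-out shape (pthread rwlocks stand in for the IOLOCK/ILOCK pair; all names invented):

#include <pthread.h>

struct file_locks {
	pthread_rwlock_t iolock;	/* stand-in for XFS_IOLOCK */
	pthread_rwlock_t ilock;		/* stand-in for XFS_ILOCK */
};

#define LOCKED_IO	(1U << 0)
#define LOCKED_I	(1U << 1)

/* Returns the set of locks now held, or 0 after backing out fully. */
static unsigned int trylock_child(struct file_locks *f)
{
	if (pthread_rwlock_tryrdlock(&f->iolock) != 0)
		return 0;

	if (pthread_rwlock_tryrdlock(&f->ilock) != 0) {
		/* Back out so that failure means we hold nothing. */
		pthread_rwlock_unlock(&f->iolock);
		return 0;
	}

	return LOCKED_IO | LOCKED_I;
}

int main(void)
{
	struct file_locks f = {
		.iolock = PTHREAD_RWLOCK_INITIALIZER,
		.ilock	= PTHREAD_RWLOCK_INITIALIZER,
	};
	unsigned int held = trylock_child(&f);

	if (held & LOCKED_I)
		pthread_rwlock_unlock(&f.ilock);
	if (held & LOCKED_IO)
		pthread_rwlock_unlock(&f.iolock);
	return 0;
}
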
+
+/* Check the backwards link (parent pointer) associated with this dirent. */
+STATIC int
+xchk_dir_parent_pointer(
+ struct xchk_dir *sd,
+ const struct xfs_name *name,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = sd->sc;
+ int error;
+
+ xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip);
+ error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec,
+ &sd->pptr_args);
+ if (error == -ENOATTR)
+ xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0);
+
+ return 0;
+}
+
+/* Look for a parent pointer matching this dirent, if the child isn't busy. */
+STATIC int
+xchk_dir_check_pptr_fast(
+ struct xchk_dir *sd,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = sd->sc;
+ unsigned int lockmode;
+ int error;
+
+ /* dot and dotdot entries do not have parent pointers */
+ if (xfs_dir2_samename(name, &xfs_name_dot) ||
+ xfs_dir2_samename(name, &xfs_name_dotdot))
+ return 0;
+
+ /* No self-referential non-dot or dotdot dirents. */
+ if (ip == sc->ip) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return -ECANCELED;
+ }
+
+ /* Try to lock the inode. */
+ lockmode = xchk_dir_lock_child(sc, ip);
+ if (!lockmode) {
+ struct xchk_dirent save_de = {
+ .namelen = name->len,
+ .ino = ip->i_ino,
+ };
+
+ /* Couldn't lock the inode, so save the dirent for later. */
+ trace_xchk_dir_defer(sc->ip, name, ip->i_ino);
+
+ error = xfblob_storename(sd->dir_names, &save_de.name_cookie,
+ name);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+
+ error = xfarray_append(sd->dir_entries, &save_de);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+
+ return 0;
+ }
+
+ error = xchk_dir_parent_pointer(sd, name, ip);
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
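
The deferral above leans on the staging layout used throughout scrub: fixed-size records live in a flat array (xfarray) while variable-length names live in a separate blob store (xfblob), and each record carries a cookie plus length so the name can be recovered on the second pass. A userspace sketch of the record-plus-cookie layout (structures invented for illustration):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef size_t blob_cookie;		/* offset into the name heap */

/* Fixed-size record, cheap to keep in a flat array. */
struct saved_dirent {
	blob_cookie	name_cookie;	/* where the name was stashed */
	uint64_t	ino;
	uint8_t		namelen;
};

static char name_heap[4096];		/* stand-in for an xfblob */
static size_t name_heap_used;

static blob_cookie store_name(const char *name, uint8_t len)
{
	blob_cookie c = name_heap_used;

	memcpy(name_heap + c, name, len);
	name_heap_used += len;
	return c;
}

int main(void)
{
	struct saved_dirent de = { .ino = 128, .namelen = 5 };

	de.name_cookie = store_name("hello", de.namelen);

	/* Second pass: rehydrate the name from cookie + length. */
	printf("ino %llu name %.*s\n", (unsigned long long)de.ino,
			de.namelen, name_heap + de.name_cookie);
	return 0;
}
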
+
+/*
* Scrub a single directory entry.
*
* Check the inode number to make sure it's sane, then we check that we can
@@ -71,6 +221,7 @@ xchk_dir_actor(
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip;
+ struct xchk_dir *sd = priv;
xfs_ino_t lookup_ino;
xfs_dablk_t offset;
int error = 0;
@@ -137,6 +288,14 @@ xchk_dir_actor(
goto out;
xchk_dir_check_ftype(sc, offset, ip, name->type);
+
+ if (xfs_has_parent(mp)) {
+ error = xchk_dir_check_pptr_fast(sd, dapos, name, ip);
+ if (error)
+ goto out_rele;
+ }
+
+out_rele:
xchk_irele(sc, ip);
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
@@ -196,8 +355,8 @@ xchk_dir_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno,
- XFS_DABUF_MAP_HOLE_OK, &bp);
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner,
+ rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp);
if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
&error))
goto out;
@@ -315,10 +474,11 @@ xchk_directory_data_bestfree(
/* dir block format */
if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
- error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
+ error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp);
} else {
/* dir data format */
- error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp);
+ error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk,
+ 0, &bp);
}
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -470,7 +630,7 @@ xchk_directory_leaf1_bestfree(
int error;
/* Read the free space block. */
- error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
return error;
xchk_buffer_recheck(sc, bp);
@@ -531,10 +691,9 @@ xchk_directory_leaf1_bestfree(
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
best = be16_to_cpu(*bestp);
- error = xfs_dir3_data_read(sc->tp, sc->ip,
+ error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
xfs_dir2_db_to_da(args->geo, i),
- XFS_DABUF_MAP_HOLE_OK,
- &dbp);
+ XFS_DABUF_MAP_HOLE_OK, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
@@ -577,7 +736,7 @@ xchk_directory_free_bestfree(
int error;
/* Read the free space block */
- error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
+ error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
return error;
xchk_buffer_recheck(sc, bp);
@@ -597,7 +756,7 @@ xchk_directory_free_bestfree(
stale++;
continue;
}
- error = xfs_dir3_data_read(sc->tp, sc->ip,
+ error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
(freehdr.firstdb + i) * args->geo->fsbcount,
0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
@@ -621,10 +780,11 @@ xchk_directory_blocks(
{
struct xfs_bmbt_irec got;
struct xfs_da_args args = {
- .dp = sc ->ip,
+ .dp = sc->ip,
.whichfork = XFS_DATA_FORK,
.geo = sc->mp->m_dir_geo,
.trans = sc->tp,
+ .owner = sc->ip->i_ino,
};
struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
@@ -648,7 +808,8 @@ xchk_directory_blocks(
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
/* Is this a block dir? */
- error = xfs_dir2_isblock(&args, &is_block);
+ if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK)
+ is_block = true;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -752,11 +913,148 @@ out:
return error;
}
+/*
+ * Revalidate a dirent that we collected in the past but couldn't check because
+ * of lock contention. Returns 0 if the dirent is still valid, -ENOENT if it
+ * has gone away on us, or another negative errno.
+ */
+STATIC int
+xchk_dir_revalidate_dirent(
+ struct xchk_dir *sd,
+ const struct xfs_name *xname,
+ xfs_ino_t ino)
+{
+ struct xfs_scrub *sc = sd->sc;
+ xfs_ino_t child_ino;
+ int error;
+
+ /*
+ * Look up the directory entry. If we get -ENOENT, the directory entry
+ * went away and there's nothing to revalidate. Return any other
+ * error.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino);
+ if (error)
+ return error;
+
+ /* The inode number changed, nothing to revalidate. */
+ if (ino != child_ino)
+ return -ENOENT;
+
+ return 0;
+}
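
This is the usual optimistic-revalidation pattern: a fact sampled before a lock was cycled (here, the name-to-inode mapping) must be looked up again and compared before it can be trusted. A sketch with a stubbed-out lookup (the stub is invented; a real implementation would consult the directory):

#include <errno.h>
#include <stdint.h>

/* Stub standing in for a real directory lookup. */
static int dir_lookup(const char *name, uint64_t *ino)
{
	(void)name;
	*ino = 128;
	return 0;
}

/*
 * Return 0 if the sampled (name, ino) pair still holds after the locks
 * were cycled, or -ENOENT if the entry changed or vanished meanwhile.
 */
static int revalidate(const char *name, uint64_t sampled_ino)
{
	uint64_t now;
	int error = dir_lookup(name, &now);

	if (error)
		return error;
	return now == sampled_ino ? 0 : -ENOENT;
}

int main(void)
{
	return revalidate("hello", 128) ? 1 : 0;
}
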
+
+/*
+ * Check a directory entry's parent pointers the slow way, which means we cycle
+ * locks a bunch and put up with revalidation until we get it done.
+ */
+STATIC int
+xchk_dir_slow_dirent(
+ struct xchk_dir *sd,
+ struct xchk_dirent *dirent,
+ const struct xfs_name *xname)
+{
+ struct xfs_scrub *sc = sd->sc;
+ struct xfs_inode *ip;
+ unsigned int lockmode;
+ int error;
+
+ /* Check that the deferred dirent still exists. */
+ if (sd->need_revalidate) {
+ error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
+ if (error == -ENOENT)
+ return 0;
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+ }
+
+ error = xchk_iget(sc, dirent->ino, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+
+ /*
+ * If we can grab both IOLOCK and ILOCK of the alleged child, we can
+ * proceed with the validation.
+ */
+ lockmode = xchk_dir_lock_child(sc, ip);
+ if (lockmode) {
+ trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino);
+ goto check_pptr;
+ }
+
+ /*
+ * We couldn't lock the child file. Drop all the locks and try to
+ * get them again, one at a time.
+ */
+ xchk_iunlock(sc, sc->ilock_flags);
+ sd->need_revalidate = true;
+
+ trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino);
+
+ error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode);
+ if (error)
+ goto out_rele;
+
+ /* Revalidate, since we just cycled the locks. */
+ error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
+ if (error == -ENOENT) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_unlock;
+
+check_pptr:
+ error = xchk_dir_parent_pointer(sd, xname, ip);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+out_rele:
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Check all the dirents that we deferred the first time around. */
+STATIC int
+xchk_dir_finish_slow_dirents(
+ struct xchk_dir *sd)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ foreach_xfarray_idx(sd->dir_entries, array_cur) {
+ struct xchk_dirent dirent;
+
+ if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ error = xfarray_load(sd->dir_entries, array_cur, &dirent);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(sd->dir_names, dirent.name_cookie,
+ &sd->xname, dirent.namelen);
+ if (error)
+ return error;
+
+ error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/* Scrub a whole directory. */
int
xchk_directory(
struct xfs_scrub *sc)
{
+ struct xchk_dir *sd;
int error;
if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
@@ -789,9 +1087,60 @@ xchk_directory(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return 0;
+ sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS);
+ if (!sd)
+ return -ENOMEM;
+ sd->sc = sc;
+ sd->xname.name = sd->namebuf;
+
+ if (xfs_has_parent(sc->mp)) {
+ char *descr;
+
+ /*
+ * Set up some staging memory for dirents that we can't check
+ * due to locking contention.
+ */
+ descr = xchk_xfile_ino_descr(sc, "slow directory entries");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_dirent),
+ &sd->dir_entries);
+ kfree(descr);
+ if (error)
+ goto out_sd;
+
+ descr = xchk_xfile_ino_descr(sc, "slow directory entry names");
+ error = xfblob_create(descr, &sd->dir_names);
+ kfree(descr);
+ if (error)
+ goto out_entries;
+ }
+
/* Look up every name in this directory by hash. */
- error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
- if (error && error != -ECANCELED)
+ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd);
+ if (error == -ECANCELED)
+ error = 0;
+ if (error)
+ goto out_names;
+
+ if (xfs_has_parent(sc->mp)) {
+ error = xchk_dir_finish_slow_dirents(sd);
+ if (error == -ETIMEDOUT) {
+ /* Couldn't grab a lock, scrub was marked incomplete */
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+ }
+
+out_names:
+ if (sd->dir_names)
+ xfblob_destroy(sd->dir_names);
+out_entries:
+ if (sd->dir_entries)
+ xfarray_destroy(sd->dir_entries);
+out_sd:
+ kvfree(sd);
+ if (error)
return error;
/* If the dir is clean, it is clearly not zapped. */
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
new file mode 100644
index 000000000000..64679fe08446
--- /dev/null
+++ b/fs/xfs/scrub/dir_repair.c
@@ -0,0 +1,1957 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_ag.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
+#include "scrub/reap.h"
+#include "scrub/findparent.h"
+#include "scrub/orphanage.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Directory Repair
+ * ================
+ *
+ * We repair directories by reading the directory data blocks looking for
+ * directory entries that look salvageable (name passes verifiers, entry points
+ * to a valid allocated inode, etc). Each entry worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * directory to constrain memory use. Batching the construction of the
+ * temporary directory in this fashion reduces lock cycling of the directory
+ * being repaired and the temporary directory, and will later become important
+ * for parent pointer scanning.
+ *
+ * If parent pointers are enabled on this filesystem, we instead reconstruct
+ * the directory by visiting each parent pointer of each file in the filesystem
+ * and translating the relevant parent pointer records into dirents. In this
+ * case, it is advantageous to stash all directory entries created from parent
+ * pointers for a single child file before replaying them into the temporary
+ * directory. To save memory, the live filesystem scan reuses the findparent
+ * fields. Directory repair chooses either parent pointer scanning or
+ * directory entry salvaging, but not both.
+ *
+ * Directory entries added to the temporary directory do not elevate the link
+ * counts of the inodes found. When salvaging completes, the remaining stashed
+ * entries are replayed to the temporary directory. An atomic mapping exchange
+ * is used to commit the new directory blocks to the directory being repaired.
+ * This will disrupt readdir cursors.
+ *
+ * Locking Issues
+ * --------------
+ *
+ * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
+ * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects
+ * b's dotdot update. This is in contrast to every other dotdot update (link,
+ * remove, mkdir). If the repair code drops the ILOCK, it must either
+ * revalidate the dotdot entry or use dirent hooks to capture updates from
+ * other threads.
+ */
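
The batching strategy described above is a bounded-memory pattern: accumulate salvaged records until a byte budget is exceeded, flush them into the temporary structure, truncate the staging area, and continue. A userspace sketch of the budget-and-flush loop (the constant mirrors XREP_DIR_MAX_STASH_BYTES defined below; the helpers are invented):

#include <stddef.h>
#include <stdio.h>

#define MAX_STASH_BYTES	(8UL * 4096)	/* cf. XREP_DIR_MAX_STASH_BYTES */

static size_t stashed_bytes;

static void stash_one_dirent(size_t nbytes)
{
	stashed_bytes += nbytes;
}

/* Replay everything stashed so far, then empty the staging area. */
static void flush_stashed(void)
{
	printf("flushing %zu bytes to the temp dir\n", stashed_bytes);
	stashed_bytes = 0;
}

int main(void)
{
	for (int i = 0; i < 1000; i++) {
		stash_one_dirent(300);		/* one salvaged entry */
		if (stashed_bytes > MAX_STASH_BYTES)
			flush_stashed();	/* bound memory use */
	}
	flush_stashed();			/* final drain */
	return 0;
}
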
+
+/* Create a dirent in the tempdir. */
+#define XREP_DIRENT_ADD (1)
+
+/* Remove a dirent from the tempdir. */
+#define XREP_DIRENT_REMOVE (2)
+
+/* Directory entry to be restored in the new directory. */
+struct xrep_dirent {
+ /* Cookie for retrieval of the dirent name. */
+ xfblob_cookie name_cookie;
+
+ /* Target inode number. */
+ xfs_ino_t ino;
+
+ /* Length of the dirent name. */
+ uint8_t namelen;
+
+ /* File type of the dirent. */
+ uint8_t ftype;
+
+ /* XREP_DIRENT_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/*
+ * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
+ * before we write them to the temp dir.
+ */
+#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_dir {
+ struct xfs_scrub *sc;
+
+ /* Fixed-size array of xrep_dirent structures. */
+ struct xfarray *dir_entries;
+
+ /* Blobs containing directory entry names. */
+ struct xfblob *dir_names;
+
+ /* Information for exchanging data forks at the end. */
+ struct xrep_tempexch tx;
+
+ /* Preallocated args struct for performing dir operations */
+ struct xfs_da_args args;
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory. For directory salvaging when
+ * parent pointers are not enabled, we use the findparent_* functions
+ * on this object and access only the parent_ino field directly.
+ *
+ * When parent pointers are enabled, however, the pptr scanner uses the
+ * iscan, hooks, lock, and parent_ino fields of this object directly.
+ * @pscan.lock coordinates access to dir_entries, dir_names,
+ * parent_ino, subdirs, dirents, and args. This reduces the memory
+ * requirements of this structure.
+ */
+ struct xrep_parent_scan_info pscan;
+
+ /*
+ * Context information for attaching this directory to the lost+found
+ * if this directory does not have a parent.
+ */
+ struct xrep_adoption adoption;
+
+ /* How many subdirectories did we find? */
+ uint64_t subdirs;
+
+ /* How many dirents did we find? */
+ unsigned int dirents;
+
+ /* Should we move this directory to the orphanage? */
+ bool needs_adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ unsigned char namebuf[MAXNAMELEN];
+};
+
+/* Tear down all the incore stuff we created. */
+static void
+xrep_dir_teardown(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd = sc->buf;
+
+ xrep_findparent_scan_teardown(&rd->pscan);
+ xfblob_destroy(rd->dir_names);
+ xfarray_destroy(rd->dir_entries);
+}
+
+/* Set up for a directory repair. */
+int
+xrep_setup_directory(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ error = xrep_orphanage_try_create(sc);
+ if (error)
+ return error;
+
+ error = xrep_tempfile_create(sc, S_IFDIR);
+ if (error)
+ return error;
+
+ rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
+ if (!rd)
+ return -ENOMEM;
+ rd->sc = sc;
+ rd->xname.name = rd->namebuf;
+ sc->buf = rd;
+
+ return 0;
+}
+
+/*
+ * Look up the dotdot entry and confirm that it's really the parent.
+ * Returns NULLFSINO if we don't know what to do.
+ */
+static inline xfs_ino_t
+xrep_dir_lookup_parent(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t ino;
+ int error;
+
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
+ if (error)
+ return NULLFSINO;
+ if (!xfs_verify_dir_ino(sc->mp, ino))
+ return NULLFSINO;
+
+ error = xrep_findparent_confirm(sc, &ino);
+ if (error)
+ return NULLFSINO;
+
+ return ino;
+}
+
+/*
+ * Look up '..' in the dentry cache and confirm that it's really the parent.
+ * Returns NULLFSINO if the dcache misses or if the hit is implausible.
+ */
+static inline xfs_ino_t
+xrep_dir_dcache_parent(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t parent_ino;
+ int error;
+
+ parent_ino = xrep_findparent_from_dcache(sc);
+ if (parent_ino == NULLFSINO)
+ return parent_ino;
+
+ error = xrep_findparent_confirm(sc, &parent_ino);
+ if (error)
+ return NULLFSINO;
+
+ return parent_ino;
+}
+
+/* Try to find the parent of the directory being repaired. */
+STATIC int
+xrep_dir_find_parent(
+ struct xrep_dir *rd)
+{
+ xfs_ino_t ino;
+
+ ino = xrep_findparent_self_reference(rd->sc);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ ino = xrep_dir_dcache_parent(rd);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ ino = xrep_dir_lookup_parent(rd);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ /*
+ * A full filesystem scan is the last resort. On a busy filesystem,
+ * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means
+ * that we don't know who the parent is, so we should return to
+ * userspace.
+ */
+ return xrep_findparent_scan(&rd->pscan);
+}
+
+/*
+ * Decide if we want to salvage this entry. We don't bother with oversized
+ * names or the dot entry.
+ */
+STATIC bool
+xrep_dir_want_salvage(
+ struct xrep_dir *rd,
+ const char *name,
+ int namelen,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = rd->sc->mp;
+
+ /* No pointers to ourselves or to garbage. */
+ if (ino == rd->sc->ip->i_ino)
+ return false;
+ if (!xfs_verify_dir_ino(mp, ino))
+ return false;
+
+ /* No weird looking names or dot entries. */
+ if (namelen >= MAXNAMELEN || namelen <= 0)
+ return false;
+ if (namelen == 1 && name[0] == '.')
+ return false;
+ if (!xfs_dir2_namecheck(name, namelen))
+ return false;
+
+ return true;
+}
+
+/*
+ * Remember that we want to create a dirent in the tempdir. These stashed
+ * actions will be replayed later.
+ */
+STATIC int
+xrep_dir_stash_createname(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t ino)
+{
+ struct xrep_dirent dirent = {
+ .action = XREP_DIRENT_ADD,
+ .ino = ino,
+ .namelen = name->len,
+ .ftype = name->type,
+ };
+ int error;
+
+ trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
+
+ error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rd->dir_entries, &dirent);
+}
+
+/*
+ * Remember that we want to remove a dirent from the tempdir. These stashed
+ * actions will be replayed later.
+ */
+STATIC int
+xrep_dir_stash_removename(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t ino)
+{
+ struct xrep_dirent dirent = {
+ .action = XREP_DIRENT_REMOVE,
+ .ino = ino,
+ .namelen = name->len,
+ .ftype = name->type,
+ };
+ int error;
+
+ trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
+
+ error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rd->dir_entries, &dirent);
+}
+
+/* Allocate an in-core record to hold entries while we rebuild the dir data. */
+STATIC int
+xrep_dir_salvage_entry(
+ struct xrep_dir *rd,
+ unsigned char *name,
+ unsigned int namelen,
+ xfs_ino_t ino)
+{
+ struct xfs_name xname = {
+ .name = name,
+ };
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *ip;
+ unsigned int i = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Truncate the name to the first character that would trip namecheck.
+ * If we no longer have a name after that, ignore this entry.
+ */
+ while (i < namelen && name[i] != 0 && name[i] != '/')
+ i++;
+ if (i == 0)
+ return 0;
+ xname.len = i;
+
+ /* Ignore '..' entries; we already picked the new parent. */
+ if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
+ trace_xrep_dir_salvaged_parent(sc->ip, ino);
+ return 0;
+ }
+
+ trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
+
+ /*
+ * Compute the ftype or dump the entry if we can't. We don't lock the
+ * inode because inodes can't change type while we have a reference.
+ */
+ error = xchk_iget(sc, ino, &ip);
+ if (error)
+ return 0;
+
+ xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+ xchk_irele(sc, ip);
+
+ return xrep_dir_stash_createname(rd, &xname, ino);
+}
+
+/* Record a shortform directory entry for later reinsertion. */
+STATIC int
+xrep_dir_salvage_sf_entry(
+ struct xrep_dir *rd,
+ struct xfs_dir2_sf_hdr *sfp,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ xfs_ino_t ino;
+
+ ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
+ if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
+ return 0;
+
+ return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
+}
+
+/* Record a regular directory entry for later reinsertion. */
+STATIC int
+xrep_dir_salvage_data_entry(
+ struct xrep_dir *rd,
+ struct xfs_dir2_data_entry *dep)
+{
+ xfs_ino_t ino;
+
+ ino = be64_to_cpu(dep->inumber);
+ if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
+ return 0;
+
+ return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
+}
+
+/* Try to recover block/data format directory entries. */
+STATIC int
+xrep_dir_recover_data(
+ struct xrep_dir *rd,
+ struct xfs_buf *bp)
+{
+ struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo;
+ unsigned int offset;
+ unsigned int end;
+ int error = 0;
+
+ /*
+ * Loop over the data portion of the block.
+ * Each object is a real entry (dep) or an unused one (dup).
+ */
+ offset = geo->data_entry_offset;
+ end = min_t(unsigned int, BBTOB(bp->b_length),
+ xfs_dir3_data_end_offset(geo, bp->b_addr));
+
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ /* Skip unused entries. */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /* Don't walk off the end of the block. */
+ offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
+ if (offset > end)
+ break;
+
+ /* Ok, let's save this entry. */
+ error = xrep_dir_salvage_data_entry(rd, dep);
+ if (error)
+ return error;
+
+ }
+
+ return 0;
+}
+
+/* Try to recover shortform directory entries. */
+STATIC int
+xrep_dir_recover_sf(
+ struct xrep_dir *rd)
+{
+ struct xfs_dir2_sf_hdr *hdr;
+ struct xfs_dir2_sf_entry *sfep;
+ struct xfs_dir2_sf_entry *next;
+ struct xfs_ifork *ifp;
+ xfs_ino_t ino;
+ unsigned char *end;
+ int error = 0;
+
+ ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
+ hdr = ifp->if_data;
+ end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+
+ ino = xfs_dir2_sf_get_parent_ino(hdr);
+ trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
+
+ sfep = xfs_dir2_sf_firstentry(hdr);
+ while ((unsigned char *)sfep < end) {
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
+ if ((unsigned char *)next > end)
+ break;
+
+ /* Ok, let's save this entry. */
+ error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
+ if (error)
+ return error;
+
+ sfep = next;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to figure out the format of this directory from the data fork mappings
+ * and the directory size. If we can be reasonably sure of format, we can be
+ * more aggressive in salvaging directory entries. On return, @magic_guess
+ * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
+ * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
+ * and 0 if we can't tell.
+ */
+STATIC void
+xrep_dir_guess_format(
+ struct xrep_dir *rd,
+ __be32 *magic_guess)
+{
+ struct xfs_inode *dp = rd->sc->ip;
+ struct xfs_mount *mp = rd->sc->mp;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_fileoff_t last;
+ int error;
+
+ ASSERT(xfs_has_crc(mp));
+
+ *magic_guess = 0;
+
+ /*
+ * If there's a single directory block and the directory size is
+ * exactly one block, this has to be a single block format directory.
+ */
+ error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
+ if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
+ dp->i_disk_size == geo->blksize) {
+ *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ return;
+ }
+
+ /*
+ * If the last extent before the leaf offset matches the directory
+ * size and the directory size is larger than 1 block, this is a
+ * data format directory.
+ */
+ last = geo->leafblk;
+ error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
+ if (!error &&
+ XFS_FSB_TO_B(mp, last) > geo->blksize &&
+ XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
+ *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ return;
+ }
+}
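
The format guess above reduces to two size comparisons against the directory geometry. A distilled sketch of the two predicates, using plain integers in place of the xfs types (function names invented):

#include <stdbool.h>
#include <stdint.h>

/* Block format: the data fork is exactly one directory block long. */
static bool looks_like_block_format(uint64_t last_byte, uint64_t isize,
				    uint32_t blksize)
{
	return last_byte == blksize && isize == blksize;
}

/* Data format: more than one block, last data extent ends at isize. */
static bool looks_like_data_format(uint64_t last_byte, uint64_t isize,
				   uint32_t blksize)
{
	return last_byte > blksize && last_byte == isize;
}

int main(void)
{
	/* A 4k directory occupying one 4k block is block format. */
	return looks_like_block_format(4096, 4096, 4096) &&
	       !looks_like_data_format(4096, 4096, 4096) ? 0 : 1;
}
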
+
+/* Recover directory entries from a specific directory block. */
+STATIC int
+xrep_dir_recover_dirblock(
+ struct xrep_dir *rd,
+ __be32 magic_guess,
+ xfs_dablk_t dabno)
+{
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_buf *bp;
+ __be32 oldmagic;
+ int error;
+
+ /*
+ * Try to read buffer. We invalidate them in the next step so we don't
+ * bother to set a buffer type or ops.
+ */
+ error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
+ XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
+ if (error || !bp)
+ return error;
+
+ hdr = bp->b_addr;
+ oldmagic = hdr->magic;
+
+ trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
+ be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
+
+ /*
+ * If we're sure of the block's format, proceed with the salvage
+ * operation using the specified magic number.
+ */
+ if (magic_guess) {
+ hdr->magic = magic_guess;
+ goto recover;
+ }
+
+ /*
+ * If we couldn't guess what type of directory this is, then we will
+ * only salvage entries from directory blocks that match the magic
+ * number and pass verifiers.
+ */
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
+ goto out;
+ if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
+ goto out;
+ if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ default:
+ goto out;
+ }
+
+recover:
+ error = xrep_dir_recover_data(rd, bp);
+
+out:
+ hdr->magic = oldmagic;
+ xfs_trans_brelse(rd->sc->tp, bp);
+ return error;
+}
+
+static inline void
+xrep_dir_init_args(
+ struct xrep_dir *rd,
+ struct xfs_inode *dp,
+ const struct xfs_name *name)
+{
+ memset(&rd->args, 0, sizeof(struct xfs_da_args));
+ rd->args.geo = rd->sc->mp->m_dir_geo;
+ rd->args.whichfork = XFS_DATA_FORK;
+ rd->args.owner = rd->sc->ip->i_ino;
+ rd->args.trans = rd->sc->tp;
+ rd->args.dp = dp;
+ if (!name)
+ return;
+ rd->args.name = name->name;
+ rd->args.namelen = name->len;
+ rd->args.filetype = name->type;
+ rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
+}
+
+/* Replay a stashed createname into the temporary directory. */
+STATIC int
+xrep_dir_replay_createname(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t inum,
+ xfs_extlen_t total)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *dp = rd->sc->tempip;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ error = xfs_dir_ino_validate(sc->mp, inum);
+ if (error)
+ return error;
+
+ trace_xrep_dir_replay_createname(dp, name, inum);
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.inumber = inum;
+ rd->args.total = total;
+ rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ return xfs_dir_createname_args(&rd->args);
+}
+
+/* Replay a stashed removename onto the temporary directory. */
+STATIC int
+xrep_dir_replay_removename(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_extlen_t total)
+{
+ struct xfs_inode *dp = rd->args.dp;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.op_flags = 0;
+ rd->args.total = total;
+
+ trace_xrep_dir_replay_removename(dp, name, 0);
+ return xfs_dir_removename_args(&rd->args);
+}
+
+/*
+ * Add this stashed incore directory entry to the temporary directory.
+ * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ */
+STATIC int
+xrep_dir_replay_update(
+ struct xrep_dir *rd,
+ const struct xfs_name *xname,
+ const struct xrep_dirent *dirent)
+{
+ struct xfs_mount *mp = rd->sc->mp;
+#ifdef DEBUG
+ xfs_ino_t ino;
+#endif
+ uint resblks;
+ int error;
+
+ resblks = xfs_link_space_res(mp, xname->len);
+ error = xchk_trans_alloc(rd->sc, resblks);
+ if (error)
+ return error;
+
+ /* Lock the temporary directory and join it to the transaction */
+ xrep_tempfile_ilock(rd->sc);
+ xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
+
+ switch (dirent->action) {
+ case XREP_DIRENT_ADD:
+ /*
+ * Create a replacement dirent in the temporary directory.
+ * Note that _createname doesn't check for existing entries.
+ * There shouldn't be any in the temporary dir, but we'll
+ * verify this in debug mode.
+ */
+#ifdef DEBUG
+ error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+ if (error != -ENOENT) {
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+#endif
+
+ error = xrep_dir_replay_createname(rd, xname, dirent->ino,
+ resblks);
+ if (error)
+ goto out_cancel;
+
+ if (xname->type == XFS_DIR3_FT_DIR)
+ rd->subdirs++;
+ rd->dirents++;
+ break;
+ case XREP_DIRENT_REMOVE:
+ /*
+ * Remove a dirent from the temporary directory. Note that
+ * _removename doesn't check the inode target of the existing
+ * entry. There should be a perfect match in the temporary
+ * dir, but we'll verify this in debug mode.
+ */
+#ifdef DEBUG
+ error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+ if (error) {
+ ASSERT(error != 0);
+ goto out_cancel;
+ }
+ if (ino != dirent->ino) {
+ ASSERT(ino == dirent->ino);
+ error = -EIO;
+ goto out_cancel;
+ }
+#endif
+
+ error = xrep_dir_replay_removename(rd, xname, resblks);
+ if (error)
+ goto out_cancel;
+
+ if (xname->type == XFS_DIR3_FT_DIR)
+ rd->subdirs--;
+ rd->dirents--;
+ break;
+ default:
+ ASSERT(0);
+ error = -EIO;
+ goto out_cancel;
+ }
+
+ /* Commit and unlock. */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(rd->sc);
+ return 0;
+out_cancel:
+ xchk_trans_cancel(rd->sc);
+ xrep_tempfile_iunlock(rd->sc);
+ return error;
+}
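
Because the stashed updates are XREP_DIRENT_ADD/REMOVE records applied strictly in order, the function above is replaying an operation log. A toy sketch of in-order log replay (types and data invented for illustration):

#include <stdio.h>

enum action { DIRENT_ADD = 1, DIRENT_REMOVE = 2 };

struct logrec {
	enum action	action;
	const char	*name;
};

/* Replay the log in order; later records supersede earlier ones. */
static long replay(const struct logrec *log, size_t n)
{
	long nentries = 0;

	for (size_t i = 0; i < n; i++) {
		switch (log[i].action) {
		case DIRENT_ADD:
			printf("create %s\n", log[i].name);
			nentries++;
			break;
		case DIRENT_REMOVE:
			printf("remove %s\n", log[i].name);
			nentries--;
			break;
		}
	}
	return nentries;
}

int main(void)
{
	const struct logrec log[] = {
		{ DIRENT_ADD,	 "a" },
		{ DIRENT_ADD,	 "b" },
		{ DIRENT_REMOVE, "a" },
	};

	printf("net entries: %ld\n",
	       replay(log, sizeof(log) / sizeof(log[0])));
	return 0;
}
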
+
+/*
+ * Flush stashed incore dirent updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the directory rebuild,
+ * since directories can contain up to 32GB of directory data.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir
+ * IOLOCK.
+ */
+STATIC int
+xrep_dir_replay_updates(
+ struct xrep_dir *rd)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /* Add all the salvaged dirents to the temporary directory. */
+ mutex_lock(&rd->pscan.lock);
+ foreach_xfarray_idx(rd->dir_entries, array_cur) {
+ struct xrep_dirent dirent;
+
+ error = xfarray_load(rd->dir_entries, array_cur, &dirent);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
+ &rd->xname, dirent.namelen);
+ if (error)
+ goto out_unlock;
+ rd->xname.type = dirent.ftype;
+ mutex_unlock(&rd->pscan.lock);
+
+ error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
+ if (error)
+ return error;
+ mutex_lock(&rd->pscan.lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rd->dir_entries);
+ xfblob_truncate(rd->dir_names);
+ mutex_unlock(&rd->pscan.lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rd->pscan.lock);
+ return error;
+}
+
+/*
+ * Periodically flush stashed directory entries to the temporary dir. This
+ * is done to reduce the memory requirements of the directory rebuild, since
+ * directories can contain up to 32GB of directory data.
+ */
+STATIC int
+xrep_dir_flush_stashed(
+ struct xrep_dir *rd)
+{
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and a scrub transaction
+ * that we use during dirent salvaging to avoid livelocking if there
+ * are cycles in the directory structures. We hold ILOCK_EXCL on both
+ * the inode being repaired and the temporary file, though they are
+ * not ijoined to the scrub transaction.
+ *
+ * To constrain kernel memory use, we occasionally write salvaged
+ * dirents from the xfarray and xfblob structures into the temporary
+ * directory in preparation for exchanging the directory structures at
+ * the end. Updating the temporary file requires a transaction, so we
+ * commit the scrub transaction and drop the two ILOCKs so that
+ * we can allocate whatever transaction we want.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from accessing the damaged directory data while we
+ * repair it.
+ */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+ xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify dirents. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /* Write to the tempdir all the updates that we've stashed. */
+ error = xrep_dir_replay_updates(rd);
+ xrep_tempfile_iounlock(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Recreate the salvage transaction and relock the dir we're salvaging.
+ */
+ error = xchk_trans_alloc(rd->sc, 0);
+ if (error)
+ return error;
+ xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much dirent data in memory. */
+static inline bool
+xrep_dir_want_flush_stashed(
+ struct xrep_dir *rd)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
+ return bytes > XREP_DIR_MAX_STASH_BYTES;
+}
+
+/* Extract as many directory entries as we can. */
+STATIC int
+xrep_dir_recover(
+ struct xrep_dir *rd)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
+ xfs_fileoff_t offset;
+ xfs_dablk_t dabno;
+ __be32 magic_guess;
+ int nmap;
+ int error;
+
+ xrep_dir_guess_format(rd, &magic_guess);
+
+ /* Iterate each directory data block in the data fork. */
+ for (offset = 0;
+ offset < geo->leafblk;
+ offset = got.br_startoff + got.br_blockcount) {
+ nmap = 1;
+ error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
+ &got, &nmap, 0);
+ if (error)
+ return error;
+ if (nmap != 1)
+ return -EFSCORRUPTED;
+ if (!xfs_bmap_is_written_extent(&got))
+ continue;
+
+ for (dabno = round_up(got.br_startoff, geo->fsbcount);
+ dabno < got.br_startoff + got.br_blockcount;
+ dabno += geo->fsbcount) {
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ error = xrep_dir_recover_dirblock(rd,
+ magic_guess, dabno);
+ if (error)
+ return error;
+
+ /* Flush dirents to constrain memory usage. */
+ if (xrep_dir_want_flush_stashed(rd)) {
+ error = xrep_dir_flush_stashed(rd);
+ if (error)
+ return error;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Find all the directory entries for this inode by scraping them out of the
+ * directory leaf blocks by hand, and flushing them into the temp dir.
+ */
+STATIC int
+xrep_dir_find_entries(
+ struct xrep_dir *rd)
+{
+ struct xfs_inode *dp = rd->sc->ip;
+ int error;
+
+ /*
+ * Salvage directory entries from the old directory, and write them to
+ * the temporary directory.
+ */
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ error = xrep_dir_recover_sf(rd);
+ } else {
+ error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ error = xrep_dir_recover(rd);
+ }
+ if (error)
+ return error;
+
+ return xrep_dir_flush_stashed(rd);
+}
+
+/* Scan all files in the filesystem for dirents. */
+STATIC int
+xrep_dir_salvage_entries(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for this
+ * directory's parent. Figure out who is going to be the parent of
+ * this directory, then retake the ILOCK so that we can salvage
+ * directory entries.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ error = xrep_dir_find_parent(rd);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /*
+ * Collect directory entries by parsing raw leaf blocks to salvage
+ * whatever we can. When we're done, free the staging memory before
+ * exchanging the directories to reduce memory usage.
+ */
+ error = xrep_dir_find_entries(rd);
+ if (error)
+ return error;
+
+ /*
+ * Cancel the repair transaction and drop the ILOCK so that we can
+ * (later) use the atomic mapping exchange functions to compute the
+ * correct block reservations and re-lock the inodes.
+ *
+ * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
+ * modifications, but there's nothing to prevent userspace from reading
+ * the directory until we're ready for the exchange operation. Reads
+ * will return -EIO without shutting down the fs, so we're ok with
+ * that.
+ *
+ * The VFS can change dotdot on us, but the findparent scan will keep
+ * our incore parent inode up to date. See the note on locking issues
+ * for more details.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Examine a parent pointer of a file. If it leads us back to the directory
+ * that we're rebuilding, create an incore dirent from the parent pointer and
+ * stash it.
+ */
+STATIC int
+xrep_dir_scan_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode),
+ };
+ xfs_ino_t parent_ino;
+ uint32_t parent_gen;
+ struct xrep_dir *rd = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ /*
+ * Ignore parent pointers that point back to a different dir, list the
+ * wrong generation number, or are invalid.
+ */
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, &parent_gen);
+ if (error)
+ return error;
+
+ if (parent_ino != sc->ip->i_ino ||
+ parent_gen != VFS_I(sc->ip)->i_generation)
+ return 0;
+
+ mutex_lock(&rd->pscan.lock);
+ error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
+ mutex_unlock(&rd->pscan.lock);
+ return error;
+}
+
+/*
+ * If this child dirent points to the directory being repaired, remember that
+ * fact so that we can reset the dotdot entry if necessary.
+ */
+STATIC int
+xrep_dir_scan_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_dir *rd = priv;
+
+ /* Dirent doesn't point to this directory. */
+ if (ino != rd->sc->ip->i_ino)
+ return 0;
+
+ /* Ignore garbage inum. */
+ if (!xfs_verify_dir_ino(rd->sc->mp, ino))
+ return 0;
+
+ /* No weird looking names. */
+ if (name->len >= MAXNAMELEN || name->len <= 0)
+ return 0;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
+ dp->i_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
+ return 0;
+}
+
+/*
+ * Decide if we want to look for child dirents or parent pointers in this file.
+ * Skip the dir being repaired and any files being used to stage repairs.
+ */
+static inline bool
+xrep_dir_want_scan(
+ struct xrep_dir *rd,
+ const struct xfs_inode *ip)
+{
+ return ip != rd->sc->ip && !xrep_is_tempfile(ip);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
+ * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_dir_scan_ilock(
+ struct xrep_dir *rd,
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /* Need to take the shared ILOCK to advance the iscan cursor. */
+ if (!xrep_dir_want_scan(rd, ip))
+ goto lock;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/*
+ * Scan this file for relevant child dirents or parent pointers that point to
+ * the directory we're rebuilding.
+ */
+STATIC int
+xrep_dir_scan_file(
+ struct xrep_dir *rd,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ lock_mode = xrep_dir_scan_ilock(rd, ip);
+
+ if (!xrep_dir_want_scan(rd, ip))
+ goto scan_done;
+
+ /*
+ * If the extended attributes look as though they have been zapped by
+ * the inode record repair code, we cannot scan for parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
+ if (error)
+ goto scan_done;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ /*
+ * If the directory looks as though it has been zapped by the
+ * inode record repair code, we cannot scan for child dirents.
+ */
+ if (xchk_dir_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
+ if (error)
+ goto scan_done;
+ }
+
+scan_done:
+ xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/*
+ * Scan all files in the filesystem for parent pointers that we can turn into
+ * replacement dirents, and a dirent that we can use to set the dotdot pointer.
+ */
+STATIC int
+xrep_dir_scan_dirtree(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /* Roots of directory trees are their own parents. */
+ if (sc->ip == sc->mp->m_rootip)
+ xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
+
+ /*
+ * Filesystem scans are time consuming. Drop the directory ILOCK and
+ * all other resources for the duration of the scan and hope for the
+ * best. The live update hooks will keep our scan information up to
+ * date even though we've dropped the locks.
+ */
+ xchk_trans_cancel(sc);
+ if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
+ xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
+ XFS_ILOCK_EXCL));
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
+ bool flush;
+
+ error = xrep_dir_scan_file(rd, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ /* Flush stashed dirent updates to constrain memory usage. */
+ mutex_lock(&rd->pscan.lock);
+ flush = xrep_dir_want_flush_stashed(rd);
+ mutex_unlock(&rd->pscan.lock);
+ if (flush) {
+ xchk_trans_cancel(sc);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ break;
+
+ error = xrep_dir_replay_updates(rd);
+ xrep_tempfile_iounlock(sc);
+ if (error)
+ break;
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rd->pscan.iscan);
+ if (error) {
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Cancel the empty transaction so that we can (later) use the atomic
+ * file mapping exchange functions to lock files and commit the new
+ * directory.
+ */
+ xchk_trans_cancel(rd->sc);
+ return 0;
+}
+
+/*
+ * Capture dirent updates made by other threads that are relevant to the
+ * directory being repaired.
+ */
+STATIC int
+xrep_dir_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_dir *rd;
+ struct xfs_scrub *sc;
+ int error = 0;
+
+ rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
+ sc = rd->sc;
+
+ /*
+ * This thread updated a child dirent in the directory that we're
+ * rebuilding. Stash the update for replay against the temporary
+ * directory.
+ */
+ if (p->dp->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
+ mutex_lock(&rd->pscan.lock);
+ if (p->delta > 0)
+ error = xrep_dir_stash_createname(rd, p->name,
+ p->ip->i_ino);
+ else
+ error = xrep_dir_stash_removename(rd, p->name,
+ p->ip->i_ino);
+ mutex_unlock(&rd->pscan.lock);
+ if (error)
+ goto out_abort;
+ }
+
+ /*
+ * This thread updated another directory's child dirent that points to
+ * the directory that we're rebuilding, so remember the new dotdot
+ * target.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
+ if (p->delta > 0) {
+ trace_xrep_dir_stash_createname(sc->tempip,
+ &xfs_name_dotdot,
+ p->dp->i_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
+ } else {
+ trace_xrep_dir_stash_removename(sc->tempip,
+ &xfs_name_dotdot,
+ rd->pscan.parent_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
+ }
+ }
+
+ return NOTIFY_DONE;
+out_abort:
+ xchk_iscan_abort(&rd->pscan.iscan);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Free all the directory blocks and reset the data fork. The caller must
+ * join the inode to the transaction. This function returns with the inode
+ * joined to a clean scrub transaction.
+ */
+STATIC int
+xrep_dir_reset_fork(
+ struct xrep_dir *rd,
+ xfs_ino_t parent_ino)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
+ int error;
+
+ /* Unmap all the directory buffers. */
+ if (xfs_ifork_has_extents(ifp)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
+
+ /* Reset the data fork to an empty data fork. */
+ xfs_idestroy_fork(ifp);
+ ifp->if_bytes = 0;
+ sc->tempip->i_disk_size = 0;
+
+ /* Reinitialize the short form directory. */
+ xrep_dir_init_args(rd, sc->tempip, NULL);
+ return xfs_dir2_sf_create(&rd->args, parent_ino);
+}
+
+/*
+ * Prepare both inodes' directory forks for exchanging mappings. Promote the
+ * tempfile from short format to leaf format, and if the file being repaired
+ * has a short format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_dir_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the tempfile's directory is in shortform format, convert that to
+ * a single leaf extent so that we can use the atomic mapping exchange.
+ */
+ if (temp_local) {
+ struct xfs_da_args args = {
+ .dp = sc->tempip,
+ .geo = sc->mp->m_dir_geo,
+ .whichfork = XFS_DATA_FORK,
+ .trans = sc->tp,
+ .total = 1,
+ .owner = sc->ip->i_ino,
+ };
+
+ error = xfs_dir2_sf_to_block(&args);
+ if (error)
+ return error;
+
+ /*
+ * Roll the deferred log items to get us back to a clean
+ * transaction.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform data fork, convert that
+ * to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ }
+
+ return 0;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+static int
+xrep_dir_replace(
+ struct xrep_dir *rd,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t inum,
+ xfs_extlen_t total)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ error = xfs_dir_ino_validate(sc->mp, inum);
+ if (error)
+ return error;
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.inumber = inum;
+ rd->args.total = total;
+ return xfs_dir_replace_args(&rd->args);
+}
+
+/*
+ * Reset the link count of this directory and adjust the unlinked list pointers
+ * as needed.
+ */
+STATIC int
+xrep_dir_set_nlink(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *dp = sc->ip;
+ struct xfs_perag *pag;
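+ /*
+ * A directory's expected link count is one link from its parent's
+ * entry, one from its own '.' entry, and one '..' back-reference per
+ * child subdirectory, clamped to XFS_NLINK_PINNED.
+ */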
+ unsigned int new_nlink = min_t(unsigned long long,
+ rd->subdirs + 2,
+ XFS_NLINK_PINNED);
+ int error;
+
+ /*
+ * The directory is not on the incore unlinked list, which means that
+ * it needs to be reachable via the directory tree. Update the nlink
+ * with our observed link count. If the directory has no parent, it
+ * will be moved to the orphanage.
+ */
+ if (!xfs_inode_on_unlinked_list(dp))
+ goto reset_nlink;
+
+ /*
+ * The directory is on the unlinked list and we did not find any
+ * dirents. Set the link count to zero and let the directory
+ * inactivate when the last reference drops.
+ */
+ if (rd->dirents == 0) {
+ rd->needs_adoption = false;
+ new_nlink = 0;
+ goto reset_nlink;
+ }
+
+ /*
+ * The directory is on the unlinked list and we found dirents. This
+ * directory needs to be reachable via the directory tree. Remove the
+ * dir from the unlinked list and update nlink with the observed link
+ * count. If the directory has no parent, it will be moved to the
+ * orphanage.
+ */
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iunlink_remove(sc->tp, pag, dp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+reset_nlink:
+ if (VFS_I(dp)->i_nlink != new_nlink)
+ set_nlink(VFS_I(dp), new_nlink);
+ return 0;
+}
+
+/*
+ * Finish replaying stashed dirent updates, allocate a transaction for
+ * exchanging data fork mappings, and take the ILOCKs of both directories
+ * before we commit the new directory structure.
+ */
+STATIC int
+xrep_dir_finalize_tempdir(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible dirent updates.
+ * Replay all queued dirent updates into the tempdir before exchanging
+ * the contents, even if that means dropping the ILOCKs and the
+ * transaction.
+ */
+ do {
+ error = xrep_dir_replay_updates(rd);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rd->dir_entries) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/* Exchange the temporary directory's data fork with the one being repaired. */
+STATIC int
+xrep_dir_swap(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ bool ip_local, temp_local;
+ int error = 0;
+
+ /*
+ * If we never found the parent for this directory, temporarily assign
+ * the root dir as the parent; we'll move this to the orphanage after
+ * exchanging the dir contents. We hold the ILOCK of the dir being
+ * repaired, so we're not worried about racy updates of dotdot.
+ */
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+ if (rd->pscan.parent_ino == NULLFSINO) {
+ rd->needs_adoption = true;
+ rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
+ }
+
+ /*
+ * Reset the temporary directory's '..' entry to point to the parent
+ * that we found. The temporary directory was created with the root
+ * directory as the parent, so we can skip this if repairing a
+ * subdirectory of the root.
+ *
+ * It's also possible that this replacement could expand a shortform
+ * tempdir into block format.
+ */
+ if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
+ error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
+ rd->pscan.parent_ino, rd->tx.req.resblks);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Changing the dot and dotdot entries could have changed the shape of
+ * the directory, so we recompute these.
+ */
+ ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If both files have a local format data fork and the rebuilt
+ * directory data would fit in the repaired file's data fork, copy
+ * the contents from the tempfile and update the directory link count.
+ * We're done now.
+ */
+ if (ip_local && temp_local &&
+ sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+ xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+ return xrep_dir_set_nlink(rd);
+ }
+
+ /*
+ * Clean the transaction before we start working on exchanging
+ * directory contents.
+ */
+ error = xrep_tempfile_roll_trans(rd->sc);
+ if (error)
+ return error;
+
+ /* Otherwise, make sure both data forks are in block-mapping mode. */
+ error = xrep_dir_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ /*
+ * Set nlink of the directory in the same transaction sequence that
+ * (atomically) commits the new directory data.
+ */
+ error = xrep_dir_set_nlink(rd);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, &rd->tx);
+}
+
+/*
+ * Exchange the new directory contents (which we created in the tempfile) with
+ * the directory being repaired.
+ */
+STATIC int
+xrep_dir_rebuild_tree(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
+
+ /*
+ * Take the IOLOCK on the temporary file so that we can run dir
+ * operations with the same locks held as we would for a normal file.
+ * We still hold sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed dirent updates to the tempdir. After this point,
+ * we're ready to exchange data fork mappings.
+ */
+ error = xrep_dir_finalize_tempdir(rd);
+ if (error)
+ return error;
+
+ if (xchk_iscan_aborted(&rd->pscan.iscan))
+ return -ECANCELED;
+
+ /*
+ * Exchange the tempdir's data fork with the file being repaired. This
+ * recreates the transaction and re-takes the ILOCK in the scrub
+ * context.
+ */
+ error = xrep_dir_swap(rd);
+ if (error)
+ return error;
+
+ /*
+ * Release the old directory blocks and reset the data fork of the temp
+ * directory to an empty shortform directory because inactivation does
+ * nothing for directories.
+ */
+ error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target directory.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+ return 0;
+}
+
+/* Set up the filesystem scan so we can regenerate directory entries. */
+STATIC int
+xrep_dir_setup_scan(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ char *descr;
+ int error;
+
+ /* Set up some staging memory for salvaging dirents. */
+ descr = xchk_xfile_ino_descr(sc, "directory entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
+ &rd->dir_entries);
+ kfree(descr);
+ if (error)
+ return error;
+
+ descr = xchk_xfile_ino_descr(sc, "directory entry names");
+ error = xfblob_create(descr, &rd->dir_names);
+ kfree(descr);
+ if (error)
+ goto out_xfarray;
+
+ if (xfs_has_parent(sc->mp))
+ error = __xrep_findparent_scan_start(sc, &rd->pscan,
+ xrep_dir_live_update);
+ else
+ error = xrep_findparent_scan_start(sc, &rd->pscan);
+ if (error)
+ goto out_xfblob;
+
+ return 0;
+
+out_xfblob:
+ xfblob_destroy(rd->dir_names);
+ rd->dir_names = NULL;
+out_xfarray:
+ xfarray_destroy(rd->dir_entries);
+ rd->dir_entries = NULL;
+ return error;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
+ * orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_dir_move_to_orphanage(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t orig_parent, new_parent;
+ int error;
+
+ /*
+ * We are about to drop the ILOCK on sc->ip to lock the orphanage and
+ * prepare for the adoption. Therefore, look up the old dotdot entry
+ * for sc->ip so that we can compare it after we re-lock sc->ip.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
+ if (error)
+ return error;
+
+ /*
+ * Drop the ILOCK on the scrub target and commit the transaction.
+ * Adoption computes its own resource requirements and gathers the
+ * necessary components.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* If we can take the orphanage's iolock then we're ready to move. */
+ if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ xchk_iunlock(sc, sc->ilock_flags);
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
+ }
+
+ /* Grab transaction and ILOCK the two files. */
+ error = xrep_adoption_trans_alloc(sc, &rd->adoption);
+ if (error)
+ return error;
+
+ error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+ * entry again. If the parent changed or the child was unlinked while
+ * the child directory was unlocked, we don't need to move the child to
+ * the orphanage after all.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
+ if (error)
+ return error;
+
+ /*
+ * Attach to the orphanage if we still have a linked directory and it
+ * hasn't been moved.
+ */
+ if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
+ error = xrep_adoption_move(&rd->adoption);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Launder the scrub transaction so we can drop the orphanage ILOCK
+ * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
+ */
+ error = xrep_adoption_trans_roll(&rd->adoption);
+ if (error)
+ return error;
+
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Repair the directory metadata.
+ *
+ * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer
+ * cache in XFS can't handle aliased multiblock buffers, so this might
+ * misbehave if the directory blocks are crosslinked with other filesystem
+ * metadata.
+ *
+ * XXX: Is it necessary to check the dcache for this directory to make sure
+ * that we always recreate every cached entry?
+ */
+int
+xrep_directory(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd = sc->buf;
+ int error;
+
+ /* The rmapbt is required to reap the old data fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ error = xrep_dir_setup_scan(rd);
+ if (error)
+ return error;
+
+ if (xfs_has_parent(sc->mp))
+ error = xrep_dir_scan_dirtree(rd);
+ else
+ error = xrep_dir_salvage_entries(rd);
+ if (error)
+ goto out_teardown;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_teardown;
+
+ error = xrep_dir_rebuild_tree(rd);
+ if (error)
+ goto out_teardown;
+
+ if (rd->needs_adoption) {
+ if (!xrep_orphanage_can_adopt(rd->sc))
+ error = -EFSCORRUPTED;
+ else
+ error = xrep_dir_move_to_orphanage(rd);
+ if (error)
+ goto out_teardown;
+ }
+
+out_teardown:
+ xrep_dir_teardown(sc);
+ return error;
+}
diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c
new file mode 100644
index 000000000000..bde58fb561ea
--- /dev/null
+++ b/fs/xfs/scrub/dirtree.c
@@ -0,0 +1,985 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+
+/*
+ * Directory Tree Structure Validation
+ * ===================================
+ *
+ * Validating the tree qualities of the directory tree structure can be
+ * difficult. If the tree is frozen, running a depth (or breadth) first search
+ * and marking a bitmap suffices to determine if there is a cycle. XORing the
+ * mark bitmap with the inode bitmap afterwards tells us if there are
+ * disconnected cycles. If the tree is not frozen, directory updates can move
+ * subtrees across the scanner wavefront, which complicates the design greatly.
+ *
+ * Directory parent pointers change that by enabling an incremental approach to
+ * validation of the tree structure. Instead of using one thread to scan the
+ * entire filesystem, we instead can have multiple threads walking individual
+ * subdirectories upwards to the root. In a perfect world, the IOLOCK would
+ * suffice to stabilize two directories in a parent -> child relationship.
+ * Unfortunately, the VFS does not take the IOLOCK when moving a child
+ * subdirectory, so we instead synchronize on ILOCK and use dirent update hooks
+ * to detect a race. If a race occurs in a path, we restart the scan.
+ *
+ * If the walk terminates without reaching the root, we know the path is
+ * disconnected and ought to be attached to the lost and found. If on the walk
+ * we find the same subdir that we're scanning, we know this is a cycle and
+ * should delete an incoming edge. If we find multiple paths to the root, we
+ * know to delete an incoming edge.
+ *
+ * There are two big hitches with this approach: first, all file link counts
+ * must be correct to prevent other writers from doing the wrong thing with the
+ * directory tree structure. Second, because we're walking upwards in a tree
+ * of arbitrary depth, we cannot hold all the ILOCKs. Instead, we will use a
+ * directory update hook to invalidate the scan results if one of the paths
+ * we've scanned has changed.
+ */
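+
+/*
+ * Example (illustrative, on-disk details simplified): each parent pointer
+ * seen here is an XFS_ATTR_PARENT xattr whose name is the dirent name in
+ * the parent and whose value is a struct xfs_parent_rec holding the
+ * parent's inumber and generation. To check /a/b/c, the scanner reads c's
+ * parent pointer ("c" -> b), ilocks b, reads b's sole parent pointer
+ * ("b" -> a), and repeats until it reaches the root (path ok), revisits
+ * an inode already on the path (cycle), or finds no parent (disconnected).
+ */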
+
+/* Clean up the dirtree checking resources. */
+STATIC void
+xchk_dirtree_buf_cleanup(
+ void *buf)
+{
+ struct xchk_dirtree *dl = buf;
+ struct xchk_dirpath *path, *n;
+
+ if (dl->scan_ino != NULLFSINO)
+ xfs_dir_hook_del(dl->sc->mp, &dl->dhook);
+
+ xchk_dirtree_for_each_path_safe(dl, path, n) {
+ list_del_init(&path->list);
+ xino_bitmap_destroy(&path->seen_inodes);
+ kfree(path);
+ }
+
+ xfblob_destroy(dl->path_names);
+ xfarray_destroy(dl->path_steps);
+ mutex_destroy(&dl->lock);
+}
+
+/* Set us up to look for directory loops. */
+int
+xchk_setup_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree *dl;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_dirtree(sc);
+ if (error)
+ return error;
+ }
+
+ dl = kvzalloc(sizeof(struct xchk_dirtree), XCHK_GFP_FLAGS);
+ if (!dl)
+ return -ENOMEM;
+ dl->sc = sc;
+ dl->xname.name = dl->namebuf;
+ dl->hook_xname.name = dl->hook_namebuf;
+ INIT_LIST_HEAD(&dl->path_list);
+ dl->root_ino = NULLFSINO;
+ dl->scan_ino = NULLFSINO;
+ dl->parent_ino = NULLFSINO;
+
+ mutex_init(&dl->lock);
+
+ descr = xchk_xfile_ino_descr(sc, "dirtree path steps");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_dirpath_step),
+ &dl->path_steps);
+ kfree(descr);
+ if (error)
+ goto out_dl;
+
+ descr = xchk_xfile_ino_descr(sc, "dirtree path names");
+ error = xfblob_create(descr, &dl->path_names);
+ kfree(descr);
+ if (error)
+ goto out_steps;
+
+ error = xchk_setup_inode_contents(sc, 0);
+ if (error)
+ goto out_names;
+
+ sc->buf = dl;
+ sc->buf_cleanup = xchk_dirtree_buf_cleanup;
+ return 0;
+
+out_names:
+ xfblob_destroy(dl->path_names);
+out_steps:
+ xfarray_destroy(dl->path_steps);
+out_dl:
+ mutex_destroy(&dl->lock);
+ kvfree(dl);
+ return error;
+}
+
+/*
+ * Add the parent pointer described by @name/@pptr to the given path as a new
+ * step. Returns -ELNRNG if the path is too deep.
+ */
+int
+xchk_dirpath_append(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *ip,
+ struct xchk_dirpath *path,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr)
+{
+ struct xchk_dirpath_step step = {
+ .pptr_rec = *pptr, /* struct copy */
+ .name_len = name->len,
+ };
+ int error;
+
+ /*
+ * If this path is more than 2 billion steps long, this directory tree
+ * is too far gone to fix.
+ */
+ if (path->nr_steps >= XFS_MAXLINK)
+ return -ELNRNG;
+
+ error = xfblob_storename(dl->path_names, &step.name_cookie, name);
+ if (error)
+ return error;
+
+ error = xino_bitmap_set(&path->seen_inodes, ip->i_ino);
+ if (error)
+ return error;
+
+ error = xfarray_append(dl->path_steps, &step);
+ if (error)
+ return error;
+
+ path->nr_steps++;
+ return 0;
+}
+
+/*
+ * Create an xchk_path for each parent pointer of the directory that we're
+ * scanning. For each path created, we will eventually try to walk towards the
+ * root with the goal of deleting all parents except for one that leads to the
+ * root.
+ *
+ * Returns -EFSCORRUPTED to signal that the inode being scanned has a corrupt
+ * parent pointer and hence there's no point in continuing; or -ENOSR if there
+ * are too many parent pointers for this directory.
+ */
+STATIC int
+xchk_dirtree_create_path(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_dirtree *dl = priv;
+ struct xchk_dirpath *path;
+ const struct xfs_parent_rec *rec = value;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ /*
+ * If there are more than 2 billion actual parent pointers for this
+ * subdirectory, this fs is too far gone to fix.
+ */
+ if (dl->nr_paths >= XFS_MAXLINK)
+ return -ENOSR;
+
+ trace_xchk_dirtree_create_path(sc, ip, dl->nr_paths, &xname, rec);
+
+ /*
+ * Create a new xchk_path structure to remember this parent pointer
+ * and record the first name step.
+ */
+ path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+ if (!path)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&path->list);
+ xino_bitmap_init(&path->seen_inodes);
+ path->nr_steps = 0;
+ path->outcome = XCHK_DIRPATH_SCANNING;
+
+ error = xchk_dirpath_append(dl, sc->ip, path, &xname, rec);
+ if (error)
+ goto out_path;
+
+ path->first_step = xfarray_length(dl->path_steps) - 1;
+ path->second_step = XFARRAY_NULLIDX;
+ path->path_nr = dl->nr_paths;
+
+ list_add_tail(&path->list, &dl->path_list);
+ dl->nr_paths++;
+ return 0;
+out_path:
+ kfree(path);
+ return error;
+}
+
+/*
+ * Validate that the first step of this path still has a corresponding
+ * parent pointer in @sc->ip. We probably dropped @sc->ip's ILOCK while
+ * walking towards the roots, which is why this is necessary.
+ *
+ * This function has a side effect of loading the first parent pointer of this
+ * path into the parent pointer scratch pad. This prepares us to walk up the
+ * directory tree towards the root. Returns -ESTALE if the scan data is now
+ * out of date.
+ */
+STATIC int
+xchk_dirpath_revalidate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /*
+ * Look up the parent pointer that corresponds to the start of this
+ * path. If the parent pointer has disappeared on us, dump all the
+ * scan results and try again.
+ */
+ error = xfs_parent_lookup(sc->tp, sc->ip, &dl->xname, &dl->pptr_rec,
+ &dl->pptr_args);
+ if (error == -ENOATTR) {
+ trace_xchk_dirpath_disappeared(dl->sc, sc->ip, path->path_nr,
+ path->first_step, &dl->xname, &dl->pptr_rec);
+ dl->stale = true;
+ return -ESTALE;
+ }
+
+ return error;
+}
+
+/*
+ * Walk the parent pointers of a directory at the end of a path and record
+ * the parent that we find in @dl->xname/pptr_rec.
+ */
+STATIC int
+xchk_dirpath_find_next_step(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_dirtree *dl = priv;
+ const struct xfs_parent_rec *rec = value;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ /*
+ * If we've already set @dl->pptr_rec, then this directory has multiple
+ * parents. Signal this back to the caller via -EMLINK.
+ */
+ if (dl->parents_found > 0)
+ return -EMLINK;
+
+ dl->parents_found++;
+ memcpy(dl->namebuf, name, namelen);
+ dl->xname.len = namelen;
+ dl->pptr_rec = *rec; /* struct copy */
+ return 0;
+}
+
+/* Set and log the outcome of a path walk. */
+static inline void
+xchk_dirpath_set_outcome(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ enum xchk_dirpath_outcome outcome)
+{
+ trace_xchk_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+ outcome);
+
+ path->outcome = outcome;
+}
+
+/*
+ * Scan the directory at the end of this path for its parent directory link.
+ * If we find one, extend the path. Returns -ESTALE if the scan data are out
+ * of date; -EFSCORRUPTED if the parent pointer is bad; or -ELNRNG if
+ * the path got too deep.
+ */
+STATIC int
+xchk_dirpath_step_up(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_inode *dp;
+ xfs_ino_t parent_ino = be64_to_cpu(dl->pptr_rec.p_ino);
+ unsigned int lock_mode;
+ int error;
+
+ /* Grab and lock the parent directory. */
+ error = xchk_iget(sc, parent_ino, &dp);
+ if (error)
+ return error;
+
+ lock_mode = xfs_ilock_attr_map_shared(dp);
+ mutex_lock(&dl->lock);
+
+ if (dl->stale) {
+ error = -ESTALE;
+ goto out_scanlock;
+ }
+
+ /* We've reached the root directory; the path is ok. */
+ if (parent_ino == dl->root_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_OK);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /*
+ * The inode being scanned is its own distant ancestor! Get rid of
+ * this path.
+ */
+ if (parent_ino == sc->ip->i_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /*
+ * We've seen this inode before during the path walk. There's a loop
+ * above us in the directory tree. This probably means that we cannot
+ * continue, but let's keep walking paths to get a full picture.
+ */
+ if (xino_bitmap_test(&path->seen_inodes, parent_ino)) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_LOOP);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /* The handle encoded in the parent pointer must match. */
+ if (VFS_I(dp)->i_generation != be32_to_cpu(dl->pptr_rec.p_gen)) {
+ trace_xchk_dirpath_badgen(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /* Parent pointer must point up to a directory. */
+ if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+ trace_xchk_dirpath_nondir_parent(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /* Parent cannot be an unlinked directory. */
+ if (VFS_I(dp)->i_nlink == 0) {
+ trace_xchk_dirpath_unlinked_parent(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /*
+ * If the extended attributes look as though they have been zapped by
+ * the inode record repair code, we cannot scan for parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(dp)) {
+ error = -EBUSY;
+ xchk_set_incomplete(sc);
+ goto out_scanlock;
+ }
+
+ /*
+ * Walk the parent pointers of @dp to find the parent of this directory
+ * to find the next step in our walk. If we find that @dp has exactly
+ * one parent, the parent pointer information will be stored in
+ * @dl->pptr_rec. This prepares us for the next step of the walk.
+ */
+ mutex_unlock(&dl->lock);
+ dl->parents_found = 0;
+ error = xchk_xattr_walk(sc, dp, xchk_dirpath_find_next_step, NULL, dl);
+ mutex_lock(&dl->lock);
+ if (error == -EFSCORRUPTED || error == -EMLINK ||
+ (!error && dl->parents_found == 0)) {
+ /*
+ * Further up the directory tree from @sc->ip, we found a
+ * corrupt parent pointer, multiple parent pointers while
+ * finding this directory's parent, or zero parents despite
+ * having a nonzero link count. Keep looking for other paths.
+ */
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error)
+ goto out_scanlock;
+
+ if (dl->stale) {
+ error = -ESTALE;
+ goto out_scanlock;
+ }
+
+ trace_xchk_dirpath_found_next_step(sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+
+ /* Append to the path steps */
+ error = xchk_dirpath_append(dl, dp, path, &dl->xname, &dl->pptr_rec);
+ if (error)
+ goto out_scanlock;
+
+ if (path->second_step == XFARRAY_NULLIDX)
+ path->second_step = xfarray_length(dl->path_steps) - 1;
+
+out_scanlock:
+ mutex_unlock(&dl->lock);
+ xfs_iunlock(dp, lock_mode);
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/*
+ * Walk the directory tree upwards towards what is hopefully the root
+ * directory, recording path steps as we go. The current path components are
+ * stored in dl->pptr_rec and dl->xname.
+ *
+ * Returns -ESTALE if the scan data are out of date. Returns -EFSCORRUPTED
+ * only if the direct parent pointer of @sc->ip associated with this path is
+ * corrupt.
+ */
+STATIC int
+xchk_dirpath_walk_upwards(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ /* Reload the start of this path and make sure it's still there. */
+ error = xchk_dirpath_revalidate(dl, path);
+ if (error)
+ return error;
+
+ trace_xchk_dirpath_walk_upwards(sc, sc->ip, path->path_nr, &dl->xname,
+ &dl->pptr_rec);
+
+ /*
+ * The inode being scanned is its own direct ancestor!
+ * Get rid of this path.
+ */
+ if (be64_to_cpu(dl->pptr_rec.p_ino) == sc->ip->i_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ return 0;
+ }
+
+ /*
+ * Drop ILOCK_EXCL on the inode being scanned. We still hold
+ * IOLOCK_EXCL on it, so it cannot move around or be renamed.
+ *
+ * Beyond this point we're walking up the directory tree, which means
+ * that we can acquire and drop the ILOCK on an alias of sc->ip. The
+ * ILOCK state is no longer tracked in the scrub context. Hence we
+ * must drop @sc->ip's ILOCK during the walk.
+ */
+ mutex_unlock(&dl->lock);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the first step in the walk towards the root by checking the
+ * start of this path, which is a direct parent pointer of @sc->ip.
+ * If we see any kind of error here (including corruptions), the parent
+ * pointer of @sc->ip is corrupt. Stop the whole scan.
+ */
+ error = xchk_dirpath_step_up(dl, path);
+ if (error) {
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+ return error;
+ }
+
+ /*
+ * Take steps upward from the second step in this path towards the
+ * root. If we hit corruption errors here, there's a problem
+ * *somewhere* in the path, but we don't need to stop scanning.
+ */
+ while (!error && path->outcome == XCHK_DIRPATH_SCANNING)
+ error = xchk_dirpath_step_up(dl, path);
+
+ /* Retake the locks we had, mark paths, etc. */
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+ if (error == -EFSCORRUPTED) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+ error = 0;
+ }
+ if (!error && dl->stale)
+ return -ESTALE;
+ return error;
+}
+
+/*
+ * Decide if this path step has been touched by this live update. Returns
+ * 1 for yes, 0 for no, or a negative errno.
+ */
+STATIC int
+xchk_dirpath_step_is_stale(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ unsigned int step_nr,
+ xfarray_idx_t step_idx,
+ struct xfs_dir_update_params *p,
+ xfs_ino_t *cursor)
+{
+ struct xchk_dirpath_step step;
+ xfs_ino_t child_ino = *cursor;
+ int error;
+
+ error = xfarray_load(dl->path_steps, step_idx, &step);
+ if (error)
+ return error;
+ *cursor = be64_to_cpu(step.pptr_rec.p_ino);
+
+ /*
+ * If the parent and child being updated are not the ones mentioned in
+ * this path step, the scan data is still ok.
+ */
+ if (p->ip->i_ino != child_ino || p->dp->i_ino != *cursor)
+ return 0;
+
+ /*
+ * If the dirent name lengths or byte sequences are different, the scan
+ * data is still ok.
+ */
+ if (p->name->len != step.name_len)
+ return 0;
+
+ error = xfblob_loadname(dl->path_names, step.name_cookie,
+ &dl->hook_xname, step.name_len);
+ if (error)
+ return error;
+
+ if (memcmp(dl->hook_xname.name, p->name->name, p->name->len) != 0)
+ return 0;
+
+ /*
+ * If the update comes from the repair code itself, walk the state
+ * machine forward.
+ */
+ if (p->ip->i_ino == dl->scan_ino &&
+ path->outcome == XREP_DIRPATH_ADOPTING) {
+ xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_ADOPTED);
+ return 0;
+ }
+
+ if (p->ip->i_ino == dl->scan_ino &&
+ path->outcome == XREP_DIRPATH_DELETING) {
+ xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETED);
+ return 0;
+ }
+
+ /* Exact match, scan data is out of date. */
+ trace_xchk_dirpath_changed(dl->sc, path->path_nr, step_nr, p->dp,
+ p->ip, p->name);
+ return 1;
+}
+
+/*
+ * Decide if this path has been touched by this live update. Returns 1 for
+ * yes, 0 for no, or a negative errno.
+ */
+STATIC int
+xchk_dirpath_is_stale(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ struct xfs_dir_update_params *p)
+{
+ xfs_ino_t cursor = dl->scan_ino;
+ xfarray_idx_t idx = path->first_step;
+ unsigned int i;
+ int ret;
+
+ /*
+ * The child being updated has not been seen by this path at all; this
+ * path cannot be stale.
+ */
+ if (!xino_bitmap_test(&path->seen_inodes, p->ip->i_ino))
+ return 0;
+
+ ret = xchk_dirpath_step_is_stale(dl, path, 0, idx, p, &cursor);
+ if (ret != 0)
+ return ret;
+
+ for (i = 1, idx = path->second_step; i < path->nr_steps; i++, idx++) {
+ ret = xchk_dirpath_step_is_stale(dl, path, i, idx, p, &cursor);
+ if (ret != 0)
+ return ret;
+ }
+
+ return 0;
+}
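+
+/*
+ * Example (illustrative): if a recorded path is scan_ino -> B -> A -> root
+ * and another thread renames scan_ino out of B, the update's (parent B,
+ * child scan_ino, name) triple matches the path's first step, so the whole
+ * path is marked stale and the scan restarts.
+ */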
+
+/*
+ * Decide if a directory update from the regular filesystem touches any of the
+ * paths we've scanned, and invalidate the scan data if true.
+ */
+STATIC int
+xchk_dirtree_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xchk_dirtree *dl;
+ struct xchk_dirpath *path;
+ int ret;
+
+ dl = container_of(nb, struct xchk_dirtree, dhook.dirent_hook.nb);
+
+ trace_xchk_dirtree_live_update(dl->sc, p->dp, action, p->ip, p->delta,
+ p->name);
+
+ mutex_lock(&dl->lock);
+
+ if (dl->stale || dl->aborted)
+ goto out_unlock;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ ret = xchk_dirpath_is_stale(dl, path, p);
+ if (ret < 0) {
+ dl->aborted = true;
+ break;
+ }
+ if (ret == 1) {
+ dl->stale = true;
+ break;
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&dl->lock);
+ return NOTIFY_DONE;
+}
+
+/* Delete all the collected path information. */
+STATIC void
+xchk_dirtree_reset(
+ void *buf)
+{
+ struct xchk_dirtree *dl = buf;
+ struct xchk_dirpath *path, *n;
+
+ ASSERT(dl->sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ xchk_dirtree_for_each_path_safe(dl, path, n) {
+ list_del_init(&path->list);
+ xino_bitmap_destroy(&path->seen_inodes);
+ kfree(path);
+ }
+ dl->nr_paths = 0;
+
+ xfarray_truncate(dl->path_steps);
+ xfblob_truncate(dl->path_names);
+
+ dl->stale = false;
+}
+
+/*
+ * Load the name/pptr from the first step in this path into @dl->pptr_rec and
+ * @dl->xname.
+ */
+STATIC int
+xchk_dirtree_load_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, &step);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(dl->path_names, step.name_cookie, &dl->xname,
+ step.name_len);
+ if (error)
+ return error;
+
+ dl->pptr_rec = step.pptr_rec; /* struct copy */
+ return 0;
+}
+
+/*
+ * For each parent pointer of this subdir, trace a path upwards towards the
+ * root directory and record what we find. Returns 0 for success;
+ * -EFSCORRUPTED if walking the parent pointers of @sc->ip failed; -ELNRNG if
+ * a path was too deep; -ENOSR if there were too many parent pointers; or some
+ * other negative errno.
+ */
+int
+xchk_dirtree_find_paths_to_root(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xchk_dirpath *path;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xchk_dirtree_reset(dl);
+
+ /*
+ * If the extended attributes look as though they have been
+ * zapped by the inode record repair code, we cannot scan for
+ * parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(sc->ip)) {
+ xchk_set_incomplete(sc);
+ return -EBUSY;
+ }
+
+ /*
+ * Create path walk contexts for each parent of the directory
+ * that is being scanned. Directories are supposed to have
+ * only one parent, but this is how we detect multiple parents.
+ */
+ error = xchk_xattr_walk(sc, sc->ip, xchk_dirtree_create_path,
+ NULL, dl);
+ if (error)
+ return error;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ /* Load path components into dl->pptr/xname */
+ error = xchk_dirtree_load_path(dl, path);
+ if (error)
+ return error;
+
+ /*
+ * Try to walk up each path to the root. This enables
+ * us to find directory loops in ancestors, and the
+ * like.
+ */
+ error = xchk_dirpath_walk_upwards(dl, path);
+ if (error == -EFSCORRUPTED) {
+ /*
+ * A parent pointer of @sc->ip is bad, don't
+ * bother continuing.
+ */
+ break;
+ }
+ if (error == -ESTALE) {
+ /* This had better be an invalidation. */
+ ASSERT(dl->stale);
+ break;
+ }
+ if (error)
+ return error;
+ if (dl->aborted)
+ return 0;
+ }
+ } while (dl->stale);
+
+ return error;
+}
+
+/*
+ * Figure out what to do with the paths we tried to find. Do not call this
+ * if the scan results are stale.
+ */
+void
+xchk_dirtree_evaluate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+
+ ASSERT(!dl->stale);
+
+ /* Scan the paths we have to decide what to do. */
+ memset(oc, 0, sizeof(struct xchk_dirtree_outcomes));
+ xchk_dirtree_for_each_path(dl, path) {
+ trace_xchk_dirpath_evaluate_path(dl->sc, path->path_nr,
+ path->nr_steps, path->outcome);
+
+ switch (path->outcome) {
+ case XCHK_DIRPATH_SCANNING:
+ /* shouldn't get here */
+ ASSERT(0);
+ break;
+ case XCHK_DIRPATH_DELETE:
+ /* This one is already going away. */
+ oc->bad++;
+ break;
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ /* Couldn't find the end of this path. */
+ oc->suspect++;
+ break;
+ case XCHK_DIRPATH_STALE:
+ /* shouldn't get here either */
+ ASSERT(0);
+ break;
+ case XCHK_DIRPATH_OK:
+ /* This path got all the way to the root. */
+ oc->good++;
+ break;
+ case XREP_DIRPATH_DELETING:
+ case XREP_DIRPATH_DELETED:
+ case XREP_DIRPATH_ADOPTING:
+ case XREP_DIRPATH_ADOPTED:
+ /* These should not be in progress! */
+ ASSERT(0);
+ break;
+ }
+ }
+
+ trace_xchk_dirtree_evaluate(dl, oc);
+}
+
+/* Look for directory loops. */
+int
+xchk_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree_outcomes oc;
+ struct xchk_dirtree *dl = sc->buf;
+ int error;
+
+ /*
+ * Nondirectories do not point downwards to other files, so they cannot
+ * cause a cycle in the directory tree.
+ */
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ ASSERT(xfs_has_parent(sc->mp));
+
+ /*
+ * Find the root of the directory tree. Remember which directory to
+ * scan, because the hook doesn't detach until after sc->ip gets
+ * released during teardown.
+ */
+ dl->root_ino = sc->mp->m_rootip->i_ino;
+ dl->scan_ino = sc->ip->i_ino;
+
+ trace_xchk_dirtree_start(sc->ip, sc->sm, 0);
+
+ /*
+ * Hook into the directory entry code so that we can capture updates to
+ * paths that we have already scanned. The scanner thread takes each
+ * directory's ILOCK, which means that any in-progress directory update
+ * will finish before we can scan the directory.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ xfs_dir_hook_setup(&dl->dhook, xchk_dirtree_live_update);
+ error = xfs_dir_hook_add(sc->mp, &dl->dhook);
+ if (error)
+ goto out;
+
+ mutex_lock(&dl->lock);
+
+ /* Trace each parent pointer's path to the root. */
+ error = xchk_dirtree_find_paths_to_root(dl);
+ if (error == -EFSCORRUPTED || error == -ELNRNG || error == -ENOSR) {
+ /*
+ * Don't bother walking the paths if the xattr structure or the
+ * parent pointers are corrupt; this scan cannot be completed
+ * without full information.
+ */
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error == -EBUSY) {
+ /*
+ * We couldn't scan some directory's parent pointers because
+ * the attr fork looked like it had been zapped. The
+ * scan was marked incomplete, so no further error code
+ * is necessary.
+ */
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error)
+ goto out_scanlock;
+ if (dl->aborted) {
+ xchk_set_incomplete(sc);
+ goto out_scanlock;
+ }
+
+ /* Assess what we found in our path evaluation. */
+ xchk_dirtree_evaluate(dl, &oc);
+ if (xchk_dirtree_parentless(dl)) {
+ if (oc.good || oc.bad || oc.suspect)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (oc.bad || oc.good + oc.suspect != 1)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ if (oc.suspect)
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+out_scanlock:
+ mutex_unlock(&dl->lock);
+out:
+ trace_xchk_dirtree_done(sc->ip, sc->sm, error);
+ return error;
+}
diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h
new file mode 100644
index 000000000000..1e1686365c61
--- /dev/null
+++ b/fs/xfs/scrub/dirtree.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_DIRTREE_H__
+#define __XFS_SCRUB_DIRTREE_H__
+
+/*
+ * Each of these represents one parent pointer path step in a chain going
+ * up towards the directory tree root. These are stored inside an xfarray.
+ */
+struct xchk_dirpath_step {
+ /* Directory entry name associated with this parent link. */
+ xfblob_cookie name_cookie;
+ unsigned int name_len;
+
+ /* Handle of the parent directory. */
+ struct xfs_parent_rec pptr_rec;
+};
+
+enum xchk_dirpath_outcome {
+ XCHK_DIRPATH_SCANNING = 0, /* still being put together */
+ XCHK_DIRPATH_DELETE, /* delete this path */
+ XCHK_DIRPATH_CORRUPT, /* corruption detected in path */
+ XCHK_DIRPATH_LOOP, /* cycle detected further up */
+ XCHK_DIRPATH_STALE, /* path is stale */
+ XCHK_DIRPATH_OK, /* path reaches the root */
+
+ XREP_DIRPATH_DELETING, /* path is being deleted */
+ XREP_DIRPATH_DELETED, /* path has been deleted */
+ XREP_DIRPATH_ADOPTING, /* path is being adopted */
+ XREP_DIRPATH_ADOPTED, /* path has been adopted */
+};
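+
+/*
+ * Typical outcome flow (illustrative): a path starts in _SCANNING; the
+ * upward walk resolves it to _OK, _DELETE, _CORRUPT, or _LOOP; if repair
+ * then deletes or reattaches paths, the live update hook advances doomed
+ * paths through _DELETING -> _DELETED or _ADOPTING -> _ADOPTED as it
+ * observes the repair's own directory updates.
+ */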
+
+/*
+ * Each of these represents one parent pointer path out of the directory being
+ * scanned. These exist in-core, and hopefully there aren't more than a
+ * handful of them.
+ */
+struct xchk_dirpath {
+ struct list_head list;
+
+ /* Index of the first step in this path. */
+ xfarray_idx_t first_step;
+
+ /* Index of the second step in this path. */
+ xfarray_idx_t second_step;
+
+ /* Inodes seen while walking this path. */
+ struct xino_bitmap seen_inodes;
+
+ /* Number of steps in this path. */
+ unsigned int nr_steps;
+
+ /* Which path is this? */
+ unsigned int path_nr;
+
+ /* What did we conclude from following this path? */
+ enum xchk_dirpath_outcome outcome;
+};
+
+struct xchk_dirtree_outcomes {
+ /* Number of XCHK_DIRPATH_DELETE */
+ unsigned int bad;
+
+ /* Number of XCHK_DIRPATH_CORRUPT or XCHK_DIRPATH_LOOP */
+ unsigned int suspect;
+
+ /* Number of XCHK_DIRPATH_OK */
+ unsigned int good;
+
+ /* Directory needs to be added to lost+found */
+ bool needs_adoption;
+};
+
+struct xchk_dirtree {
+ struct xfs_scrub *sc;
+
+ /* Root inode that we're looking for. */
+ xfs_ino_t root_ino;
+
+ /*
+ * This is the inode that we're scanning. The live update hook can
+ * continue to be called after xchk_teardown drops sc->ip but before
+ * it calls buf_cleanup, so we keep a copy.
+ */
+ xfs_ino_t scan_ino;
+
+ /*
+ * If we start deleting redundant paths to this subdirectory, this is
+ * the inode number of the surviving parent and the dotdot entry will
+ * be set to this value. If the value is NULLFSINO, then use @root_ino
+ * as a stand-in until the orphanage can adopt the subdirectory.
+ */
+ xfs_ino_t parent_ino;
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_parent_rec pptr_rec;
+ struct xfs_da_args pptr_args;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+
+ /* Information for reparenting this directory. */
+ struct xrep_adoption adoption;
+
+ /*
+ * Hook into directory updates so that we can receive live updates
+ * from other writer threads.
+ */
+ struct xfs_dir_hook dhook;
+
+ /* Parent pointer update arguments. */
+ struct xfs_parent_args ppargs;
+
+ /* lock for everything below here */
+ struct mutex lock;
+
+ /* buffer for the live update functions to use for dirent names */
+ struct xfs_name hook_xname;
+ unsigned char hook_namebuf[MAXNAMELEN];
+
+ /*
+ * All path steps observed during this scan. The steps for a
+ * particular pathwalk are recorded in sequential order in the
+ * xfarray. A pathwalk ends either with a step pointing to the root
+ * directory (success) or pointing to NULLFSINO (loop detected, empty
+ * dir detected, etc.).
+ */
+ struct xfarray *path_steps;
+
+ /* All names observed during this scan. */
+ struct xfblob *path_names;
+
+ /* All paths being tracked by this scanner. */
+ struct list_head path_list;
+
+ /* Number of paths in path_list. */
+ unsigned int nr_paths;
+
+ /* Number of parents found by a pptr scan. */
+ unsigned int parents_found;
+
+ /* Have the path data been invalidated by a concurrent update? */
+ bool stale:1;
+
+ /* Has the scan been aborted? */
+ bool aborted:1;
+};
+
+#define xchk_dirtree_for_each_path_safe(dl, path, n) \
+ list_for_each_entry_safe((path), (n), &(dl)->path_list, list)
+
+#define xchk_dirtree_for_each_path(dl, path) \
+ list_for_each_entry((path), &(dl)->path_list, list)
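+
+/*
+ * Usage sketch: xchk_dirtree_for_each_path(dl, path) walks every collected
+ * path; the _safe variant must be used when the loop body removes paths
+ * from dl->path_list.
+ */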
+
+static inline bool
+xchk_dirtree_parentless(const struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+
+ if (sc->ip == sc->mp->m_rootip)
+ return true;
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return true;
+ return false;
+}
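+
+/*
+ * Example (illustrative): the root directory and unlinked directories are
+ * parentless, so the scanner expects to find no paths for them at all;
+ * every other directory should have exactly one path to the root.
+ */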
+
+int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl);
+int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip,
+ struct xchk_dirpath *path, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr);
+void xchk_dirtree_evaluate(struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc);
+
+#endif /* __XFS_SCRUB_DIRTREE_H__ */
diff --git a/fs/xfs/scrub/dirtree_repair.c b/fs/xfs/scrub/dirtree_repair.c
new file mode 100644
index 000000000000..5c04e70ba951
--- /dev/null
+++ b/fs/xfs/scrub/dirtree_repair.c
@@ -0,0 +1,821 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+#include "scrub/readdir.h"
+
+/*
+ * Directory Tree Structure Repairs
+ * ================================
+ *
+ * If we decide that the directory being scanned is participating in a
+ * directory loop, the only change we can make is to remove directory entries
+ * pointing down to @sc->ip. If that leaves it with no parents, the directory
+ * should be adopted by the orphanage.
+ */
+
+/* Set up to repair directory loops. */
+int
+xrep_setup_dirtree(
+ struct xfs_scrub *sc)
+{
+ return xrep_orphanage_try_create(sc);
+}
+
+/* Change the outcome of this path. */
+static inline void
+xrep_dirpath_set_outcome(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ enum xchk_dirpath_outcome outcome)
+{
+ trace_xrep_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+ outcome);
+
+ path->outcome = outcome;
+}
+
+/* Delete all paths. */
+STATIC void
+xrep_dirtree_delete_all_paths(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ oc->good--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 0);
+ ASSERT(oc->good == 0);
+}
+
+/* Since this is the surviving path, set the dotdot entry to this value. */
+STATIC void
+xrep_dirpath_retain_parent(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, &step);
+ if (error)
+ return;
+
+ dl->parent_ino = be64_to_cpu(step.pptr_rec.p_ino);
+}
+
+/* Find the one surviving path so we know how to set dotdot. */
+STATIC void
+xrep_dirtree_find_surviving_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ case XCHK_DIRPATH_OK:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ ASSERT(foundit == false);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect + oc->good == 1);
+}
+
+/* Delete all paths except for the one good one. */
+STATIC void
+xrep_dirtree_keep_one_good_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ oc->good--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 0);
+ ASSERT(oc->good < 2);
+}
+
+/* Delete all paths except for one suspect one. */
+STATIC void
+xrep_dirtree_keep_one_suspect_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ ASSERT(0);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 1);
+ ASSERT(oc->good == 0);
+}
+
+/* Figure out what to do with the paths we tried to find. */
+STATIC void
+xrep_dirtree_decide_fate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ xchk_dirtree_evaluate(dl, oc);
+
+ /* Parentless directories should not have any paths at all. */
+ if (xchk_dirtree_parentless(dl)) {
+ xrep_dirtree_delete_all_paths(dl, oc);
+ return;
+ }
+
+ /* One path is exactly the number of paths we want. */
+ if (oc->good + oc->suspect == 1) {
+ xrep_dirtree_find_surviving_path(dl, oc);
+ return;
+ }
+
+ /* Zero paths means we should reattach the subdir to the orphanage. */
+ if (oc->good + oc->suspect == 0) {
+ if (dl->sc->orphanage)
+ oc->needs_adoption = true;
+ return;
+ }
+
+ /*
+ * Otherwise, this subdirectory has too many parents. If there's at
+ * least one good path, keep it and delete the others.
+ */
+ if (oc->good > 0) {
+ xrep_dirtree_keep_one_good_path(dl, oc);
+ return;
+ }
+
+ /*
+ * There are no good paths and there are too many suspect paths.
+ * Keep the first suspect path and delete the rest.
+ */
+ xrep_dirtree_keep_one_suspect_path(dl, oc);
+}
+
+/*
+ * Load the first step of this path into @step and @dl->xname/pptr
+ * for later repair work.
+ */
+STATIC int
+xrep_dirtree_prep_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ struct xchk_dirpath_step *step)
+{
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, step);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(dl->path_names, step->name_cookie, &dl->xname,
+ step->name_len);
+ if (error)
+ return error;
+
+ dl->pptr_rec = step->pptr_rec; /* struct copy */
+ return 0;
+}
+
+/* Delete the VFS dentry for a removed child. */
+STATIC int
+xrep_dirtree_purge_dentry(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *dp,
+ const struct xfs_name *name)
+{
+ struct qstr qname = QSTR_INIT(name->name, name->len);
+ struct dentry *parent_dentry, *child_dentry;
+ int error = 0;
+
+ /*
+ * Find the dentry for the parent directory. If there isn't one, we're
+ * done. Caller already holds i_rwsem for parent and child.
+ */
+ parent_dentry = d_find_alias(VFS_I(dp));
+ if (!parent_dentry)
+ return 0;
+
+ /* The VFS thinks the parent is a directory, right? */
+ if (!d_is_dir(parent_dentry)) {
+ ASSERT(d_is_dir(parent_dentry));
+ error = -EFSCORRUPTED;
+ goto out_dput_parent;
+ }
+
+ /*
+ * Try to find the dirent pointing to the child. If there isn't one,
+ * we're done.
+ */
+ qname.hash = full_name_hash(parent_dentry, name->name, name->len);
+ child_dentry = d_lookup(parent_dentry, &qname);
+ if (!child_dentry) {
+ error = 0;
+ goto out_dput_parent;
+ }
+
+ trace_xrep_dirtree_delete_child(dp->i_mount, child_dentry);
+
+ /* Child is not a directory? We're screwed. */
+ if (!d_is_dir(child_dentry)) {
+ ASSERT(d_is_dir(child_dentry));
+ error = -EFSCORRUPTED;
+ goto out_dput_child;
+ }
+
+ /* Replace the child dentry with a negative one. */
+ d_delete(child_dentry);
+
+out_dput_child:
+ dput(child_dentry);
+out_dput_parent:
+ dput(parent_dentry);
+ return error;
+}
+
+/*
+ * Prepare to delete a link by taking the IOLOCK of the parent and the child
+ * (scrub target). Caller must hold IOLOCK_EXCL on @sc->ip. Returns 0 if we
+ * took both locks, or a negative errno if we couldn't lock the parent in time.
+ */
+static inline int
+xrep_dirtree_unlink_iolock(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+
+ if (xfs_ilock_nowait(dp, XFS_IOLOCK_EXCL))
+ return 0;
+
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
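+
+	/*
+	 * We couldn't get the parent lock while holding the child's IOLOCK,
+	 * so cycle the locks: take the parent IOLOCK first, then trylock the
+	 * child, backing off and retrying until we hold both.
+	 */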
+ do {
+ xfs_ilock(dp, XFS_IOLOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error)) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ delay(1);
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * Remove a link from the directory tree and update the dcache. Returns
+ * -ESTALE if the scan data are now out of date.
+ */
+STATIC int
+xrep_dirtree_unlink(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *dp,
+ struct xchk_dirpath *path,
+ struct xchk_dirpath_step *step)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t dotdot_ino;
+ xfs_ino_t parent_ino = dl->parent_ino;
+ unsigned int resblks;
+ int dontcare;
+ int error;
+
+ /* Take IOLOCK_EXCL of the parent and child. */
+ error = xrep_dirtree_unlink_iolock(sc, dp);
+ if (error)
+ return error;
+
+ /*
+ * Create the transaction that we need to sever the path. Ignore
+ * EDQUOT and ENOSPC being returned via nospace_error because the
+ * directory code can handle a reservationless update.
+ */
+ resblks = xfs_remove_space_res(mp, step->name_len);
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, sc->ip,
+ &resblks, &sc->tp, &dontcare);
+ if (error)
+ goto out_iolock;
+
+ /*
+	 * Cancel if someone invalidated the paths while we were trying to get
+ * the ILOCK.
+ */
+ mutex_lock(&dl->lock);
+ if (dl->stale) {
+ mutex_unlock(&dl->lock);
+ error = -ESTALE;
+ goto out_trans_cancel;
+ }
+ xrep_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETING);
+ mutex_unlock(&dl->lock);
+
+ trace_xrep_dirtree_delete_path(dl->sc, sc->ip, path->path_nr,
+ &dl->xname, &dl->pptr_rec);
+
+ /*
+ * Decide if we need to reset the dotdot entry. Rules:
+ *
+ * - If there's a surviving parent, we want dotdot to point there.
+ * - If we don't have any surviving parents, then point dotdot at the
+ * root dir.
+ * - If dotdot is already set to the value we want, pass in NULLFSINO
+ * for no change necessary.
+ *
+ * Do this /before/ we dirty anything, in case the dotdot lookup
+ * fails.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &dotdot_ino);
+ if (error)
+ goto out_trans_cancel;
+ if (parent_ino == NULLFSINO)
+ parent_ino = dl->root_ino;
+ if (dotdot_ino == parent_ino)
+ parent_ino = NULLFSINO;
+
+ /* Drop the link from sc->ip's dotdot entry. */
+ error = xfs_droplink(sc->tp, dp);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Reset the dotdot entry to a surviving parent. */
+ if (parent_ino != NULLFSINO) {
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ parent_ino, 0);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /* Drop the link from dp to sc->ip. */
+ error = xfs_droplink(sc->tp, sc->ip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_removename(sc->tp, dp, &dl->xname, sc->ip->i_ino,
+ resblks);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ goto out_trans_cancel;
+ }
+
+ if (xfs_has_parent(sc->mp)) {
+ error = xfs_parent_removename(sc->tp, &dl->ppargs, dp,
+ &dl->xname, sc->ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Notify dirent hooks that we removed the bad link, invalidate the
+ * dcache, and commit the repair.
+ */
+ xfs_dir_update_hook(dp, sc->ip, -1, &dl->xname);
+ error = xrep_dirtree_purge_dentry(dl, dp, &dl->xname);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_trans_cancel:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+out_iolock:
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Delete a directory entry that points to this directory. Returns -ESTALE
+ * if the scan data are now out of date.
+ */
+STATIC int
+xrep_dirtree_delete_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /*
+ * Load the parent pointer and directory inode for this path, then
+ * drop the scan lock, the ILOCK, and the transaction so that
+ * _delete_path can reserve the proper transaction. This sets up
+ * @dl->xname for the deletion.
+ */
+ error = xrep_dirtree_prep_path(dl, path, &step);
+ if (error)
+ return error;
+
+ error = xchk_iget(sc, be64_to_cpu(step.pptr_rec.p_ino), &dp);
+ if (error)
+ return error;
+
+ mutex_unlock(&dl->lock);
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Delete the directory link and release the parent. */
+ error = xrep_dirtree_unlink(dl, dp, path, &step);
+ xchk_irele(sc, dp);
+
+ /*
+ * Retake all the resources we had at the beginning even if the repair
+ * failed or the scan data are now stale. This keeps things simple for
+ * the caller.
+ */
+ xchk_trans_alloc_empty(sc);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+
+ if (!error && dl->stale)
+ error = -ESTALE;
+ return error;
+}
+
+/* Add a new path to represent our in-progress adoption. */
+STATIC int
+xrep_dirtree_create_adoption_path(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xchk_dirpath *path;
+ int error;
+
+	/*
+	 * The scanner should have capped the number of paths at
+	 * XFS_MAXLINK - 1, so the adoption path we add here cannot push the
+	 * count past XFS_MAXLINK.
+	 */
+ if (dl->nr_paths > XFS_MAXLINK) {
+ ASSERT(dl->nr_paths <= XFS_MAXLINK);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Create a new xchk_path structure to remember this parent pointer
+ * and record the first name step.
+ */
+ path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+ if (!path)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&path->list);
+ xino_bitmap_init(&path->seen_inodes);
+ path->nr_steps = 0;
+ path->outcome = XREP_DIRPATH_ADOPTING;
+
+ /*
+ * Record the new link that we just created in the orphanage. Because
+ * adoption is the last repair that we perform, we don't bother filling
+ * in the path all the way back to the root.
+ */
+ xfs_inode_to_parent_rec(&dl->pptr_rec, sc->orphanage);
+
+ error = xino_bitmap_set(&path->seen_inodes, sc->orphanage->i_ino);
+ if (error)
+ goto out_path;
+
+ trace_xrep_dirtree_create_adoption(sc, sc->ip, dl->nr_paths,
+ &dl->xname, &dl->pptr_rec);
+
+ error = xchk_dirpath_append(dl, sc->ip, path, &dl->xname,
+ &dl->pptr_rec);
+ if (error)
+ goto out_path;
+
+ path->first_step = xfarray_length(dl->path_steps) - 1;
+ path->second_step = XFARRAY_NULLIDX;
+ path->path_nr = dl->nr_paths;
+
+ list_add_tail(&path->list, &dl->path_list);
+ dl->nr_paths++;
+ return 0;
+
+out_path:
+ kfree(path);
+ return error;
+}
+
+/*
+ * Prepare to move a file to the orphanage by taking the IOLOCK of the
+ * orphanage and the child (scrub target). Caller must hold IOLOCK_EXCL on
+ * @sc->ip. Returns 0 if we took both locks, or a negative errno if we
+ * couldn't lock the orphanage in time.
+ */
+static inline int
+xrep_dirtree_adopt_iolock(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+
+ if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ return 0;
+
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ do {
+ xrep_orphanage_ilock(sc, XFS_IOLOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error)) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ delay(1);
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * Reattach this orphaned directory to the orphanage. Do not call this with
+ * any resources held. Returns -ESTALE if the scan data have become out of
+ * date.
+ */
+STATIC int
+xrep_dirtree_adopt(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /* Take the IOLOCK of the orphanage and the scrub target. */
+ error = xrep_dirtree_adopt_iolock(sc);
+ if (error)
+ return error;
+
+ /*
+ * Set up for an adoption. The directory tree fixer runs after the
+ * link counts have been corrected. Therefore, we must bump the
+ * child's link count since there will be no further opportunity to fix
+ * errors.
+ */
+ error = xrep_adoption_trans_alloc(sc, &dl->adoption);
+ if (error)
+ goto out_iolock;
+ dl->adoption.bump_child_nlink = true;
+
+ /* Figure out what name we're going to use here. */
+ error = xrep_adoption_compute_name(&dl->adoption, &dl->xname);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Now that we have a proposed name for the orphanage entry, create
+ * a faux path so that the live update hook will see it.
+ */
+ mutex_lock(&dl->lock);
+ if (dl->stale) {
+ mutex_unlock(&dl->lock);
+ error = -ESTALE;
+ goto out_trans;
+ }
+ error = xrep_dirtree_create_adoption_path(dl);
+ mutex_unlock(&dl->lock);
+ if (error)
+ goto out_trans;
+
+ /* Reparent the directory. */
+ error = xrep_adoption_move(&dl->adoption);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Commit the name and release all inode locks except for the scrub
+ * target's IOLOCK.
+ */
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_trans:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+out_iolock:
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * This newly orphaned directory needs to be adopted by the orphanage.
+ * Make this happen.
+ */
+STATIC int
+xrep_dirtree_move_to_orphanage(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /*
+ * Start by dropping all the resources that we hold so that we can grab
+ * all the resources that we need for the adoption.
+ */
+ mutex_unlock(&dl->lock);
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Perform the adoption. */
+ error = xrep_dirtree_adopt(dl);
+
+ /*
+ * Retake all the resources we had at the beginning even if the repair
+ * failed or the scan data are now stale. This keeps things simple for
+ * the caller.
+ */
+ xchk_trans_alloc_empty(sc);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+
+ if (!error && dl->stale)
+ error = -ESTALE;
+ return error;
+}
+
+/*
+ * Try to fix all the problems. Returns -ESTALE if the scan data have become
+ * out of date.
+ */
+STATIC int
+xrep_dirtree_fix_problems(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ int error;
+
+ /* Delete all the paths we don't want. */
+ xchk_dirtree_for_each_path(dl, path) {
+ if (path->outcome != XCHK_DIRPATH_DELETE)
+ continue;
+
+ error = xrep_dirtree_delete_path(dl, path);
+ if (error)
+ return error;
+ }
+
+ /* Reparent this directory to the orphanage. */
+ if (oc->needs_adoption) {
+ if (xrep_orphanage_can_adopt(dl->sc))
+ return xrep_dirtree_move_to_orphanage(dl);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Fix directory loops involving this directory. */
+int
+xrep_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree *dl = sc->buf;
+ struct xchk_dirtree_outcomes oc;
+ int error;
+
+ /*
+ * Prepare to fix the directory tree by retaking the scan lock. The
+ * order of resource acquisition is still IOLOCK -> transaction ->
+ * ILOCK -> scan lock.
+ */
+ mutex_lock(&dl->lock);
+ do {
+ /*
+ * Decide what we're going to do, then do it. An -ESTALE
+ * return here means the scan results are invalid and we have
+ * to walk again.
+ */
+ if (!dl->stale) {
+ xrep_dirtree_decide_fate(dl, &oc);
+
+ trace_xrep_dirtree_decided_fate(dl, &oc);
+
+ error = xrep_dirtree_fix_problems(dl, &oc);
+ if (!error || error != -ESTALE)
+ break;
+ }
+ error = xchk_dirtree_find_paths_to_root(dl);
+ if (error == -ELNRNG || error == -ENOSR)
+ error = -EFSCORRUPTED;
+ } while (!error);
+ mutex_unlock(&dl->lock);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c
new file mode 100644
index 000000000000..01766041ba2c
--- /dev/null
+++ b/fs/xfs/scrub/findparent.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Finding the Parent of a Directory
+ * =================================
+ *
+ * Directories have parent pointers, in the sense that each directory contains
+ * a dotdot entry that points to the single allowed parent. The brute force
+ * way to find the parent of a given directory is to scan every directory in
+ * the filesystem looking for a child dirent that references this directory.
+ *
+ * This module wraps the process of scanning the directory tree. It requires
+ * that @sc->ip is the directory whose parent we want to find, and that the
+ * caller hold only the IOLOCK on that directory. The scan itself needs to
+ * take the ILOCK of each directory visited.
+ *
+ * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is
+ * necessary to use a dirent hook to update the parent scan results. Callers
+ * must not read the scan results without re-taking @sc->ip's ILOCK.
+ *
+ * There are a few shortcuts that we can take to avoid scanning the entire
+ * filesystem, such as noticing directory tree roots and querying the dentry
+ * cache for parent information.
+ */
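+
+/*
+ * Sketch of a typical caller, for illustration only; error handling is
+ * elided and the caller is assumed to hold the IOLOCK on @sc->ip:
+ *
+ *	struct xrep_parent_scan_info pscan;
+ *
+ *	error = xrep_findparent_scan_start(sc, &pscan);
+ *	error = xrep_findparent_scan(&pscan);
+ *	xchk_ilock(sc, XFS_ILOCK_EXCL);
+ *	parent_ino = pscan.parent_ino;
+ *	xrep_findparent_scan_teardown(&pscan);
+ *
+ * where parent_ino is NULLFSINO if no parent was found.
+ */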
+
+struct xrep_findparent_info {
+ /* The directory currently being scanned. */
+ struct xfs_inode *dp;
+
+ /*
+ * Scrub context. We're looking for a @dp containing a directory
+ * entry pointing to sc->ip->i_ino.
+ */
+ struct xfs_scrub *sc;
+
+ /* Optional scan information for a xrep_findparent_scan call. */
+ struct xrep_parent_scan_info *parent_scan;
+
+ /*
+ * Parent that we've found for sc->ip. If we're scanning the entire
+ * directory tree, we need this to ensure that we only find /one/
+ * parent directory.
+ */
+ xfs_ino_t found_parent;
+
+ /*
+ * This is set to true if @found_parent was not observed directly from
+ * the directory scan but by noticing a change in dotdot entries after
+ * cycling the sc->ip IOLOCK.
+ */
+ bool parent_tentative;
+};
+
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_findparent_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_findparent_info *fpi = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(fpi->sc, &error))
+ return error;
+
+ if (ino != fpi->sc->ip->i_ino)
+ return 0;
+
+ /* Ignore garbage directory entry names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /*
+ * Ignore dotdot and dot entries -- we're looking for parent -> child
+ * links only.
+ */
+ if (name->name[0] == '.' && (name->len == 1 ||
+ (name->len == 2 && name->name[1] == '.')))
+ return 0;
+
+ /* Uhoh, more than one parent for a dir? */
+ if (fpi->found_parent != NULLFSINO &&
+ !(fpi->parent_tentative && fpi->found_parent == fpi->dp->i_ino)) {
+ trace_xrep_findparent_dirent(fpi->sc->ip, 0);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember this. */
+ trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino);
+ fpi->found_parent = fpi->dp->i_ino;
+ fpi->parent_tentative = false;
+
+ if (fpi->parent_scan)
+ xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino);
+
+ return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_findparent_walk_directory(
+ struct xrep_findparent_info *fpi)
+{
+ struct xfs_scrub *sc = fpi->sc;
+ struct xfs_inode *dp = fpi->dp;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * The inode being scanned cannot be its own parent, nor can any
+ * temporary directory we created to stage this repair.
+ */
+ if (dp == sc->ip || dp == sc->tempip)
+ return 0;
+
+ /*
+ * Similarly, temporary files created to stage a repair cannot be the
+ * parent of this inode.
+ */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
+ /*
+	 * Scan the directory to see if it contains an entry pointing to the
+	 * directory that we are repairing.
+ */
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi);
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/* Keep the parent scan results up to date with ongoing dirent updates. */
+STATIC int
+xrep_findparent_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_parent_scan_info *pscan;
+ struct xfs_scrub *sc;
+
+ pscan = container_of(nb, struct xrep_parent_scan_info,
+ dhook.dirent_hook.nb);
+ sc = pscan->sc;
+
+ /*
+ * If @p->ip is the subdirectory that we're interested in and we've
+	 * already scanned @p->dp, update the scan's notion of the parent
+	 * inumber to reflect this dirent update.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino)) {
+ if (p->delta > 0) {
+ xrep_findparent_scan_found(pscan, p->dp->i_ino);
+ } else {
+ xrep_findparent_scan_found(pscan, NULLFSINO);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Set up a scan to find the parent of a directory. The provided dirent hook
+ * will be called when there is a dotdot update for the inode being repaired.
+ */
+int
+__xrep_findparent_scan_start(
+ struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan,
+ notifier_fn_t custom_fn)
+{
+ int error;
+
+ if (!(sc->flags & XCHK_FSGATES_DIRENTS)) {
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ return -EINVAL;
+ }
+
+ pscan->sc = sc;
+ pscan->parent_ino = NULLFSINO;
+
+ mutex_init(&pscan->lock);
+
+ xchk_iscan_start(sc, 30000, 100, &pscan->iscan);
+
+ /*
+ * Hook into the dirent update code. The hook only operates on inodes
+ * that were already scanned, and the scanner thread takes each inode's
+ * ILOCK, which means that any in-progress inode updates will finish
+ * before we can scan the inode.
+ */
+ if (custom_fn)
+ xfs_dir_hook_setup(&pscan->dhook, custom_fn);
+ else
+ xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update);
+ error = xfs_dir_hook_add(sc->mp, &pscan->dhook);
+ if (error)
+ goto out_iscan;
+
+ return 0;
+out_iscan:
+ xchk_iscan_teardown(&pscan->iscan);
+ mutex_destroy(&pscan->lock);
+ return error;
+}
+
+/*
+ * Scan the entire filesystem looking for a parent inode for the inode being
+ * scrubbed. @sc->ip must not be the root of a directory tree. Callers must
+ * not hold a dirty transaction or any lock that would interfere with taking
+ * an ILOCK.
+ *
+ * Returns 0 with @pscan->parent_ino set to the parent that we found.
+ * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_scan(
+ struct xrep_parent_scan_info *pscan)
+{
+ struct xrep_findparent_info fpi = {
+ .sc = pscan->sc,
+ .found_parent = NULLFSINO,
+ .parent_scan = pscan,
+ };
+ struct xfs_scrub *sc = pscan->sc;
+ int ret;
+
+ ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode));
+
+ while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) {
+ if (S_ISDIR(VFS_I(fpi.dp)->i_mode))
+ ret = xrep_findparent_walk_directory(&fpi);
+ else
+ ret = 0;
+ xchk_iscan_mark_visited(&pscan->iscan, fpi.dp);
+ xchk_irele(sc, fpi.dp);
+ if (ret)
+ break;
+
+ if (xchk_should_terminate(sc, &ret))
+ break;
+ }
+ xchk_iscan_iter_finish(&pscan->iscan);
+
+ return ret;
+}
+
+/* Tear down a parent scan. */
+void
+xrep_findparent_scan_teardown(
+ struct xrep_parent_scan_info *pscan)
+{
+ xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook);
+ xchk_iscan_teardown(&pscan->iscan);
+ mutex_destroy(&pscan->lock);
+}
+
+/* Finish a parent scan early. */
+void
+xrep_findparent_scan_finish_early(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ xrep_findparent_scan_found(pscan, ino);
+ xchk_iscan_finish_early(&pscan->iscan);
+}
+
+/*
+ * Confirm that the directory @parent_ino actually contains a directory entry
+ * pointing to the child @sc->ip->i_ino. This function returns in one of
+ * several ways:
+ *
+ * Returns 0 with @parent_ino unchanged if the parent was confirmed.
+ * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_confirm(
+ struct xfs_scrub *sc,
+ xfs_ino_t *parent_ino)
+{
+ struct xrep_findparent_info fpi = {
+ .sc = sc,
+ .found_parent = NULLFSINO,
+ };
+ int error;
+
+ /*
+ * The root directory always points to itself. Unlinked dirs can point
+ * anywhere, so we point them at the root dir too.
+ */
+ if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) {
+ *parent_ino = sc->mp->m_sb.sb_rootino;
+ return 0;
+ }
+
+ /* Reject garbage parent inode numbers and self-referential parents. */
+ if (*parent_ino == NULLFSINO)
+ return 0;
+ if (!xfs_verify_dir_ino(sc->mp, *parent_ino) ||
+ *parent_ino == sc->ip->i_ino) {
+ *parent_ino = NULLFSINO;
+ return 0;
+ }
+
+ error = xchk_iget(sc, *parent_ino, &fpi.dp);
+ if (error)
+ return error;
+
+ if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) {
+ *parent_ino = NULLFSINO;
+ goto out_rele;
+ }
+
+ error = xrep_findparent_walk_directory(&fpi);
+ if (error)
+ goto out_rele;
+
+ *parent_ino = fpi.found_parent;
+out_rele:
+ xchk_irele(sc, fpi.dp);
+ return error;
+}
+
+/*
+ * If we're the root of a directory tree, we are our own parent. If we're an
+ * unlinked directory, the parent /won't/ have a link to us. Set the parent
+ * directory to the root for both cases. Returns NULLFSINO if we don't know
+ * what to do.
+ */
+xfs_ino_t
+xrep_findparent_self_reference(
+ struct xfs_scrub *sc)
+{
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino)
+ return sc->mp->m_sb.sb_rootino;
+
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return sc->mp->m_sb.sb_rootino;
+
+ return NULLFSINO;
+}
+
+/* Check the dentry cache to see if it knows of a parent for the scrub target. */
+xfs_ino_t
+xrep_findparent_from_dcache(
+ struct xfs_scrub *sc)
+{
+ struct inode *pip = NULL;
+ struct dentry *dentry, *parent;
+ xfs_ino_t ret = NULLFSINO;
+
+ dentry = d_find_alias(VFS_I(sc->ip));
+ if (!dentry)
+ goto out;
+
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
+
+ ASSERT(parent->d_sb == sc->ip->i_mount->m_super);
+
+	pip = igrab(d_inode(parent));
+	dput(parent);
+	if (!pip)
+		goto out_dput;	/* parent inode is being evicted */
+
+ if (S_ISDIR(pip->i_mode)) {
+ trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino);
+ ret = XFS_I(pip)->i_ino;
+ }
+
+ xchk_irele(sc, XFS_I(pip));
+
+out_dput:
+ dput(dentry);
+out:
+ return ret;
+}
diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h
new file mode 100644
index 000000000000..d998c7a88152
--- /dev/null
+++ b/fs/xfs/scrub/findparent.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FINDPARENT_H__
+#define __XFS_SCRUB_FINDPARENT_H__
+
+struct xrep_parent_scan_info {
+ struct xfs_scrub *sc;
+
+ /* Inode scan cursor. */
+ struct xchk_iscan iscan;
+
+ /* Hook to capture directory entry updates. */
+ struct xfs_dir_hook dhook;
+
+ /* Lock protecting parent_ino. */
+ struct mutex lock;
+
+ /* Parent inode that we've found. */
+ xfs_ino_t parent_ino;
+
+ bool lookup_parent;
+};
+
+int __xrep_findparent_scan_start(struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan,
+ notifier_fn_t custom_fn);
+static inline int xrep_findparent_scan_start(struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan)
+{
+ return __xrep_findparent_scan_start(sc, pscan, NULL);
+}
+int xrep_findparent_scan(struct xrep_parent_scan_info *pscan);
+void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan);
+
+static inline void
+xrep_findparent_scan_found(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ mutex_lock(&pscan->lock);
+ pscan->parent_ino = ino;
+ mutex_unlock(&pscan->lock);
+}
+
+void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino);
+
+int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino);
+
+xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc);
+xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_FINDPARENT_H__ */
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index d310737c8823..1d3e98346933 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -85,7 +85,7 @@ xchk_fscount_warmup(
continue;
/* Lock both AG headers. */
- error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp);
if (error)
break;
error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
@@ -412,10 +412,11 @@ xchk_fscount_count_frextents(
int error;
fsc->frextents = 0;
+ fsc->frextents_delayed = 0;
if (!xfs_has_realtime(mp))
return 0;
- xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
error = xfs_rtalloc_query_all(sc->mp, sc->tp,
xchk_fscount_add_frextent, fsc);
if (error) {
@@ -423,8 +424,10 @@ xchk_fscount_count_frextents(
goto out_unlock;
}
+ fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);
+
out_unlock:
- xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
return error;
}
#else
@@ -434,6 +437,7 @@ xchk_fscount_count_frextents(
struct xchk_fscounters *fsc)
{
fsc->frextents = 0;
+ fsc->frextents_delayed = 0;
return 0;
}
#endif /* CONFIG_XFS_RT */
@@ -517,7 +521,7 @@ xchk_fscounters(
/*
* If the filesystem is not frozen, the counter summation calls above
- * can race with xfs_mod_freecounter, which subtracts a requested space
+ * can race with xfs_dec_freecounter, which subtracts a requested space
* reservation from the counter and undoes the subtraction if that made
* the counter go negative. Therefore, it's possible to see negative
* values here, and we should only flag that as a corruption if we
@@ -593,7 +597,7 @@ xchk_fscounters(
}
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
- fsc->frextents)) {
+ fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h
index 461a13d25f4b..bcf56e1c36f9 100644
--- a/fs/xfs/scrub/fscounters.h
+++ b/fs/xfs/scrub/fscounters.h
@@ -12,6 +12,7 @@ struct xchk_fscounters {
uint64_t ifree;
uint64_t fdblocks;
uint64_t frextents;
+ uint64_t frextents_delayed;
unsigned long long icount_min;
unsigned long long icount_max;
bool frozen;
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index 94cdb852bee4..469bf645dbea 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -65,7 +65,17 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
- percpu_counter_set(&mp->m_frextents, fsc->frextents);
+
+ /*
+ * Online repair is only supported on v5 file systems, which require
+	 * lazy sb counters and thus no update of sb_fdblocks here. But we
+	 * don't yet support lazy counting of sb_frextents, so it must also be
+	 * updated directly here. For that we need to track the delalloc
+	 * reservations separately, as they are subtracted from m_frextents
+	 * but not included in sb_frextents.
+ */
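+	/*
+	 * Worked example with made-up numbers: if the scan counted 100 free
+	 * rt extents and delalloc reservations currently hold 10 of them,
+	 * m_frextents is set to 90 while sb_frextents records 100.
+	 */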
+ percpu_counter_set(&mp->m_frextents,
+ fsc->frextents - fsc->frextents_delayed);
mp->m_sb.sb_frextents = fsc->frextents;
return 0;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 9020a6bef7f1..b712a8bd34f5 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -108,6 +108,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
[XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK },
[XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS },
+ [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE },
};
/* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/ino_bitmap.h b/fs/xfs/scrub/ino_bitmap.h
new file mode 100644
index 000000000000..1300833679ab
--- /dev/null
+++ b/fs/xfs/scrub/ino_bitmap.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_INO_BITMAP_H__
+#define __XFS_SCRUB_INO_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_ino_t */
+
+struct xino_bitmap {
+ struct xbitmap64 inobitmap;
+};
+
+static inline void xino_bitmap_init(struct xino_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->inobitmap);
+}
+
+static inline void xino_bitmap_destroy(struct xino_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->inobitmap);
+}
+
+static inline int xino_bitmap_set(struct xino_bitmap *bitmap, xfs_ino_t ino)
+{
+ return xbitmap64_set(&bitmap->inobitmap, ino, 1);
+}
+
+static inline int xino_bitmap_test(struct xino_bitmap *bitmap, xfs_ino_t ino)
+{
+ uint64_t len = 1;
+
+ return xbitmap64_test(&bitmap->inobitmap, ino, &len);
+}
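+
+/*
+ * Example use, with a hypothetical caller: remember every inode we visit
+ * and treat a revisit as evidence of a loop (error handling elided):
+ *
+ *	struct xino_bitmap seen;
+ *
+ *	xino_bitmap_init(&seen);
+ *	if (xino_bitmap_test(&seen, ip->i_ino))
+ *		return -EFSCORRUPTED;
+ *	error = xino_bitmap_set(&seen, ip->i_ino);
+ *	...
+ *	xino_bitmap_destroy(&seen);
+ */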
+
+#endif /* __XFS_SCRUB_INO_BITMAP_H__ */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6e2fe2d6250b..d32716fb2fec 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -739,6 +739,23 @@ xchk_inode_check_reflink_iflag(
xchk_ino_set_corrupt(sc, ino);
}
+/*
+ * If this inode has zero link count, it must be on the unlinked list. If
+ * it has nonzero link count, it must not be on the unlinked list.
+ */
+STATIC void
+xchk_inode_check_unlinked(
+ struct xfs_scrub *sc)
+{
+ if (VFS_I(sc->ip)->i_nlink == 0) {
+ if (!xfs_inode_on_unlinked_list(sc->ip))
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (xfs_inode_on_unlinked_list(sc->ip))
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+}
+
/* Scrub an inode. */
int
xchk_inode(
@@ -771,6 +788,8 @@ xchk_inode(
if (S_ISREG(VFS_I(sc->ip)->i_mode))
xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino);
+ xchk_inode_check_unlinked(sc);
+
xchk_inode_xref(sc, sc->ip->i_ino, &di);
out:
return error;
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index eab380e95ef4..daf9f1ee7c2c 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -46,6 +46,7 @@
#include "scrub/repair.h"
#include "scrub/iscan.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
/*
* Inode Record Repair
@@ -282,6 +283,51 @@ xrep_dinode_findmode_dirent(
return 0;
}
+/* Try to lock a directory, or wait a jiffy. */
+static inline bool
+xrep_dinode_ilock_nowait(
+ struct xfs_inode *dp,
+ unsigned int lock_mode)
+{
+ if (xfs_ilock_nowait(dp, lock_mode))
+ return true;
+
+ schedule_timeout_killable(1);
+ return false;
+}
+
+/*
+ * Try to lock a directory to look for ftype hints. Since we already hold the
+ * AGI buffer, we cannot block waiting for the ILOCK because rename can take
+ * the ILOCK and then try to lock AGIs.
+ */
+STATIC int
+xrep_dinode_trylock_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp,
+ unsigned int *lock_modep)
+{
+ unsigned long deadline = jiffies + msecs_to_jiffies(30000);
+ unsigned int lock_mode;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
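+		/*
+		 * If the extent list hasn't been read into memory yet, we
+		 * need the ILOCK in exclusive mode to load it; otherwise a
+		 * shared lock is enough to walk the dirents.
+		 */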
+ if (xfs_need_iread_extents(&dp->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+ else
+ lock_mode = XFS_ILOCK_SHARED;
+
+ if (xrep_dinode_ilock_nowait(dp, lock_mode)) {
+ *lock_modep = lock_mode;
+ return 0;
+ }
+ } while (!time_is_before_jiffies(deadline));
+ return -EBUSY;
+}
+
/*
* If this is a directory, walk the dirents looking for any that point to the
* scrub target inode.
@@ -295,11 +341,17 @@ xrep_dinode_findmode_walk_directory(
unsigned int lock_mode;
int error = 0;
+ /* Ignore temporary repair directories. */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
/*
* Scan the directory to see if there it contains an entry pointing to
* the directory that we are repairing.
*/
- lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xrep_dinode_trylock_directory(ri, dp, &lock_mode);
+ if (error)
+ return error;
/*
* If this directory is known to be sick, we cannot scan it reliably
@@ -356,6 +408,7 @@ xrep_dinode_find_mode(
* so there's a real possibility that _iscan_iter can return EBUSY.
*/
xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+ xchk_iscan_set_agi_trylock(&ri->ftype_iscan);
ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
@@ -463,6 +516,17 @@ xrep_dinode_mode(
return 0;
}
+/* Fix unused link count fields having nonzero values. */
+STATIC void
+xrep_dinode_nlinks(
+ struct xfs_dinode *dip)
+{
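+	/* v1 inodes count links in di_onlink; v2+ inodes use di_nlink. */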
+ if (dip->di_version > 1)
+ dip->di_onlink = 0;
+ else
+ dip->di_nlink = 0;
+}
+
/* Fix any conflicting flags that the verifiers complain about. */
STATIC void
xrep_dinode_flags(
@@ -1324,6 +1388,7 @@ xrep_dinode_core(
iget_error = xrep_dinode_mode(ri, dip);
if (iget_error)
goto write;
+ xrep_dinode_nlinks(dip);
xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip);
@@ -1671,6 +1736,44 @@ xrep_inode_extsize(
}
}
+/* Ensure this file has an attr fork if it needs to hold a parent pointer. */
+STATIC int
+xrep_inode_pptr(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct inode *inode = VFS_I(ip);
+
+ if (!xfs_has_parent(mp))
+ return 0;
+
+ /*
+ * Unlinked inodes that cannot be added to the directory tree will not
+ * have a parent pointer.
+ */
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ return 0;
+
+ /* The root directory doesn't have a parent pointer. */
+ if (ip == mp->m_rootip)
+ return 0;
+
+ /*
+ * Metadata inodes are rooted in the superblock and do not have any
+ * parents.
+ */
+ if (xfs_is_metadata_inode(ip))
+ return 0;
+
+ /* Inode already has an attr fork; no further work possible here. */
+ if (xfs_inode_has_attr_fork(ip))
+ return 0;
+
+ return xfs_bmap_add_attrfork(sc->tp, ip,
+ sizeof(struct xfs_attr_sf_hdr), true);
+}
+
/* Fix any irregularities in an inode that the verifiers don't catch. */
STATIC int
xrep_inode_problems(
@@ -1681,6 +1784,9 @@ xrep_inode_problems(
error = xrep_inode_blockcounts(sc);
if (error)
return error;
+ error = xrep_inode_pptr(sc);
+ if (error)
+ return error;
xrep_inode_timestamps(sc->ip);
xrep_inode_flags(sc);
xrep_inode_ids(sc);
@@ -1697,6 +1803,46 @@ xrep_inode_problems(
return xrep_roll_trans(sc);
}
+/*
+ * Make sure this inode's unlinked list pointers are consistent with its
+ * link count.
+ */
+STATIC int
+xrep_inode_unlinked(
+ struct xfs_scrub *sc)
+{
+ unsigned int nlink = VFS_I(sc->ip)->i_nlink;
+ int error;
+
+ /*
+ * If this inode is linked from the directory tree and on the unlinked
+ * list, remove it from the unlinked list.
+ */
+ if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(sc->mp,
+ XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
+ error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+
+ /*
+	 * If this inode is not linked from the directory tree but is not on
+	 * the unlinked list either, put it on the unlinked list.
+ */
+ if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) {
+ error = xfs_iunlink(sc->tp, sc->ip);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/* Repair an inode's fields. */
int
xrep_inode(
@@ -1746,5 +1892,10 @@ xrep_inode(
return error;
}
+ /* Reconnect incore unlinked list */
+ error = xrep_inode_unlinked(sc);
+ if (error)
+ return error;
+
return xrep_defer_finish(sc);
}
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
index ec3478bc505e..cf9d983667ce 100644
--- a/fs/xfs/scrub/iscan.c
+++ b/fs/xfs/scrub/iscan.c
@@ -243,6 +243,51 @@ xchk_iscan_finish(
mutex_unlock(&iscan->lock);
}
+/* Mark an inode scan finished before we actually scan anything. */
+void
+xchk_iscan_finish_early(
+ struct xchk_iscan *iscan)
+{
+ ASSERT(iscan->cursor_ino == iscan->scan_start_ino);
+ ASSERT(iscan->__visited_ino == iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+}
+
+/*
+ * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set,
+ * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed,
+ * or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_read_agi(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf **agi_bpp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned long relax;
+ int ret;
+
+ if (!xchk_iscan_agi_needs_trylock(iscan))
+ return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp);
+
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ do {
+ ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK,
+ agi_bpp);
+ if (ret != -EAGAIN)
+ return ret;
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ trace_xchk_iscan_agi_retry_wait(iscan);
+ } while (!schedule_timeout_killable(relax) &&
+ !xchk_iscan_aborted(iscan));
+ return -ECANCELED;
+}
+
/*
* Advance ino to the next inode that the inobt thinks is allocated, being
* careful to jump to the next AG if we've reached the right end of this AG's
@@ -281,7 +326,7 @@ xchk_iscan_advance(
if (!pag)
return -ECANCELED;
- ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ ret = xchk_iscan_read_agi(iscan, pag, &agi_bp);
if (ret)
goto out_pag;
@@ -363,6 +408,15 @@ xchk_iscan_iget_retry(
}
/*
+ * For an inode scan, we hold the AGI and want to try to grab a batch of
+ * inodes. Holding the AGI prevents inodegc from clearing freed inodes,
+ * so we must use noretry here. For every inode after the first one in the
+ * batch, we don't want to wait, so we use noretry there too. Finally, use
+ * dontcache to avoid polluting the cache.
+ */
+#define ISCAN_IGET_FLAGS (XFS_IGET_NORETRY | XFS_IGET_DONTCACHE)
+
+/*
* Grab an inode as part of an inode scan. While scanning this inode, the
* caller must ensure that no other threads can modify the inode until a call
* to xchk_iscan_visit succeeds.
@@ -389,7 +443,7 @@ xchk_iscan_iget(
ASSERT(iscan->__inodes[0] == NULL);
/* Fill the first slot in the inode array. */
- error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
&iscan->__inodes[idx]);
trace_xchk_iscan_iget(iscan, error);
@@ -402,8 +456,13 @@ xchk_iscan_iget(
* It's possible that this inode has lost all of its links but
* hasn't yet been inactivated. If we don't have a transaction
* or it's not writable, flush the inodegc workers and wait.
+ * If we have a non-empty transaction, we must not block on
+ * inodegc, which allocates its own transactions.
*/
- xfs_inodegc_flush(mp);
+ if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
+ xfs_inodegc_push(mp);
+ else
+ xfs_inodegc_flush(mp);
return xchk_iscan_iget_retry(iscan, true);
}
@@ -457,7 +516,7 @@ xchk_iscan_iget(
ASSERT(iscan->__inodes[idx] == NULL);
- error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
&iscan->__inodes[idx]);
if (error)
break;
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
index 71f657552dfa..f9f47fa01a9e 100644
--- a/fs/xfs/scrub/iscan.h
+++ b/fs/xfs/scrub/iscan.h
@@ -59,6 +59,9 @@ struct xchk_iscan {
/* Set if the scan has been aborted due to some event in the fs. */
#define XCHK_ISCAN_OPSTATE_ABORTED (1)
+/* Use trylock to acquire the AGI */
+#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2)
+
static inline bool
xchk_iscan_aborted(const struct xchk_iscan *iscan)
{
@@ -71,8 +74,21 @@ xchk_iscan_abort(struct xchk_iscan *iscan)
set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
}
+static inline bool
+xchk_iscan_agi_needs_trylock(const struct xchk_iscan *iscan)
+{
+ return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan)
+{
+ set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate);
+}
+
void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_finish_early(struct xchk_iscan *iscan);
void xchk_iscan_teardown(struct xchk_iscan *iscan);
int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
diff --git a/fs/xfs/scrub/listxattr.c b/fs/xfs/scrub/listxattr.c
new file mode 100644
index 000000000000..256ff7700c94
--- /dev/null
+++ b/fs/xfs/scrub/listxattr.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_trans.h"
+#include "scrub/scrub.h"
+#include "scrub/bitmap.h"
+#include "scrub/dab_bitmap.h"
+#include "scrub/listxattr.h"
+
+/* Call a function for every entry in a shortform xattr structure. */
+STATIC int
+xchk_xattr_walk_sf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ void *priv)
+{
+ struct xfs_attr_sf_hdr *hdr = ip->i_af.if_data;
+ struct xfs_attr_sf_entry *sfe;
+ unsigned int i;
+ int error;
+
+ sfe = xfs_attr_sf_firstentry(hdr);
+ for (i = 0; i < hdr->count; i++) {
+ error = attr_fn(sc, ip, sfe->flags, sfe->nameval, sfe->namelen,
+ &sfe->nameval[sfe->namelen], sfe->valuelen,
+ priv);
+ if (error)
+ return error;
+
+ sfe = xfs_attr_sf_nextentry(sfe);
+ }
+
+ return 0;
+}
+
+/* Call a function for every entry in this xattr leaf block. */
+STATIC int
+xchk_xattr_walk_leaf_entries(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ struct xfs_buf *bp,
+ void *priv)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ unsigned int i;
+ int error;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ void *value;
+ unsigned char *name;
+ unsigned int namelen, valuelen;
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ struct xfs_attr_leaf_name_local *name_loc;
+
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ name = name_loc->nameval;
+ namelen = name_loc->namelen;
+ value = &name_loc->nameval[name_loc->namelen];
+ valuelen = be16_to_cpu(name_loc->valuelen);
+ } else {
+ struct xfs_attr_leaf_name_remote *name_rmt;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ name = name_rmt->name;
+ namelen = name_rmt->namelen;
+ value = NULL;
+ valuelen = be32_to_cpu(name_rmt->valuelen);
+ }
+
+ error = attr_fn(sc, ip, entry->flags, name, namelen, value,
+ valuelen, priv);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Call a function for every entry in a leaf-format xattr structure. Avoid
+ * memory allocations for the loop detector since there's only one block.
+ */
+STATIC int
+xchk_xattr_walk_leaf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ void *priv)
+{
+ struct xfs_buf *leaf_bp;
+ int error;
+
+ error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, 0, &leaf_bp);
+ if (error)
+ return error;
+
+ error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, priv);
+ xfs_trans_brelse(sc->tp, leaf_bp);
+ return error;
+}
+
+/* Find the leftmost leaf in the xattr dabtree. */
+STATIC int
+xchk_xattr_find_leftmost_leaf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ struct xdab_bitmap *seen_dablks,
+ struct xfs_buf **leaf_bpp)
+{
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_buf *bp;
+ xfs_failaddr_t fa;
+ xfs_dablk_t blkno = 0;
+ unsigned int expected_level = 0;
+ int error;
+
+ for (;;) {
+ xfs_extlen_t len = 1;
+ uint16_t magic;
+
+ /* Make sure we haven't seen this new block already. */
+ if (xdab_bitmap_test(seen_dablks, blkno, &len))
+ return -EFSCORRUPTED;
+
+ error = xfs_da3_node_read(tp, ip, blkno, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+
+ error = -EFSCORRUPTED;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC)
+ goto out_buf;
+
+ fa = xfs_da3_node_header_check(bp, ip->i_ino);
+ if (fa)
+ goto out_buf;
+
+ xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
+
+ if (nodehdr.count == 0 || nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ goto out_buf;
+
+ /* Check the level from the root node. */
+ if (blkno == 0)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ goto out_buf;
+ else
+ expected_level--;
+
+ /* Remember that we've seen this node. */
+ error = xdab_bitmap_set(seen_dablks, blkno, 1);
+ if (error)
+ goto out_buf;
+
+ /* Find the next level towards the leaves of the dabtree. */
+ btree = nodehdr.btree;
+ blkno = be32_to_cpu(btree->before);
+ xfs_trans_brelse(tp, bp);
+ }
+
+ error = -EFSCORRUPTED;
+ fa = xfs_attr3_leaf_header_check(bp, ip->i_ino);
+ if (fa)
+ goto out_buf;
+
+ if (expected_level != 0)
+ goto out_buf;
+
+ /* Remember that we've seen this leaf. */
+ error = xdab_bitmap_set(seen_dablks, blkno, 1);
+ if (error)
+ goto out_buf;
+
+ *leaf_bpp = bp;
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(tp, bp);
+ return error;
+}
+
+/* Call a function for every entry in a node-format xattr structure. */
+STATIC int
+xchk_xattr_walk_node(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ xchk_xattrleaf_fn leaf_fn,
+ void *priv)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xdab_bitmap seen_dablks;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_buf *leaf_bp;
+ int error;
+
+ xdab_bitmap_init(&seen_dablks);
+
+ error = xchk_xattr_find_leftmost_leaf(sc, ip, &seen_dablks, &leaf_bp);
+ if (error)
+ goto out_bitmap;
+
+ for (;;) {
+ xfs_extlen_t len;
+
+ error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp,
+ priv);
+ if (error)
+ goto out_leaf;
+
+ /* Find the right sibling of this leaf block. */
+ leaf = leaf_bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ if (leafhdr.forw == 0)
+ goto out_leaf;
+
+ xfs_trans_brelse(sc->tp, leaf_bp);
+
+ if (leaf_fn) {
+ error = leaf_fn(sc, priv);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Make sure we haven't seen this new leaf already. */
+ len = 1;
+ if (xdab_bitmap_test(&seen_dablks, leafhdr.forw, &len)) {
+ error = -EFSCORRUPTED;
+ goto out_bitmap;
+ }
+
+ error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino,
+ leafhdr.forw, &leaf_bp);
+ if (error)
+ goto out_bitmap;
+
+ /* Remember that we've seen this new leaf. */
+ error = xdab_bitmap_set(&seen_dablks, leafhdr.forw, 1);
+ if (error)
+ goto out_leaf;
+ }
+
+out_leaf:
+ xfs_trans_brelse(sc->tp, leaf_bp);
+out_bitmap:
+ xdab_bitmap_destroy(&seen_dablks);
+ return error;
+}
+
+/*
+ * Call a function for every extended attribute in a file.
+ *
+ * Callers must hold the ILOCK. No validation or cursor restarts allowed.
+ * Returns -EFSCORRUPTED on any problem, including loops in the dabtree.
+ */
+int
+xchk_xattr_walk(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ xchk_xattrleaf_fn leaf_fn,
+ void *priv)
+{
+ int error;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ if (!xfs_inode_hasattr(ip))
+ return 0;
+
+ if (ip->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xchk_xattr_walk_sf(sc, ip, attr_fn, priv);
+
+ /* attr functions require that the attr fork is loaded */
+ error = xfs_iread_extents(sc->tp, ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ if (xfs_attr_is_leaf(ip))
+ return xchk_xattr_walk_leaf(sc, ip, attr_fn, priv);
+
+ return xchk_xattr_walk_node(sc, ip, attr_fn, leaf_fn, priv);
+}
diff --git a/fs/xfs/scrub/listxattr.h b/fs/xfs/scrub/listxattr.h
new file mode 100644
index 000000000000..703cfb7b14cf
--- /dev/null
+++ b/fs/xfs/scrub/listxattr.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_LISTXATTR_H__
+#define __XFS_SCRUB_LISTXATTR_H__
+
+typedef int (*xchk_xattr_fn)(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int attr_flags, const unsigned char *name,
+ unsigned int namelen, const void *value, unsigned int valuelen,
+ void *priv);
+
+typedef int (*xchk_xattrleaf_fn)(struct xfs_scrub *sc, void *priv);
+
+int xchk_xattr_walk(struct xfs_scrub *sc, struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn, xchk_xattrleaf_fn leaf_fn, void *priv);
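+
+/*
+ * Example callback, hypothetical and for illustration only: count every
+ * attr entry in a file by passing &count as the @priv argument:
+ *
+ *	static int xchk_count_attr(struct xfs_scrub *sc, struct xfs_inode *ip,
+ *			unsigned int attr_flags, const unsigned char *name,
+ *			unsigned int namelen, const void *value,
+ *			unsigned int valuelen, void *priv)
+ *	{
+ *		unsigned int *count = priv;
+ *
+ *		(*count)++;
+ *		return 0;
+ *	}
+ */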
+
+#endif /* __XFS_SCRUB_LISTXATTR_H__ */
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 8a7d9557897c..80aee30886c4 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -18,15 +18,19 @@
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ag.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
#include "scrub/nlinks.h"
#include "scrub/trace.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/listxattr.h"
/*
* Live Inode Link Count Checking
@@ -43,11 +47,23 @@ int
xchk_setup_nlinks(
struct xfs_scrub *sc)
{
+ struct xchk_nlink_ctrs *xnc;
+ int error;
+
xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
- sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
- if (!sc->buf)
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_nlinks(sc);
+ if (error)
+ return error;
+ }
+
+ xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
+ if (!xnc)
return -ENOMEM;
+ xnc->xname.name = xnc->namebuf;
+ xnc->sc = sc;
+ sc->buf = xnc;
return xchk_setup_fs(sc);
}
@@ -152,6 +168,13 @@ xchk_nlinks_live_update(
xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(p->dp))
+ return NOTIFY_DONE;
+
trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
p->delta, p->name->name, p->name->len);
@@ -251,12 +274,17 @@ xchk_nlinks_collect_dirent(
* number of parents of the root directory.
*
* Otherwise, increment the number of backrefs pointing back to ino.
+ *
+ * If the filesystem has parent pointers, we walk the pptrs to
+ * determine the backref count.
*/
if (dotdot) {
if (dp == sc->mp->m_rootip)
error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
- else
+ else if (!xfs_has_parent(sc->mp))
error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
+ else
+ error = 0;
if (error)
goto out_unlock;
}
@@ -293,6 +321,61 @@ out_incomplete:
return error;
}
+/* Bump the backref count for the inode referenced by this parent pointer. */
+STATIC int
+xchk_nlinks_collect_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_nlink_ctrs *xnc = priv;
+ const struct xfs_parent_rec *pptr_rec = value;
+ xfs_ino_t parent_ino;
+ int error;
+
+ /* Update the shadow link counts if we haven't already failed. */
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec);
+
+ mutex_lock(&xnc->lock);
+
+ error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0);
+ if (error)
+ goto out_unlock;
+
+ mutex_unlock(&xnc->lock);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xnc->lock);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(sc);
+ return error;
+}
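
In effect, each parent pointer contributes one backref to its parent's shadow
link count, mirroring what the dirent scanner does on filesystems without
parent pointers. The two sources never double count: as the hunk above shows,
the dirent scanner skips dotdot backrefs entirely when parent pointers are
enabled.
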
+
/* Walk a directory to bump the observed link counts of the children. */
STATIC int
xchk_nlinks_collect_dir(
@@ -303,6 +386,13 @@ xchk_nlinks_collect_dir(
unsigned int lock_mode;
int error = 0;
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
/* Prevent anyone from changing this directory while we walk it. */
xfs_ilock(dp, XFS_IOLOCK_SHARED);
lock_mode = xfs_ilock_data_map_shared(dp);
@@ -332,6 +422,28 @@ xchk_nlinks_collect_dir(
if (error)
goto out_abort;
+ /* Walk the parent pointers to get real backref counts. */
+ if (xfs_has_parent(sc->mp)) {
+ /*
+ * If the extended attributes look as though they have been
+ * zapped by the inode record repair code, we cannot scan for
+ * parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL,
+ xnc);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (error)
+ goto out_abort;
+ }
+
xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
goto out_unlock;
@@ -537,6 +649,14 @@ xchk_nlinks_compare_inode(
unsigned int actual_nlink;
int error;
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
+
xfs_ilock(ip, XFS_ILOCK_SHARED);
mutex_lock(&xnc->lock);
@@ -571,9 +691,11 @@ xchk_nlinks_compare_inode(
* this as a corruption. The VFS won't let users increase the link
* count, but it will let them decrease it.
*/
- if (total_links > XFS_MAXLINK) {
+ if (total_links > XFS_NLINK_PINNED) {
xchk_ino_set_corrupt(sc, ip->i_ino);
goto out_corrupt;
+ } else if (total_links > XFS_MAXLINK) {
+ xchk_ino_set_warning(sc, ip->i_ino);
}
/* Link counts should match. */
@@ -850,9 +972,6 @@ xchk_nlinks_setup_scan(
xfs_agino_t first_agino, last_agino;
int error;
- ASSERT(xnc->sc == NULL);
- xnc->sc = sc;
-
mutex_init(&xnc->lock);
/* Retry iget every tenth of a second for up to 30 seconds. */
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
index a950f3daf204..b820712bfd87 100644
--- a/fs/xfs/scrub/nlinks.h
+++ b/fs/xfs/scrub/nlinks.h
@@ -28,6 +28,13 @@ struct xchk_nlink_ctrs {
* from other writer threads.
*/
struct xfs_dir_hook dhook;
+
+ /* Orphanage reparenting request. */
+ struct xrep_adoption adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
};
/*
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index b87618322f55..b3e707f47b7b 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -17,14 +17,19 @@
#include "xfs_iwalk.h"
#include "xfs_ialloc.h"
#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
#include "scrub/nlinks.h"
#include "scrub/trace.h"
+#include "scrub/tempfile.h"
/*
* Live Inode Link Count Repair
@@ -36,6 +41,48 @@
* inode is locked.
*/
+/* Set up to repair inode link counts. */
+int
+xrep_setup_nlinks(
+ struct xfs_scrub *sc)
+{
+ return xrep_orphanage_try_create(sc);
+}
+
+/*
+ * Inodes that aren't the root directory or the orphanage, have a nonzero link
+ * count, and no observed parents should be moved to the orphanage.
+ */
+static inline bool
+xrep_nlinks_is_orphaned(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int actual_nlink,
+ const struct xchk_nlink *obs)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (obs->parents != 0)
+ return false;
+ if (ip == mp->m_rootip || ip == sc->orphanage)
+ return false;
+ return actual_nlink != 0;
+}
+
+/* Remove an inode from the unlinked list. */
+STATIC int
+xrep_nlinks_iunlink_remove(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag;
+ int error;
+
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
+ error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
+ xfs_perag_put(pag);
+ return error;
+}
+
/*
* Correct the link count of the given inode. Because we have to grab locks
* and resources in a certain order, it's possible that this will be a no-op.
@@ -50,17 +97,55 @@ xrep_nlinks_repair_inode(
struct xfs_inode *ip = sc->ip;
uint64_t total_links;
uint64_t actual_nlink;
+ bool orphanage_available = false;
bool dirty = false;
int error;
- xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
- if (error)
- return error;
+ /*
+ * If the filesystem has an orphanage attached to the scrub context,
+ * prepare for a link count repair that could involve @ip being adopted
+ * by the lost+found.
+ */
+ if (xrep_orphanage_can_adopt(sc)) {
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
- xchk_ilock(sc, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(sc->tp, ip, 0);
+ error = xrep_adoption_trans_alloc(sc, &xnc->adoption);
+ if (error) {
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ } else {
+ orphanage_available = true;
+ }
+ }
+
+ /*
+ * Either there is no orphanage or we couldn't allocate resources for
+ * that kind of update. Let's try again with only the resources we
+ * need for a simple link count update, since that's much more common.
+ */
+ if (!orphanage_available) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0,
+ &sc->tp);
+ if (error) {
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(sc->tp, ip, 0);
+ }
mutex_lock(&xnc->lock);
@@ -99,28 +184,68 @@ xrep_nlinks_repair_inode(
}
/*
- * We did not find any links to this inode. If the inode agrees, we
- * have nothing further to do. If not, the inode has a nonzero link
- * count and we don't have anywhere to graft the child onto. Dropping
- * a live inode's link count to zero can cause unexpected shutdowns in
- * inactivation, so leave it alone.
+ * Decide if we're going to move this file to the orphanage, and fix
+ * up the incore link counts if we are.
*/
- if (total_links == 0) {
- if (actual_nlink != 0)
- trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
- goto out_trans;
+ if (orphanage_available &&
+ xrep_nlinks_is_orphaned(sc, ip, actual_nlink, &obs)) {
+ /* Figure out what name we're going to use here. */
+ error = xrep_adoption_compute_name(&xnc->adoption, &xnc->xname);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Reattach this file to the directory tree by moving it to
+ * the orphanage per the adoption parameters that we already
+ * computed.
+ */
+ error = xrep_adoption_move(&xnc->adoption);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Re-read the link counts since the reparenting will have
+ * updated our scan info.
+ */
+ mutex_lock(&xnc->lock);
+ error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_trans;
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+ dirty = true;
}
- /* Commit the new link count if it changed. */
- if (total_links != actual_nlink) {
- if (total_links > XFS_MAXLINK) {
- trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ /*
+ * If this inode is linked from the directory tree and on the unlinked
+ * list, remove it from the unlinked list.
+ */
+ if (total_links > 0 && xfs_inode_on_unlinked_list(ip)) {
+ error = xrep_nlinks_iunlink_remove(sc);
+ if (error)
goto out_trans;
- }
+ dirty = true;
+ }
+ /*
+ * If this inode is not linked from the directory tree but is not on
+ * the unlinked list either, put it on the unlinked list.
+ */
+ if (total_links == 0 && !xfs_inode_on_unlinked_list(ip)) {
+ error = xfs_iunlink(sc->tp, ip);
+ if (error)
+ goto out_trans;
+ dirty = true;
+ }
+
+ /* Commit the new link count if it changed. */
+ if (total_links != actual_nlink) {
trace_xrep_nlinks_update_inode(mp, ip, &obs);
- set_nlink(VFS_I(ip), total_links);
+ set_nlink(VFS_I(ip), min_t(unsigned long long, total_links,
+ XFS_NLINK_PINNED));
dirty = true;
}
@@ -132,14 +257,19 @@ xrep_nlinks_repair_inode(
xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
error = xrep_trans_commit(sc);
- xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- return error;
+ goto out_unlock;
out_scanlock:
mutex_unlock(&xnc->lock);
out_trans:
xchk_trans_cancel(sc);
- xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_unlock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ if (orphanage_available) {
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ }
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
return error;
}
@@ -172,10 +302,10 @@ xrep_nlinks(
/*
* We need ftype for an accurate count of the number of child
* subdirectory links. Child subdirectories with a back link (dotdot
- * entry) but no forward link are unfixable, so we cannot repair the
- * link count of the parent directory based on the back link count
- * alone. Filesystems without ftype support are rare (old V4) so we
- * just skip out here.
+ * entry) but no forward link are moved to the orphanage, so we cannot
+ * repair the link count of the parent directory based on the back link
+ * count alone. Filesystems without ftype support are rare (old V4) so
+ * we just skip out here.
*/
if (!xfs_has_ftype(sc->mp))
return -EOPNOTSUPP;
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
new file mode 100644
index 000000000000..7148d8362db8
--- /dev/null
+++ b/fs/xfs/scrub/orphanage.c
@@ -0,0 +1,627 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_parent.h"
+#include "xfs_attr_sf.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/orphanage.h"
+#include "scrub/readdir.h"
+
+#include <linux/namei.h>
+
+/*
+ * The Orphanage
+ * =============
+ *
+ * If the directory tree is damaged, children of that directory become
+ * inaccessible via that file path. If a child has no other parents, the file
+ * is said to be orphaned. xfs_repair fixes this situation by creating an
+ * orphanage directory (specifically, /lost+found) and creating a directory
+ * entry pointing to the orphaned file.
+ *
+ * Online repair follows this tactic by creating a root-owned /lost+found
+ * directory if one does not exist. If an orphan is found, it will move that
+ * file into the orphanage.
+ */
+
+/* Make the orphanage owned by root. */
+STATIC int
+xrep_chown_orphanage(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dquot *udqp = NULL, *gdqp = NULL, *pdqp = NULL;
+ struct xfs_dquot *oldu = NULL, *oldg = NULL, *oldp = NULL;
+ struct inode *inode = VFS_I(dp);
+ int error;
+
+ error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+ XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ error = xfs_trans_alloc_ichange(dp, udqp, gdqp, pdqp, true, &tp);
+ if (error)
+ goto out_dqrele;
+
+ /*
+ * Always clear setuid/setgid/sticky on the orphanage since we don't
+ * normally want that functionality on this directory and xfs_repair
+ * doesn't create it this way either. Leave the other access bits
+ * unchanged.
+ */
+ inode->i_mode &= ~(S_ISUID | S_ISGID | S_ISVTX);
+
+ /*
+ * Change the ownerships and register quota modifications
+ * in the transaction.
+ */
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) {
+ if (XFS_IS_UQUOTA_ON(mp))
+ oldu = xfs_qm_vop_chown(tp, dp, &dp->i_udquot, udqp);
+ inode->i_uid = GLOBAL_ROOT_UID;
+ }
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) {
+ if (XFS_IS_GQUOTA_ON(mp))
+ oldg = xfs_qm_vop_chown(tp, dp, &dp->i_gdquot, gdqp);
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
+ if (dp->i_projid != 0) {
+ if (XFS_IS_PQUOTA_ON(mp))
+ oldp = xfs_qm_vop_chown(tp, dp, &dp->i_pdquot, pdqp);
+ dp->i_projid = 0;
+ }
+
+ dp->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+
+ xfs_qm_dqrele(oldu);
+ xfs_qm_dqrele(oldg);
+ xfs_qm_dqrele(oldp);
+
+out_dqrele:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+ return error;
+}
+
+#define ORPHANAGE "lost+found"
+
+/* Create the orphanage directory, and set sc->orphanage to it. */
+int
+xrep_orphanage_create(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct dentry *root_dentry, *orphanage_dentry;
+ struct inode *root_inode = VFS_I(sc->mp->m_rootip);
+ struct inode *orphanage_inode;
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (xfs_is_readonly(mp)) {
+ sc->orphanage = NULL;
+ return 0;
+ }
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->orphanage == NULL);
+
+ /* Find the dentry for the root directory... */
+ root_dentry = d_find_alias(root_inode);
+ if (!root_dentry) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* ...which is a directory, right? */
+ if (!d_is_dir(root_dentry)) {
+ error = -EFSCORRUPTED;
+ goto out_dput_root;
+ }
+
+ /* Try to find the orphanage directory. */
+ inode_lock_nested(root_inode, I_MUTEX_PARENT);
+ orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry,
+ strlen(ORPHANAGE));
+ if (IS_ERR(orphanage_dentry)) {
+ error = PTR_ERR(orphanage_dentry);
+ goto out_unlock_root;
+ }
+
+ /*
+ * Nothing found? Call mkdir to create the orphanage. Create the
+ * directory without other-user access because we're live and someone
+ * could have been relying partly on minimal access to a parent
+ * directory to control access to a file we put in here.
+ */
+ if (d_really_is_negative(orphanage_dentry)) {
+ error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry,
+ 0750);
+ if (error)
+ goto out_dput_orphanage;
+ }
+
+ /* Not a directory? Bail out. */
+ if (!d_is_dir(orphanage_dentry)) {
+ error = -ENOTDIR;
+ goto out_dput_orphanage;
+ }
+
+ /*
+ * Grab a reference to the orphanage. This /should/ succeed since
+ * we hold the root directory locked and therefore nobody can delete
+ * the orphanage.
+ */
+ orphanage_inode = igrab(d_inode(orphanage_dentry));
+ if (!orphanage_inode) {
+ error = -ENOENT;
+ goto out_dput_orphanage;
+ }
+
+ /* Make sure the orphanage is owned by root. */
+ error = xrep_chown_orphanage(sc, XFS_I(orphanage_inode));
+ if (error)
+ goto out_dput_orphanage;
+
+ /* Stash the reference for later and bail out. */
+ sc->orphanage = XFS_I(orphanage_inode);
+ sc->orphanage_ilock_flags = 0;
+
+out_dput_orphanage:
+ dput(orphanage_dentry);
+out_unlock_root:
+ inode_unlock(VFS_I(sc->mp->m_rootip));
+out_dput_root:
+ dput(root_dentry);
+out:
+ return error;
+}
+
+void
+xrep_orphanage_ilock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ sc->orphanage_ilock_flags |= ilock_flags;
+ xfs_ilock(sc->orphanage, ilock_flags);
+}
+
+bool
+xrep_orphanage_ilock_nowait(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ if (xfs_ilock_nowait(sc->orphanage, ilock_flags)) {
+ sc->orphanage_ilock_flags |= ilock_flags;
+ return true;
+ }
+
+ return false;
+}
+
+void
+xrep_orphanage_iunlock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ xfs_iunlock(sc->orphanage, ilock_flags);
+ sc->orphanage_ilock_flags &= ~ilock_flags;
+}
+
+/* Grab the IOLOCK of the orphanage and sc->ip. */
+int
+xrep_orphanage_iolock_two(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ while (true) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Normal XFS takes the IOLOCK before grabbing a transaction.
+ * Scrub holds a transaction, which means that we can't block
+ * on either IOLOCK.
+ */
+ if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ }
+ delay(1);
+ }
+
+ return 0;
+}
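
The nowait-and-retry loop follows from the lock ordering rule stated in the
comment: ordinary threads take the IOLOCK before allocating a transaction, so
a scrubber that may already hold transaction resources can only trylock. A
sketch of the intended pairing with xrep_adoption_trans_alloc(), modeled on
the nlinks repair caller earlier in this patch (abbreviated):

	error = xrep_orphanage_iolock_two(sc);
	if (error)
		return error;

	error = xrep_adoption_trans_alloc(sc, &adopt);
	if (error) {
		/* Back out both IOLOCKs; fall back to a plain repair. */
		xchk_iunlock(sc, XFS_IOLOCK_EXCL);
		xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
	}
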
+
+/* Release the orphanage. */
+void
+xrep_orphanage_rele(
+ struct xfs_scrub *sc)
+{
+ if (!sc->orphanage)
+ return;
+
+ if (sc->orphanage_ilock_flags)
+ xfs_iunlock(sc->orphanage, sc->orphanage_ilock_flags);
+
+ xchk_irele(sc, sc->orphanage);
+ sc->orphanage = NULL;
+}
+
+/* Adoption moves a file into /lost+found */
+
+/* Can the orphanage adopt @sc->ip? */
+bool
+xrep_orphanage_can_adopt(
+ struct xfs_scrub *sc)
+{
+ ASSERT(sc->ip != NULL);
+
+ if (!sc->orphanage)
+ return false;
+ if (sc->ip == sc->orphanage)
+ return false;
+ if (xfs_internal_inum(sc->mp, sc->ip->i_ino))
+ return false;
+ return true;
+}
+
+/*
+ * Create a new transaction to send a child to the orphanage.
+ *
+ * Allocate a new transaction with sufficient disk space to handle the
+ * adoption, take ILOCK_EXCL of the orphanage and sc->ip, join them to the
+ * transaction, and reserve quota to reparent the latter. Caller must hold the
+ * IOLOCK of the orphanage and sc->ip.
+ */
+int
+xrep_adoption_trans_alloc(
+ struct xfs_scrub *sc,
+ struct xrep_adoption *adopt)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned int child_blkres = 0;
+ int error;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->ip != NULL);
+ ASSERT(sc->orphanage != NULL);
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+ ASSERT(sc->orphanage_ilock_flags & XFS_IOLOCK_EXCL);
+ ASSERT(!(sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+ ASSERT(!(sc->orphanage_ilock_flags &
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+
+ /* Compute the worst case space reservation that we need. */
+ adopt->sc = sc;
+ adopt->orphanage_blkres = xfs_link_space_res(mp, MAXNAMELEN);
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ child_blkres = xfs_rename_space_res(mp, 0, false,
+ xfs_name_dotdot.len, false);
+ if (xfs_has_parent(mp))
+ child_blkres += XFS_ADDAFORK_SPACE_RES(mp);
+ adopt->child_blkres = child_blkres;
+
+ /*
+ * Allocate a transaction to link the child into the parent, along with
+ * enough disk space to handle expansion of both the orphanage and the
+ * dotdot entry of a child directory.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link,
+ adopt->orphanage_blkres + adopt->child_blkres, 0, 0,
+ &sc->tp);
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(sc->orphanage, XFS_ILOCK_EXCL,
+ sc->ip, XFS_ILOCK_EXCL);
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ sc->orphanage_ilock_flags |= XFS_ILOCK_EXCL;
+
+ xfs_trans_ijoin(sc->tp, sc->orphanage, 0);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /*
+ * Reserve enough quota in the orphan directory to add the new name.
+ * Normally the orphanage should have user/group/project ids of zero
+ * and hence is not subject to quota enforcement, but we're allowed to
+ * exceed quota to reattach disconnected parts of the directory tree.
+ */
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->orphanage,
+ adopt->orphanage_blkres, 0, true);
+ if (error)
+ goto out_cancel;
+
+ /*
+ * Reserve enough quota in the child directory to change dotdot.
+ * Here we're also allowed to exceed file quota to repair inconsistent
+ * metadata.
+ */
+ if (adopt->child_blkres) {
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->ip,
+ adopt->child_blkres, 0, true);
+ if (error)
+ goto out_cancel;
+ }
+
+ return 0;
+out_cancel:
+ xchk_trans_cancel(sc);
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Compute the xfs_name for the directory entry that we're adding to the
+ * orphanage. Caller must hold ILOCKs of sc->ip and the orphanage and must not
+ * reuse namebuf until the adoption completes or is dissolved.
+ */
+int
+xrep_adoption_compute_name(
+ struct xrep_adoption *adopt,
+ struct xfs_name *xname)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ char *namebuf = (void *)xname->name;
+ xfs_ino_t ino;
+ unsigned int incr = 0;
+ int error = 0;
+
+ adopt->xname = xname;
+ xname->len = snprintf(namebuf, MAXNAMELEN, "%llu", sc->ip->i_ino);
+ xname->type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode);
+
+ /* Make sure the filename is unique in the lost+found. */
+ error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino);
+ while (error == 0 && incr < 10000) {
+ xname->len = snprintf(namebuf, MAXNAMELEN, "%llu.%u",
+ sc->ip->i_ino, ++incr);
+ error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino);
+ }
+ if (error == 0) {
+ /* We already have 10,000 entries in the orphanage? */
+ return -EFSCORRUPTED;
+ }
+
+ if (error != -ENOENT)
+ return error;
+ return 0;
+}
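
For example, a child with inode number 12345 is first tried as "12345"; if
that name is already taken, the function tries "12345.1", "12345.2", and so
on. Only an -ENOENT lookup result (the name is free) lets the adoption
proceed, and ten thousand collisions are taken as a sign that the orphanage
itself is corrupt.
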
+
+/*
+ * Make sure the dcache does not have a positive dentry for the name we've
+ * chosen. The caller should have checked with the ondisk directory, so any
+ * discrepancy is a sign that something is seriously wrong.
+ */
+static int
+xrep_adoption_check_dcache(
+ struct xrep_adoption *adopt)
+{
+ struct qstr qname = QSTR_INIT(adopt->xname->name,
+ adopt->xname->len);
+ struct xfs_scrub *sc = adopt->sc;
+ struct dentry *d_orphanage, *d_child;
+ int error = 0;
+
+ d_orphanage = d_find_alias(VFS_I(sc->orphanage));
+ if (!d_orphanage)
+ return 0;
+
+ d_child = d_hash_and_lookup(d_orphanage, &qname);
+ if (d_child) {
+ trace_xrep_adoption_check_child(sc->mp, d_child);
+
+ if (d_is_positive(d_child)) {
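+ /* This should never happen; the ASSERT trips DEBUG kernels. */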
+ ASSERT(d_is_negative(d_child));
+ error = -EFSCORRUPTED;
+ }
+
+ dput(d_child);
+ }
+
+ dput(d_orphanage);
+ return error;
+}
+
+/*
+ * Invalidate all dentries for the name that was added to the orphanage
+ * directory, and all dentries pointing to the child inode that was moved.
+ *
+ * There should not be any positive entries for the name, since we've
+ * maintained our lock on the orphanage directory.
+ */
+static void
+xrep_adoption_zap_dcache(
+ struct xrep_adoption *adopt)
+{
+ struct qstr qname = QSTR_INIT(adopt->xname->name,
+ adopt->xname->len);
+ struct xfs_scrub *sc = adopt->sc;
+ struct dentry *d_orphanage, *d_child;
+
+ /* Invalidate all dentries for the adoption name */
+ d_orphanage = d_find_alias(VFS_I(sc->orphanage));
+ if (!d_orphanage)
+ return;
+
+ d_child = d_hash_and_lookup(d_orphanage, &qname);
+ while (d_child != NULL) {
+ trace_xrep_adoption_invalidate_child(sc->mp, d_child);
+
+ ASSERT(d_is_negative(d_child));
+ d_invalidate(d_child);
+ dput(d_child);
+ d_child = d_lookup(d_orphanage, &qname);
+ }
+
+ dput(d_orphanage);
+
+ /* Invalidate all the dentries pointing down to this file. */
+ while ((d_child = d_find_alias(VFS_I(sc->ip))) != NULL) {
+ trace_xrep_adoption_invalidate_child(sc->mp, d_child);
+
+ d_invalidate(d_child);
+ dput(d_child);
+ }
+}
+
+/*
+ * If we have to add an attr fork ahead of a parent pointer update, how much
+ * space should we ask for?
+ */
+static inline int
+xrep_adoption_attr_sizeof(
+ const struct xrep_adoption *adopt)
+{
+ return sizeof(struct xfs_attr_sf_hdr) +
+ xfs_attr_sf_entsize_byname(sizeof(struct xfs_parent_rec),
+ adopt->xname->len);
+}
+
+/*
+ * Move the current file to the orphanage under the computed name.
+ *
+ * Returns with a dirty transaction so that the caller can handle any other
+ * work, such as fixing up unlinked lists or resetting link counts.
+ */
+int
+xrep_adoption_move(
+ struct xrep_adoption *adopt)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ bool isdir = S_ISDIR(VFS_I(sc->ip)->i_mode);
+ int error;
+
+ trace_xrep_adoption_reparent(sc->orphanage, adopt->xname,
+ sc->ip->i_ino);
+
+ error = xrep_adoption_check_dcache(adopt);
+ if (error)
+ return error;
+
+ /*
+ * If this filesystem has parent pointers, ensure that the file being
+ * moved to the orphanage has an attribute fork. This is required
+ * because the parent pointer code does not itself add attr forks.
+ */
+ if (!xfs_inode_has_attr_fork(sc->ip) && xfs_has_parent(sc->mp)) {
+ int sf_size = xrep_adoption_attr_sizeof(adopt);
+
+ error = xfs_bmap_add_attrfork(sc->tp, sc->ip, sf_size, true);
+ if (error)
+ return error;
+ }
+
+ /* Create the new name in the orphanage. */
+ error = xfs_dir_createname(sc->tp, sc->orphanage, adopt->xname,
+ sc->ip->i_ino, adopt->orphanage_blkres);
+ if (error)
+ return error;
+
+ /*
+ * Bump the link count of the orphanage if we just added a
+ * subdirectory, and update its timestamps.
+ */
+ xfs_trans_ichgtime(sc->tp, sc->orphanage,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ if (isdir)
+ xfs_bumplink(sc->tp, sc->orphanage);
+ xfs_trans_log_inode(sc->tp, sc->orphanage, XFS_ILOG_CORE);
+
+ /* Bump the link count of the child. */
+ if (adopt->bump_child_nlink) {
+ xfs_bumplink(sc->tp, sc->ip);
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ }
+
+ /* Replace the dotdot entry if the child is a subdirectory. */
+ if (isdir) {
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ sc->orphanage->i_ino, adopt->child_blkres);
+ if (error)
+ return error;
+ }
+
+ /* Add a parent pointer from the file back to the lost+found. */
+ if (xfs_has_parent(sc->mp)) {
+ error = xfs_parent_addname(sc->tp, &adopt->ppargs,
+ sc->orphanage, adopt->xname, sc->ip);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Notify dirent hooks that we moved the file to /lost+found, and
+ * finish all the deferred work so that we know the adoption is fully
+ * recorded in the log.
+ */
+ xfs_dir_update_hook(sc->orphanage, sc->ip, 1, adopt->xname);
+
+ /* Remove negative dentries from the lost+found's dcache */
+ xrep_adoption_zap_dcache(adopt);
+ return 0;
+}
+
+/*
+ * Roll to a clean scrub transaction so that we can release the orphanage,
+ * even if xrep_adoption_move was not called.
+ *
+ * Commits all the work and deferred ops attached to an adoption request and
+ * rolls to a clean scrub transaction. On success, returns 0 with the scrub
+ * context holding a clean transaction with no inodes joined. On failure,
+ * returns negative errno with no scrub transaction. All inode locks are
+ * still held after this function returns.
+ */
+int
+xrep_adoption_trans_roll(
+ struct xrep_adoption *adopt)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ int error;
+
+ trace_xrep_adoption_trans_roll(sc->orphanage, sc->ip,
+ !!(sc->tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Finish all the deferred ops to commit all repairs. */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+
+ /* Roll the transaction once more to detach the inodes. */
+ return xfs_trans_roll(&sc->tp);
+}
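
Taken together, a hedged sketch of a complete adoption from a repair function
(locals, cleanup labels, and error handling abbreviated; the caller is assumed
to hold both IOLOCKs as described above):

	error = xrep_adoption_trans_alloc(sc, &adopt);
	if (error)
		return error;

	error = xrep_adoption_compute_name(&adopt, &xname);
	if (error)
		goto out_cancel;

	/* Creates the dirent, fixes dotdot for subdirs, dirties sc->tp. */
	error = xrep_adoption_move(&adopt);
	if (error)
		goto out_cancel;

	/* Commit the adoption; the inodes stay locked but detached. */
	error = xrep_adoption_trans_roll(&adopt);
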
diff --git a/fs/xfs/scrub/orphanage.h b/fs/xfs/scrub/orphanage.h
new file mode 100644
index 000000000000..7c7a2e7d81db
--- /dev/null
+++ b/fs/xfs/scrub/orphanage.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ORPHANAGE_H__
+#define __XFS_SCRUB_ORPHANAGE_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_orphanage_create(struct xfs_scrub *sc);
+
+/*
+ * If we're doing a repair, ensure that the orphanage exists and attach it to
+ * the scrub context.
+ */
+static inline int
+xrep_orphanage_try_create(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ ASSERT(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR);
+
+ error = xrep_orphanage_create(sc);
+ switch (error) {
+ case 0:
+ case -ENOENT:
+ case -ENOTDIR:
+ case -ENOSPC:
+ /*
+ * If the orphanage can't be found or isn't a directory, we'll
+ * keep going, but we won't be able to attach the file to the
+ * orphanage if we can't find the parent.
+ */
+ return 0;
+ }
+
+ return error;
+}
+
+int xrep_orphanage_iolock_two(struct xfs_scrub *sc);
+
+void xrep_orphanage_ilock(struct xfs_scrub *sc, unsigned int ilock_flags);
+bool xrep_orphanage_ilock_nowait(struct xfs_scrub *sc,
+ unsigned int ilock_flags);
+void xrep_orphanage_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
+
+void xrep_orphanage_rele(struct xfs_scrub *sc);
+
+/* Information about a request to add a file to the orphanage. */
+struct xrep_adoption {
+ struct xfs_scrub *sc;
+
+ /* Name used for the adoption. */
+ struct xfs_name *xname;
+
+ /* Parent pointer context tracking */
+ struct xfs_parent_args ppargs;
+
+ /* Block reservations for orphanage and child (if directory). */
+ unsigned int orphanage_blkres;
+ unsigned int child_blkres;
+
+ /*
+ * Does the caller want us to bump the child link count? This is not
+ * needed when reattaching files that have become disconnected but have
+ * nlink > 1. It is necessary when changing the directory tree
+ * structure.
+ */
+ bool bump_child_nlink:1;
+};
+
+bool xrep_orphanage_can_adopt(struct xfs_scrub *sc);
+
+int xrep_adoption_trans_alloc(struct xfs_scrub *sc,
+ struct xrep_adoption *adopt);
+int xrep_adoption_compute_name(struct xrep_adoption *adopt,
+ struct xfs_name *xname);
+int xrep_adoption_move(struct xrep_adoption *adopt);
+int xrep_adoption_trans_roll(struct xrep_adoption *adopt);
+#else
+struct xrep_adoption { /* empty */ };
+# define xrep_orphanage_rele(sc) ((void)0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_ORPHANAGE_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 7db873672146..733c410a2279 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -10,19 +10,37 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/repair.h"
+#include "scrub/listxattr.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/trace.h"
/* Set us up to scrub parents. */
int
xchk_setup_parent(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_parent(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
@@ -143,7 +161,8 @@ xchk_parent_validate(
}
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
- if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
+ if (dp == sc->ip || xrep_is_tempfile(dp) ||
+ !S_ISDIR(VFS_I(dp)->i_mode)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out_rele;
}
@@ -185,6 +204,621 @@ out_rele:
return error;
}
+/*
+ * Checking of Parent Pointers
+ * ===========================
+ *
+ * On filesystems with directory parent pointers, we check the referential
+ * integrity by visiting each parent pointer of a child file and checking that
+ * the directory referenced by the pointer actually has a dirent pointing
+ * forward to the child file.
+ */
+
+/* Deferred parent pointer entry that we saved for later. */
+struct xchk_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+};
+
+struct xchk_pptrs {
+ struct xfs_scrub *sc;
+
+ /* How many parent pointers did we find at the end? */
+ unsigned long long pptrs_found;
+
+ /* Parent of this directory. */
+ xfs_ino_t parent_ino;
+
+ /* Fixed-size array of xchk_pptr structures. */
+ struct xfarray *pptr_entries;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_da_args pptr_args;
+
+ /* If we've cycled the ILOCK, we must revalidate all deferred pptrs. */
+ bool need_revalidate;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+};
+
+/* Does this parent pointer match the dotdot entry? */
+STATIC int
+xchk_parent_scan_dotdot(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_pptrs *pp = priv;
+ xfs_ino_t parent_ino;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ if (pp->parent_ino == parent_ino)
+ return -ECANCELED;
+
+ return 0;
+}
+
+/* Look up the dotdot entry so that we can check it as we walk the pptrs. */
+STATIC int
+xchk_parent_pptr_and_dotdot(
+ struct xchk_pptrs *pp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ /* Look up '..' */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &pp->parent_ino);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+ if (!xfs_verify_dir_ino(sc->mp, pp->parent_ino)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
+ /* Is this the root dir? Then '..' must point to itself. */
+ if (sc->ip == sc->mp->m_rootip) {
+ if (sc->ip->i_ino != pp->parent_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
+ /*
+ * If this is now an unlinked directory, the dotdot value is
+ * meaningless as long as it points to a valid inode.
+ */
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return 0;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Otherwise, walk the pptrs again, and check. */
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_dotdot, NULL, pp);
+ if (error == -ECANCELED) {
+ /* Found a parent pointer that matches dotdot. */
+ return 0;
+ }
+ if (!error || error == -EFSCORRUPTED) {
+ /* Found a broken parent pointer or no match. */
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ return error;
+}
+
+/*
+ * Try to lock a parent directory for checking dirents. Returns the inode
+ * flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_parent_lock_dir(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ if (!xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED))
+ return 0;
+
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) {
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ if (!xfs_need_iread_extents(&dp->i_df))
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
+
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) {
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+}
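
The shared-to-exclusive fallback above exists because loading an unread extent
map requires ILOCK_EXCL; if the parent's data fork extents are already in
memory, ILOCK_SHARED suffices for the dirent lookup.
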
+
+/* Check the forward link (dirent) associated with this parent pointer. */
+STATIC int
+xchk_parent_dirent(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ xfs_ino_t child_ino;
+ int error;
+
+ /*
+ * Use the name attached to this parent pointer to look up the
+ * directory entry in the alleged parent.
+ */
+ error = xchk_dir_lookup(sc, dp, xname, &child_ino);
+ if (error == -ENOENT) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ return error;
+
+ /* Does the inode number match? */
+ if (child_ino != sc->ip->i_ino) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Try to grab a parent directory. */
+STATIC int
+xchk_parent_iget(
+ struct xchk_pptrs *pp,
+ const struct xfs_parent_rec *pptr,
+ struct xfs_inode **dpp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ struct xfs_inode *ip;
+ xfs_ino_t parent_ino = be64_to_cpu(pptr->p_ino);
+ int error;
+
+ /* Validate inode number. */
+ error = xfs_dir_ino_validate(sc->mp, parent_ino);
+ if (error) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+
+ error = xchk_iget(sc, parent_ino, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ return error;
+
+ /* The parent must be a directory. */
+ if (!S_ISDIR(VFS_I(ip)->i_mode)) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ goto out_rele;
+ }
+
+ /* Validate generation number. */
+ if (VFS_I(ip)->i_generation != be32_to_cpu(pptr->p_gen)) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ goto out_rele;
+ }
+
+ *dpp = ip;
+ return 0;
+out_rele:
+ xchk_irele(sc, ip);
+ return 0;
+}
+
+/*
+ * Walk an xattr of a file. If this xattr is a parent pointer, follow it up
+ * to a parent directory and check that the parent has a dirent pointing back
+ * to us.
+ */
+STATIC int
+xchk_parent_scan_attr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_pptrs *pp = priv;
+ struct xfs_inode *dp = NULL;
+ const struct xfs_parent_rec *pptr_rec = value;
+ xfs_ino_t parent_ino;
+ unsigned int lockmode;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return error;
+ }
+
+ /* No self-referential parent pointers. */
+ if (parent_ino == sc->ip->i_ino) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+
+ pp->pptrs_found++;
+
+ error = xchk_parent_iget(pp, pptr_rec, &dp);
+ if (error)
+ return error;
+ if (!dp)
+ return 0;
+
+ /* Try to lock the inode. */
+ lockmode = xchk_parent_lock_dir(sc, dp);
+ if (!lockmode) {
+ struct xchk_pptr save_pp = {
+ .pptr_rec = *pptr_rec, /* struct copy */
+ .namelen = namelen,
+ };
+
+ /* Couldn't lock the inode, so save the pptr for later. */
+ trace_xchk_parent_defer(sc->ip, &xname, dp->i_ino);
+
+ error = xfblob_storename(pp->pptr_names, &save_pp.name_cookie,
+ &xname);
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ goto out_rele;
+
+ error = xfarray_append(pp->pptr_entries, &save_pp);
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ goto out_rele;
+
+ goto out_rele;
+ }
+
+ error = xchk_parent_dirent(pp, &xname, dp);
+
+out_unlock:
+ xfs_iunlock(dp, lockmode);
+out_rele:
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/*
+ * Revalidate a parent pointer that we collected in the past but couldn't check
+ * because of lock contention. Returns 0 if the parent pointer is still valid,
+ * -ENOENT if it has gone away on us, or any other negative errno on failure.
+ */
+STATIC int
+xchk_parent_revalidate_pptr(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_parent_rec *pptr)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ error = xfs_parent_lookup(sc->tp, sc->ip, xname, pptr, &pp->pptr_args);
+ if (error == -ENOATTR) {
+ /* Parent pointer went away, nothing to revalidate. */
+ return -ENOENT;
+ }
+
+ return error;
+}
+
+/*
+ * Check a parent pointer the slow way, which means we cycle locks a bunch
+ * and put up with revalidation until we get it done.
+ */
+STATIC int
+xchk_parent_slow_pptr(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_parent_rec *pptr)
+{
+ struct xfs_scrub *sc = pp->sc;
+ struct xfs_inode *dp = NULL;
+ unsigned int lockmode;
+ int error;
+
+ /* Check that the deferred parent pointer still exists. */
+ if (pp->need_revalidate) {
+ error = xchk_parent_revalidate_pptr(pp, xname, pptr);
+ if (error == -ENOENT)
+ return 0;
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ return error;
+ }
+
+ error = xchk_parent_iget(pp, pptr, &dp);
+ if (error)
+ return error;
+ if (!dp)
+ return 0;
+
+ /*
+ * If we can grab both IOLOCK and ILOCK of the alleged parent, we
+ * can proceed with the validation.
+ */
+ lockmode = xchk_parent_lock_dir(sc, dp);
+ if (lockmode) {
+ trace_xchk_parent_slowpath(sc->ip, xname, dp->i_ino);
+ goto check_dirent;
+ }
+
+ /*
+ * We couldn't lock the parent dir. Drop all the locks and try to
+ * get them again, one at a time.
+ */
+ xchk_iunlock(sc, sc->ilock_flags);
+ pp->need_revalidate = true;
+
+ trace_xchk_parent_ultraslowpath(sc->ip, xname, dp->i_ino);
+
+ error = xchk_dir_trylock_for_pptrs(sc, dp, &lockmode);
+ if (error)
+ goto out_rele;
+
+ /* Revalidate the parent pointer now that we cycled locks. */
+ error = xchk_parent_revalidate_pptr(pp, xname, pptr);
+ if (error == -ENOENT) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ goto out_unlock;
+
+check_dirent:
+ error = xchk_parent_dirent(pp, xname, dp);
+out_unlock:
+ xfs_iunlock(dp, lockmode);
+out_rele:
+ xchk_irele(sc, dp);
+ return error;
+}
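
In short, each deferred pointer gets up to three chances: the trylock fast
path with scrub's ILOCK still held; failing that, a full release of scrub's
locks followed by taking the parent's locks one at a time; and, once any lock
has been cycled, a mandatory re-lookup of the stashed pointer, since the
child's attr fork may have changed while it was unlocked.
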
+
+/* Check all the parent pointers that we deferred the first time around. */
+STATIC int
+xchk_parent_finish_slow_pptrs(
+ struct xchk_pptrs *pp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ foreach_xfarray_idx(pp->pptr_entries, array_cur) {
+ struct xchk_pptr pptr;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ error = xfarray_load(pp->pptr_entries, array_cur, &pptr);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(pp->pptr_names, pptr.name_cookie,
+ &pp->xname, pptr.namelen);
+ if (error)
+ return error;
+
+ error = xchk_parent_slow_pptr(pp, &pp->xname, &pptr.pptr_rec);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both xfiles now that we've checked everything. */
+ xfarray_truncate(pp->pptr_entries);
+ xfblob_truncate(pp->pptr_names);
+ return 0;
+}
+
+/* Count the number of parent pointers. */
+STATIC int
+xchk_parent_count_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_pptrs *pp = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ pp->pptrs_found++;
+ return 0;
+}
+
+/*
+ * Compare the number of parent pointers to the link count. For
+ * non-directories these should be the same. For unlinked directories the
+ * count should be zero; for linked directories, it should be nonzero.
+ */
+STATIC int
+xchk_parent_count_pptrs(
+ struct xchk_pptrs *pp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ /*
+ * If we cycled the ILOCK while cross-checking parent pointers with
+ * dirents, then we need to recalculate the number of parent pointers.
+ */
+ if (pp->need_revalidate) {
+ pp->pptrs_found = 0;
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_count_pptr,
+ NULL, pp);
+ if (error == -EFSCORRUPTED) {
+ /* Found a bad parent pointer */
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ if (error)
+ return error;
+ }
+
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ if (sc->ip == sc->mp->m_rootip)
+ pp->pptrs_found++;
+
+ if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ else if (VFS_I(sc->ip)->i_nlink > 0 &&
+ pp->pptrs_found == 0)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+ return 0;
+}
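
For example, a regular file with three hardlinks must carry exactly three
parent pointers, one per directory entry. A linked subdirectory may report
any nonzero count here, because a corrupt tree can leave a subdirectory with
several parents and the dotdot cross-check only requires that one of them
match; the root directory is special-cased by adding one synthetic parent
(itself) to the tally.
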
+
+/* Check parent pointers of a file. */
+STATIC int
+xchk_parent_pptr(
+ struct xfs_scrub *sc)
+{
+ struct xchk_pptrs *pp;
+ char *descr;
+ int error;
+
+ pp = kvzalloc(sizeof(struct xchk_pptrs), XCHK_GFP_FLAGS);
+ if (!pp)
+ return -ENOMEM;
+ pp->sc = sc;
+ pp->xname.name = pp->namebuf;
+
+ /*
+ * Set up some staging memory for parent pointers that we can't check
+ * due to locking contention.
+ */
+ descr = xchk_xfile_ino_descr(sc, "slow parent pointer entries");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_pptr),
+ &pp->pptr_entries);
+ kfree(descr);
+ if (error)
+ goto out_pp;
+
+ descr = xchk_xfile_ino_descr(sc, "slow parent pointer names");
+ error = xfblob_create(descr, &pp->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_entries;
+
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_attr, NULL, pp);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+
+ error = xchk_parent_finish_slow_pptrs(pp);
+ if (error == -ETIMEDOUT) {
+ /* Couldn't grab a lock, scrub was marked incomplete */
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_names;
+
+ /*
+ * For subdirectories, make sure the dotdot entry references the same
+ * inode as the parent pointers.
+ *
+ * If we're scanning a /consistent/ directory, there should only be
+ * one parent pointer, and it should point to the same directory as
+ * the dotdot entry.
+ *
+ * However, a corrupt directory tree might feature a subdirectory with
+ * multiple parents. The directory loop scanner is responsible for
+ * correcting that kind of problem, so for now we only validate that
+ * the dotdot entry matches /one/ of the parents.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ error = xchk_parent_pptr_and_dotdot(pp);
+ if (error)
+ goto out_names;
+ }
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_names;
+
+ /*
+ * Complain if the number of parent pointers doesn't match the link
+ * count. This could be a sign of missing parent pointers (or an
+ * incorrect link count).
+ */
+ error = xchk_parent_count_pptrs(pp);
+
+out_names:
+ xfblob_destroy(pp->pptr_names);
+out_entries:
+ xfarray_destroy(pp->pptr_entries);
+out_pp:
+ kvfree(pp);
+ return error;
+}
+
/* Scrub a parent pointer. */
int
xchk_parent(
@@ -194,6 +828,9 @@ xchk_parent(
xfs_ino_t parent_ino;
int error = 0;
+ if (xfs_has_parent(mp))
+ return xchk_parent_pptr(sc);
+
/*
* If we're a directory, check that the '..' link points up to
* a directory that has one entry pointing to us.
@@ -237,3 +874,64 @@ xchk_parent(
return error;
}
+
+/*
+ * Decide if this file's extended attributes (and therefore its parent
+ * pointers) have been zapped to satisfy the inode and ifork verifiers.
+ * Checking and repairing should be postponed until the extended attribute
+ * structure is fixed.
+ */
+bool
+xchk_pptr_looks_zapped(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(xfs_has_parent(mp));
+
+ /*
+ * Temporary files that cannot be linked into the directory tree do not
+ * have attr forks because they cannot ever have parents.
+ */
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ return false;
+
+ /*
+ * Directory tree roots do not have parents, so the expected outcome
+ * of a parent pointer scan is always the empty set. It's safe to scan
+ * them even if the attr fork was zapped.
+ */
+ if (ip == mp->m_rootip)
+ return false;
+
+ /*
+ * Metadata inodes are all rooted in the superblock and do not have
+ * any parents. Hence the attr fork will not be initialized, but
+ * there are no parent pointers that might have been zapped.
+ */
+ if (xfs_is_metadata_inode(ip))
+ return false;
+
+ /*
+ * Linked and linkable non-rootdir files should always have an
+ * attribute fork because that is where parent pointers are
+ * stored. If the fork is absent, something is amiss.
+ */
+ if (!xfs_inode_has_attr_fork(ip))
+ return true;
+
+ /* Repair zapped this file's attr fork a short time ago */
+ if (xfs_ifork_zapped(ip, XFS_ATTR_FORK))
+ return true;
+
+ /*
+ * If the dinode repair found a bad attr fork, it will reset the fork
+ * to extents format with zero records and wait for the bmapbta
+ * scrubber to reconstruct the block mappings. The extended attribute
+ * structure always contains some content when parent pointers are
+ * enabled, so this is a clear sign of a zapped attr fork.
+ */
+ return ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0;
+}
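
Under these rules, for instance, a linked regular file on a parent-pointer
filesystem that has no attr fork at all is reported as zapped, as is one whose
attr fork is extents format with zero records: even a single link implies at
least one parent pointer record, so an empty attribute structure cannot be
legitimate.
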
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
new file mode 100644
index 000000000000..7b42b7f65a0b
--- /dev/null
+++ b/fs/xfs/scrub/parent_repair.c
@@ -0,0 +1,1612 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/orphanage.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr_repair.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Repairing The Directory Parent Pointer
+ * ======================================
+ *
+ * Currently, only directories support parent pointers (in the form of '..'
+ * entries), so we simply scan the filesystem and update the '..' entry.
+ *
+ * Note that because the only parent pointer is the dotdot entry, we won't
+ * touch an unhealthy directory, since the directory repair code is perfectly
+ * capable of rebuilding a directory with the proper parent inode.
+ *
+ * See the section on locking issues in dir_repair.c for more information about
+ * conflicts with the VFS. The findparent code will keep our incore parent
+ * inode up to date.
+ *
+ * If parent pointers are enabled, we instead reconstruct the parent pointer
+ * information by visiting every directory entry of every directory in the
+ * system and translating the relevant dirents into parent pointers. In this
+ * case, it is advantageous to stash all parent pointers created from dirents
+ * from a single parent file before replaying them into the temporary file. To
+ * save memory, the live filesystem scan reuses the findparent object. Parent
+ * pointer repair chooses either directory scanning or findparent, but not
+ * both.
+ *
+ * When salvaging completes, the remaining stashed entries are replayed to the
+ * temporary file. All non-parent pointer extended attributes are copied to
+ * the temporary file's extended attributes. An atomic file mapping exchange
+ * is used to commit the new xattr blocks to the file being repaired. This
+ * will disrupt attrmulti cursors.
+ */
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_PPTR_ADD (1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_PPTR_REMOVE (2)
+
+/* A stashed parent pointer update. */
+struct xrep_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+
+ /* XREP_PPTR_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/*
+ * Stash up to 8 pages of recovered parent pointers in pptr_recs and
+ * pptr_names before we write them to the temp file.
+ */
+#define XREP_PARENT_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_parent {
+ struct xfs_scrub *sc;
+
+ /* Fixed-size array of xrep_pptr structures. */
+ struct xfarray *pptr_recs;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* xattr keys */
+ struct xfarray *xattr_records;
+
+ /* xattr values */
+ struct xfblob *xattr_blobs;
+
+ /* Scratch buffers for saving extended attributes */
+ unsigned char *xattr_name;
+ void *xattr_value;
+ unsigned int xattr_value_sz;
+
+ /*
+ * Information used to exchange the attr fork mappings, if the fs
+ * supports parent pointers.
+ */
+ struct xrep_tempexch tx;
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory. On filesystems without parent
+ * pointers, we use the findparent_* functions on this object and
+ * access only the parent_ino field directly.
+ *
+ * When parent pointers are enabled, the directory entry scanner uses
+ * the iscan, hooks, and lock fields of this object directly.
+ * @pscan.lock coordinates access to pptr_recs and pptr_names. This
+ * reduces the memory requirements of this structure.
+ *
+ * The lock also controls access to xattr_records and xattr_blobs.
+ */
+ struct xrep_parent_scan_info pscan;
+
+ /* Orphanage reparenting request. */
+ struct xrep_adoption adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ unsigned char namebuf[MAXNAMELEN];
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_da_args pptr_args;
+
+ /* Have we seen any live updates of parent pointers recently? */
+ bool saw_pptr_updates;
+
+ /* Number of parents we found after all other repairs */
+ unsigned long long parents;
+};
+
+struct xrep_parent_xattr {
+ /* Cookie for retrieval of the xattr name. */
+ xfblob_cookie name_cookie;
+
+ /* Cookie for retrieval of the xattr value. */
+ xfblob_cookie value_cookie;
+
+ /* XFS_ATTR_* flags */
+ int flags;
+
+ /* Length of the value and name. */
+ uint32_t valuelen;
+ uint16_t namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
+ */
+#define XREP_PARENT_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+/* Tear down all the incore stuff we created. */
+static void
+xrep_parent_teardown(
+ struct xrep_parent *rp)
+{
+ xrep_findparent_scan_teardown(&rp->pscan);
+ kvfree(rp->xattr_name);
+ rp->xattr_name = NULL;
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+ if (rp->xattr_blobs)
+ xfblob_destroy(rp->xattr_blobs);
+ rp->xattr_blobs = NULL;
+ if (rp->xattr_records)
+ xfarray_destroy(rp->xattr_records);
+ rp->xattr_records = NULL;
+ if (rp->pptr_names)
+ xfblob_destroy(rp->pptr_names);
+ rp->pptr_names = NULL;
+ if (rp->pptr_recs)
+ xfarray_destroy(rp->pptr_recs);
+ rp->pptr_recs = NULL;
+}
+
+/* Set up for a parent repair. */
+int
+xrep_setup_parent(
+ struct xfs_scrub *sc)
+{
+ struct xrep_parent *rp;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ rp = kvzalloc(sizeof(struct xrep_parent), XCHK_GFP_FLAGS);
+ if (!rp)
+ return -ENOMEM;
+ rp->sc = sc;
+ rp->xname.name = rp->namebuf;
+ sc->buf = rp;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ return xrep_orphanage_try_create(sc);
+}
+
+/*
+ * Scan all files in the filesystem for a child dirent that we can turn into
+ * the dotdot entry for this directory.
+ */
+STATIC int
+xrep_parent_find_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int sick, checked;
+ int error;
+
+ /*
+ * Avoid sick directories. There shouldn't be anyone else clearing the
+ * directory's sick status.
+ */
+ xfs_inode_measure_sickness(sc->ip, &sick, &checked);
+ if (sick & XFS_SICK_INO_DIR)
+ return -EFSCORRUPTED;
+
+ ino = xrep_findparent_self_reference(sc);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ return 0;
+ }
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for the dotdot
+ * entry. Figure out who is going to be the parent of this directory,
+ * then retake the ILOCK so that we can salvage directory entries.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Does the VFS dcache have an answer for us? */
+ ino = xrep_findparent_from_dcache(sc);
+ if (ino != NULLFSINO) {
+ error = xrep_findparent_confirm(sc, &ino);
+ if (!error && ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ goto out_relock;
+ }
+ }
+
+ /* Scan the entire filesystem for a parent. */
+ error = xrep_findparent_scan(&rp->pscan);
+out_relock:
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file.
+ * The caller must hold the tempfile's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ */
+STATIC int
+xrep_parent_replay_update(
+ struct xrep_parent *rp,
+ const struct xfs_name *xname,
+ struct xrep_pptr *pptr)
+{
+ struct xfs_scrub *sc = rp->sc;
+
+ switch (pptr->action) {
+ case XREP_PPTR_ADD:
+ /* Create parent pointer. */
+ trace_xrep_parent_replay_parentadd(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ return xfs_parent_set(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rp->pptr_args);
+ case XREP_PPTR_REMOVE:
+ /* Remove parent pointer. */
+ trace_xrep_parent_replay_parentremove(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ return xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rp->pptr_args);
+ }
+
+ ASSERT(0);
+ return -EIO;
+}
+
+/*
+ * Flush stashed parent pointer updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the parent pointer
+ * rebuild, since files can have a lot of hardlinks and the fs can be busy.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile
+ * IOLOCK.
+ */
+STATIC int
+xrep_parent_replay_updates(
+ struct xrep_parent *rp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ mutex_lock(&rp->pscan.lock);
+ foreach_xfarray_idx(rp->pptr_recs, array_cur) {
+ struct xrep_pptr pptr;
+
+ error = xfarray_load(rp->pptr_recs, array_cur, &pptr);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rp->pptr_names, pptr.name_cookie,
+ &rp->xname, pptr.namelen);
+ if (error)
+ goto out_unlock;
+ rp->xname.len = pptr.namelen;
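+
+		/*
+		 * Drop the scan lock for the update itself: replaying into
+		 * the tempfile allocates its own transaction, and the live
+		 * update hooks need the lock to stash more entries while we
+		 * work.
+		 */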
+ mutex_unlock(&rp->pscan.lock);
+
+ error = xrep_parent_replay_update(rp, &rp->xname, &pptr);
+ if (error)
+ return error;
+
+ mutex_lock(&rp->pscan.lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rp->pptr_recs);
+ xfblob_truncate(rp->pptr_names);
+ mutex_unlock(&rp->pscan.lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/*
+ * Remember that we want to create a parent pointer in the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_parent_stash_parentadd(
+ struct xrep_parent *rp,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_pptr pptr = {
+ .action = XREP_PPTR_ADD,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_parent_stash_parentadd(rp->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->pptr_recs, &pptr);
+}
+
+/*
+ * Remember that we want to remove a parent pointer from the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_parent_stash_parentremove(
+ struct xrep_parent *rp,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_pptr pptr = {
+ .action = XREP_PPTR_REMOVE,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_parent_stash_parentremove(rp->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->pptr_recs, &pptr);
+}
+
+/*
+ * Examine an entry of a directory. If this dirent leads us back to the file
+ * whose parent pointers we're rebuilding, stash a pptr update for the
+ * temporary file.
+ */
+STATIC int
+xrep_parent_scan_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ /* Dirent doesn't point to this directory. */
+ if (ino != rp->sc->ip->i_ino)
+ return 0;
+
+ /* No weird looking names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* No mismatching ftypes. */
+ if (name->type != xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Transform this dirent into a parent pointer and queue it for later
+ * addition to the temporary file.
+ */
+ mutex_lock(&rp->pscan.lock);
+ error = xrep_parent_stash_parentadd(rp, name, dp);
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/*
+ * Decide if we want to look for dirents in this directory. Skip the file
+ * being repaired and any files being used to stage repairs.
+ */
+static inline bool
+xrep_parent_want_scan(
+ struct xrep_parent *rp,
+ const struct xfs_inode *ip)
+{
+ return ip != rp->sc->ip && !xrep_is_tempfile(ip);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt.
+ * Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_parent_scan_ilock(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /* Still need to take the shared ILOCK to advance the iscan cursor. */
+ if (!xrep_parent_want_scan(rp, ip))
+ goto lock;
+
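+	/*
+	 * Loading the data fork extent list requires ILOCK_EXCL, so take
+	 * the exclusive lock if the directory's mappings haven't been read
+	 * in yet.
+	 */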
+ if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/*
+ * Scan this file for relevant child dirents that point to the file whose
+ * parent pointers we're rebuilding.
+ */
+STATIC int
+xrep_parent_scan_file(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ lock_mode = xrep_parent_scan_ilock(rp, ip);
+
+ if (!xrep_parent_want_scan(rp, ip))
+ goto scan_done;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ /*
+ * If the directory looks as though it has been zapped by the
+ * inode record repair code, we cannot scan for child dirents.
+ */
+ if (xchk_dir_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_dir_walk(rp->sc, ip, xrep_parent_scan_dirent, rp);
+ if (error)
+ goto scan_done;
+ }
+
+scan_done:
+ xchk_iscan_mark_visited(&rp->pscan.iscan, ip);
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Decide if we've stashed too much pptr data in memory. */
+static inline bool
+xrep_parent_want_flush_stashed(
+ struct xrep_parent *rp)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rp->pptr_recs) + xfblob_bytes(rp->pptr_names);
+ return bytes > XREP_PARENT_MAX_STASH_BYTES;
+}
+
+/*
+ * Scan all directories in the filesystem to look for dirents that we can turn
+ * into parent pointers.
+ */
+STATIC int
+xrep_parent_scan_dirtree(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * Filesystem scans are time consuming. Drop the file ILOCK and all
+ * other resources for the duration of the scan and hope for the best.
+ * The live update hooks will keep our scan information up to date.
+ */
+ xchk_trans_cancel(sc);
+ if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
+ xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
+ XFS_ILOCK_EXCL));
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) {
+ bool flush;
+
+ error = xrep_parent_scan_file(rp, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ /* Flush stashed pptr updates to constrain memory usage. */
+ mutex_lock(&rp->pscan.lock);
+ flush = xrep_parent_want_flush_stashed(rp);
+ mutex_unlock(&rp->pscan.lock);
+ if (flush) {
+ xchk_trans_cancel(sc);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ break;
+
+ error = xrep_parent_replay_updates(rp);
+ xrep_tempfile_iounlock(sc);
+ if (error)
+ break;
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rp->pscan.iscan);
+ if (error) {
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Retake sc->ip's ILOCK now that we're done flushing stashed parent
+ * pointers. We end this function with an empty transaction and the
+ * ILOCK.
+ */
+ xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Capture dirent updates being made by other threads that are relevant to the
+ * file being repaired.
+ */
+STATIC int
+xrep_parent_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_parent *rp;
+ struct xfs_scrub *sc;
+ int error;
+
+ rp = container_of(nb, struct xrep_parent, pscan.dhook.dirent_hook.nb);
+ sc = rp->sc;
+
+ /*
+	 * This thread updated a dirent that points to the file that we're
+	 * repairing, so stash the update for replay against the temporary
+	 * file. Only updates from directories that the scan has already
+	 * visited matter; unvisited directories will be picked up by the
+	 * scan itself.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rp->pscan.iscan, p->dp->i_ino)) {
+ mutex_lock(&rp->pscan.lock);
+ if (p->delta > 0)
+ error = xrep_parent_stash_parentadd(rp, p->name, p->dp);
+ else
+ error = xrep_parent_stash_parentremove(rp, p->name,
+ p->dp);
+ if (!error)
+ rp->saw_pptr_updates = true;
+ mutex_unlock(&rp->pscan.lock);
+ if (error)
+ goto out_abort;
+ }
+
+ return NOTIFY_DONE;
+out_abort:
+ xchk_iscan_abort(&rp->pscan.iscan);
+ return NOTIFY_DONE;
+}
+
+/* Reset a directory's dotdot entry, if needed. */
+STATIC int
+xrep_parent_reset_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int spaceres;
+ int error = 0;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino);
+ if (error || ino == rp->pscan.parent_ino)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino);
+
+ /*
+ * Reserve more space just in case we have to expand the dir. We're
+ * allowed to exceed quota to repair inconsistent metadata.
+ */
+ spaceres = xfs_rename_space_res(sc->mp, 0, false, xfs_name_dotdot.len,
+ false);
+ error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0,
+ true);
+ if (error)
+ return error;
+
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ rp->pscan.parent_ino, spaceres);
+ if (error)
+ return error;
+
+ /*
+ * Roll transaction to detach the inode from the transaction but retain
+ * ILOCK_EXCL.
+ */
+ return xfs_trans_roll(&sc->tp);
+}
+
+/* Pass back the parent inumber if this is a parent pointer. */
+STATIC int
+xrep_parent_lookup_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ xfs_ino_t *inop = priv;
+ xfs_ino_t parent_ino;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ *inop = parent_ino;
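+	/* One parent is enough; -ECANCELED stops the attr walk early. */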
+ return -ECANCELED;
+}
+
+/*
+ * Find the first parent of the scrub target by walking parent pointers for
+ * the purpose of deciding if we're going to move it to the orphanage.
+ * We don't care if the attr fork is zapped.
+ */
+STATIC int
+xrep_parent_lookup_pptrs(
+ struct xfs_scrub *sc,
+ xfs_ino_t *inop)
+{
+ int error;
+
+ *inop = NULLFSINO;
+
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_lookup_pptr, NULL,
+ inop);
+ if (error && error != -ECANCELED)
+ return error;
+ return 0;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
+ * orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_parent_move_to_orphanage(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t orig_parent, new_parent;
+ int error;
+
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ /*
+ * We are about to drop the ILOCK on sc->ip to lock the
+ * orphanage and prepare for the adoption. Therefore, look up
+ * the old dotdot entry for sc->ip so that we can compare it
+ * after we re-lock sc->ip.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+ &orig_parent);
+ if (error)
+ return error;
+ } else {
+ /*
+ * We haven't dropped the ILOCK since we committed the new
+ * xattr structure (and hence the new parent pointer records),
+ * which means that the file cannot have been moved in the
+ * directory tree, and there are no parents.
+ */
+ orig_parent = NULLFSINO;
+ }
+
+ /*
+ * Drop the ILOCK on the scrub target and commit the transaction.
+ * Adoption computes its own resource requirements and gathers the
+ * necessary components.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* If we can take the orphanage's iolock then we're ready to move. */
+ if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ xchk_iunlock(sc, sc->ilock_flags);
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
+ }
+
+ /* Grab transaction and ILOCK the two files. */
+ error = xrep_adoption_trans_alloc(sc, &rp->adoption);
+ if (error)
+ return error;
+
+ error = xrep_adoption_compute_name(&rp->adoption, &rp->xname);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+ * entry again. If the parent changed or the child was unlinked while
+ * the child directory was unlocked, we don't need to move the child to
+ * the orphanage after all. For a non-directory, we have to scan for
+ * the first parent pointer to see if one has been added.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+ &new_parent);
+ else
+ error = xrep_parent_lookup_pptrs(sc, &new_parent);
+ if (error)
+ return error;
+
+ /*
+ * Attach to the orphanage if we still have a linked directory and it
+ * hasn't been moved.
+ */
+ if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
+ error = xrep_adoption_move(&rp->adoption);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Launder the scrub transaction so we can drop the orphanage ILOCK
+ * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
+ */
+ error = xrep_adoption_trans_roll(&rp->adoption);
+ if (error)
+ return error;
+
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/* Ensure that the xattr value buffer is large enough. */
+STATIC int
+xrep_parent_alloc_xattr_value(
+ struct xrep_parent *rp,
+ size_t bufsize)
+{
+ void *new_val;
+
+ if (rp->xattr_value_sz >= bufsize)
+ return 0;
+
+ if (rp->xattr_value) {
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+ rp->xattr_value_sz = 0;
+ }
+
+ new_val = kvmalloc(bufsize, XCHK_GFP_FLAGS);
+ if (!new_val)
+ return -ENOMEM;
+
+ rp->xattr_value = new_val;
+ rp->xattr_value_sz = bufsize;
+ return 0;
+}
+
+/* Retrieve the (remote) value of a non-pptr xattr. */
+STATIC int
+xrep_parent_fetch_xattr_remote(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ unsigned int valuelen)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_da_args args = {
+ .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .geo = sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .dp = ip,
+ .name = name,
+ .namelen = namelen,
+ .trans = sc->tp,
+ .valuelen = valuelen,
+ .owner = ip->i_ino,
+ };
+ int error;
+
+ /*
+ * If we need a larger value buffer, try to allocate one. If that
+ * fails, return with -EDEADLOCK to try harder.
+ */
+ error = xrep_parent_alloc_xattr_value(rp, valuelen);
+ if (error == -ENOMEM)
+ return -EDEADLOCK;
+ if (error)
+ return error;
+
+ args.value = rp->xattr_value;
+ xfs_attr_sethash(&args);
+ return xfs_attr_get_ilocked(&args);
+}
+
+/* Stash non-pptr attributes for later replay into the temporary file. */
+STATIC int
+xrep_parent_stash_xattr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xrep_parent_xattr key = {
+ .valuelen = valuelen,
+ .namelen = namelen,
+ .flags = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ };
+ struct xrep_parent *rp = priv;
+ int error;
+
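+	/*
+	 * Skip attrs that were only partially set (XFS_ATTR_INCOMPLETE) and
+	 * parent pointers, which are rebuilt separately from the dirent scan.
+	 */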
+ if (attr_flags & (XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT))
+ return 0;
+
+ if (!value) {
+ error = xrep_parent_fetch_xattr_remote(rp, ip, attr_flags,
+ name, namelen, valuelen);
+ if (error)
+ return error;
+
+ value = rp->xattr_value;
+ }
+
+ trace_xrep_parent_stash_xattr(rp->sc->tempip, key.flags, (void *)name,
+ key.namelen, key.valuelen);
+
+ error = xfblob_store(rp->xattr_blobs, &key.name_cookie, name,
+ key.namelen);
+ if (error)
+ return error;
+
+ error = xfblob_store(rp->xattr_blobs, &key.value_cookie, value,
+ key.valuelen);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->xattr_records, &key);
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_parent_insert_xattr(
+ struct xrep_parent *rp,
+ const struct xrep_parent_xattr *key)
+{
+ struct xfs_da_args args = {
+ .dp = rp->sc->tempip,
+ .attr_filter = key->flags,
+ .namelen = key->namelen,
+ .valuelen = key->valuelen,
+ .owner = rp->sc->ip->i_ino,
+ .geo = rp->sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ };
+ int error;
+
+ ASSERT(!(key->flags & XFS_ATTR_PARENT));
+
+ /*
+ * Grab pointers to the scrub buffer so that we can use them to insert
+ * attrs into the temp file.
+ */
+ args.name = rp->xattr_name;
+ args.value = rp->xattr_value;
+
+ /*
+	 * The name buffer was allocated one byte longer than XATTR_NAME_MAX,
+	 * so null the final byte to guarantee termination even for a
+	 * maximally long name.
+ */
+ rp->xattr_name[XATTR_NAME_MAX] = 0;
+
+ error = xfblob_load(rp->xattr_blobs, key->name_cookie, rp->xattr_name,
+ key->namelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rp->xattr_blobs, key->name_cookie);
+ if (error)
+ return error;
+
+ error = xfblob_load(rp->xattr_blobs, key->value_cookie, args.value,
+ key->valuelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rp->xattr_blobs, key->value_cookie);
+ if (error)
+ return error;
+
+ rp->xattr_name[key->namelen] = 0;
+
+ trace_xrep_parent_insert_xattr(rp->sc->tempip, key->flags,
+ rp->xattr_name, key->namelen, key->valuelen);
+
+ xfs_attr_sethash(&args);
+ return xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false);
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file. This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_parent_flush_xattrs(
+ struct xrep_parent *rp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and the empty scrub
+ * transaction that we created for the xattr scan. We hold ILOCK_EXCL
+ * on the inode being repaired.
+ *
+ * To constrain kernel memory use, we occasionally flush salvaged
+ * xattrs from the xfarray and xfblob structures into the temporary
+ * file in preparation for exchanging the xattr structures at the end.
+ * Updating the temporary file requires a transaction, so we commit the
+ * scrub transaction and drop the ILOCK so that xfs_attr_set can
+ * allocate whatever transaction it wants.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from adding xattrs (or parent pointers) while we're
+ * flushing.
+ */
+ xchk_trans_cancel(rp->sc);
+ xchk_iunlock(rp->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify xattrs. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rp->sc);
+ if (error)
+ return error;
+
+ /* Add all the salvaged attrs to the temporary file. */
+ foreach_xfarray_idx(rp->xattr_records, array_cur) {
+ struct xrep_parent_xattr key;
+
+ error = xfarray_load(rp->xattr_records, array_cur, &key);
+ if (error)
+ return error;
+
+ error = xrep_parent_insert_xattr(rp, &key);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rp->xattr_records);
+ xfblob_truncate(rp->xattr_blobs);
+
+ xrep_tempfile_iounlock(rp->sc);
+
+ /* Recreate the empty transaction and relock the inode. */
+ error = xchk_trans_alloc_empty(rp->sc);
+ if (error)
+ return error;
+ xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much xattr data in memory. */
+static inline bool
+xrep_parent_want_flush_xattrs(
+ struct xrep_parent *rp)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rp->xattr_records) +
+ xfblob_bytes(rp->xattr_blobs);
+ return bytes > XREP_PARENT_XATTR_MAX_STASH_BYTES;
+}
+
+/* Flush staged attributes to the temporary file if we're over the limit. */
+STATIC int
+xrep_parent_try_flush_xattrs(
+ struct xfs_scrub *sc,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (!xrep_parent_want_flush_xattrs(rp))
+ return 0;
+
+ error = xrep_parent_flush_xattrs(rp);
+ if (error)
+ return error;
+
+ /*
+ * If there were any parent pointer updates to the xattr structure
+ * while we dropped the ILOCK, the xattr structure is now stale.
+ * Signal to the attr copy process that we need to start over, but
+ * this time without opportunistic attr flushing.
+ *
+ * This is unlikely to happen, so we're ok with restarting the copy.
+ */
+ mutex_lock(&rp->pscan.lock);
+ if (rp->saw_pptr_updates)
+ error = -ESTALE;
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/* Copy all the non-pptr extended attributes into the temporary file. */
+STATIC int
+xrep_parent_copy_xattrs(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ /*
+ * Clear the pptr updates flag. We hold sc->ip ILOCKed, so there
+ * can't be any parent pointer updates in progress.
+ */
+ mutex_lock(&rp->pscan.lock);
+ rp->saw_pptr_updates = false;
+ mutex_unlock(&rp->pscan.lock);
+
+ /* Copy xattrs, stopping periodically to flush the incore buffers. */
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr,
+ xrep_parent_try_flush_xattrs, rp);
+ if (error && error != -ESTALE)
+ return error;
+
+ if (error == -ESTALE) {
+ /*
+ * The xattr copy collided with a parent pointer update.
+ * Restart the copy, but this time hold the ILOCK all the way
+ * to the end to lock out any directory parent pointer updates.
+ */
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr,
+ NULL, rp);
+ if (error)
+ return error;
+ }
+
+ /* Flush any remaining stashed xattrs to the temporary file. */
+ if (xfarray_bytes(rp->xattr_records) == 0)
+ return 0;
+
+ return xrep_parent_flush_xattrs(rp);
+}
+
+/*
+ * Ensure that @sc->ip and @sc->tempip both have attribute forks before we head
+ * into the attr fork exchange transaction. All files on a filesystem with
+ * parent pointers must have an attr fork because the parent pointer code does
+ * not itself add attribute forks.
+ *
+ * Note: Unlinked files with no parents don't strictly need an attr fork, but
+ * the code complexity needed to track that state correctly isn't justified by
+ * the small overhead of an unnecessary one.
+ */
+STATIC int
+xrep_parent_ensure_attr_fork(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ error = xfs_attr_add_fork(sc->tempip,
+ sizeof(struct xfs_attr_sf_hdr), 1);
+ if (error)
+ return error;
+ return xfs_attr_add_fork(sc->ip, sizeof(struct xfs_attr_sf_hdr), 1);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new attribute structure.
+ */
+STATIC int
+xrep_parent_finalize_tempfile(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible xattr updates.
+ * Replay all queued parent pointer updates into the tempfile before
+ * exchanging the contents, even if that means dropping the ILOCKs and
+ * the transaction.
+ */
+ do {
+ error = xrep_parent_replay_updates(rp);
+ if (error)
+ return error;
+
+ error = xrep_parent_ensure_attr_fork(rp);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rp->tx);
+ if (error)
+ return error;
+
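+		/*
+		 * If the live update hooks stashed no new pptr updates while
+		 * the file was unlocked, we're ready to exchange the attr
+		 * fork mappings. Otherwise, drop the transaction and ILOCKs
+		 * and replay the new updates before trying again.
+		 */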
+ if (xfarray_length(rp->pptr_recs) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/*
+ * Replay all the stashed parent pointers into the temporary file, copy all
+ * the non-pptr xattrs from the file being repaired into the temporary file,
+ * and exchange the attr fork contents atomically.
+ */
+STATIC int
+xrep_parent_rebuild_pptrs(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t parent_ino = NULLFSINO;
+ int error;
+
+ /*
+	 * Copy non-pptr xattrs from the file being repaired into the
+ * temporary file's xattr structure. We hold sc->ip's IOLOCK, which
+ * prevents setxattr/removexattr calls from occurring, but renames
+ * update the parent pointers without holding IOLOCK. If we detect
+ * stale attr structures, we restart the scan but only flush at the
+ * end.
+ */
+ error = xrep_parent_copy_xattrs(rp);
+ if (error)
+ return error;
+
+ /*
+ * Cancel the empty transaction that we used to walk and copy attrs,
+ * and drop the ILOCK so that we can take the IOLOCK on the temporary
+ * file. We still hold sc->ip's IOLOCK.
+ */
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+	 * all the stashed pptr updates to the tempfile. After this point,
+ * we're ready to exchange the attr fork mappings.
+ */
+ error = xrep_parent_finalize_tempfile(rp);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing pptr fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rp->pscan.iscan))
+ return -ECANCELED;
+
+ /*
+ * Exchange the attr fork contents and junk the old attr fork contents,
+ * which are now in the tempfile.
+ */
+ error = xrep_xattr_swap(sc, &rp->tx);
+ if (error)
+ return error;
+ error = xrep_xattr_reset_tempfile_fork(sc);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target file.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+
+ /*
+ * We've committed the new parent pointers. Find at least one parent
+ * so that we can decide if we're moving this file to the orphanage.
+ * For this purpose, root directories are their own parents.
+ */
+ if (sc->ip == sc->mp->m_rootip) {
+ xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino);
+ } else {
+ error = xrep_parent_lookup_pptrs(sc, &parent_ino);
+ if (error)
+ return error;
+ if (parent_ino != NULLFSINO)
+ xrep_findparent_scan_found(&rp->pscan, parent_ino);
+ }
+ return 0;
+}
+
+/*
+ * Commit the new parent pointer structure (currently only the dotdot entry) to
+ * the file that we're repairing.
+ */
+STATIC int
+xrep_parent_rebuild_tree(
+ struct xrep_parent *rp)
+{
+ int error;
+
+ if (xfs_has_parent(rp->sc->mp)) {
+ error = xrep_parent_rebuild_pptrs(rp);
+ if (error)
+ return error;
+ }
+
+ if (rp->pscan.parent_ino == NULLFSINO) {
+ if (xrep_orphanage_can_adopt(rp->sc))
+ return xrep_parent_move_to_orphanage(rp);
+ return -EFSCORRUPTED;
+ }
+
+ if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode))
+ return xrep_parent_reset_dotdot(rp);
+
+ return 0;
+}
+
+/* Count the number of parent pointers. */
+STATIC int
+xrep_parent_count_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
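+	/* Only count attrs that parse as valid parent pointers. */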
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ rp->parents++;
+ return 0;
+}
+
+/*
+ * After all parent pointer rebuilding and adoption activity completes,
+ * recompute the link count of this nondirectory from the parent pointers
+ * that the filesystem scan rebuilt.
+ */
+STATIC int
+xrep_parent_set_nondir_nlink(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_perag *pag;
+ bool joined = false;
+ int error;
+
+ /* Count parent pointers so we can reset the file link count. */
+ rp->parents = 0;
+ error = xchk_xattr_walk(sc, ip, xrep_parent_count_pptr, NULL, rp);
+ if (error)
+ return error;
+
+ if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+
+ /*
+ * The file is on the unlinked list but we found parents.
+ * Remove the file from the unlinked list.
+ */
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, ip->i_ino));
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iunlink_remove(sc->tp, pag, ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ } else if (rp->parents == 0 && !xfs_inode_on_unlinked_list(ip)) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+
+ /*
+ * The file is not on the unlinked list but we found no
+ * parents. Add the file to the unlinked list.
+ */
+ error = xfs_iunlink(sc->tp, ip);
+ if (error)
+ return error;
+ }
+
+ /* Set the correct link count. */
+ if (VFS_I(ip)->i_nlink != rp->parents) {
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
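+		/*
+		 * Clamp the link count at XFS_NLINK_PINNED if this file has
+		 * more parents than the link count can represent.
+		 */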
+ set_nlink(VFS_I(ip), min_t(unsigned long long, rp->parents,
+ XFS_NLINK_PINNED));
+ }
+
+ /* Log the inode to keep it moving forward if we dirtied anything. */
+ if (joined)
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/* Set up the filesystem scan so we can look for parents. */
+STATIC int
+xrep_parent_setup_scan(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ char *descr;
+ struct xfs_da_geometry *geo = sc->mp->m_attr_geo;
+ int max_len;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_findparent_scan_start(sc, &rp->pscan);
+
+ /* Buffers for copying non-pptr attrs to the tempfile */
+ rp->xattr_name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS);
+ if (!rp->xattr_name)
+ return -ENOMEM;
+
+ /*
+ * Allocate enough memory to handle loading local attr values from the
+ * xfblob data while flushing stashed attrs to the temporary file.
+ * We only realloc the buffer when salvaging remote attr values, so
+ * TRY_HARDER means we allocate the maximal attr value size.
+ */
+ if (sc->flags & XCHK_TRY_HARDER)
+ max_len = XATTR_SIZE_MAX;
+ else
+ max_len = xfs_attr_leaf_entsize_local_max(geo->blksize);
+ error = xrep_parent_alloc_xattr_value(rp, max_len);
+ if (error)
+ goto out_xattr_name;
+
+ /* Set up some staging memory for logging parent pointer updates. */
+ descr = xchk_xfile_ino_descr(sc, "parent pointer entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_pptr),
+ &rp->pptr_recs);
+ kfree(descr);
+ if (error)
+ goto out_xattr_value;
+
+ descr = xchk_xfile_ino_descr(sc, "parent pointer names");
+ error = xfblob_create(descr, &rp->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_recs;
+
+ /* Set up some storage for copying attrs before the mapping exchange */
+ descr = xchk_xfile_ino_descr(sc,
+ "parent pointer retained xattr entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_parent_xattr),
+ &rp->xattr_records);
+ kfree(descr);
+ if (error)
+ goto out_names;
+
+ descr = xchk_xfile_ino_descr(sc,
+ "parent pointer retained xattr values");
+ error = xfblob_create(descr, &rp->xattr_blobs);
+ kfree(descr);
+ if (error)
+ goto out_attr_keys;
+
+ error = __xrep_findparent_scan_start(sc, &rp->pscan,
+ xrep_parent_live_update);
+ if (error)
+ goto out_attr_values;
+
+ return 0;
+
+out_attr_values:
+ xfblob_destroy(rp->xattr_blobs);
+ rp->xattr_blobs = NULL;
+out_attr_keys:
+ xfarray_destroy(rp->xattr_records);
+ rp->xattr_records = NULL;
+out_names:
+ xfblob_destroy(rp->pptr_names);
+ rp->pptr_names = NULL;
+out_recs:
+ xfarray_destroy(rp->pptr_recs);
+ rp->pptr_recs = NULL;
+out_xattr_value:
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+out_xattr_name:
+ kvfree(rp->xattr_name);
+ rp->xattr_name = NULL;
+ return error;
+}
+
+int
+xrep_parent(
+ struct xfs_scrub *sc)
+{
+ struct xrep_parent *rp = sc->buf;
+ int error;
+
+ /*
+ * When the parent pointers feature is enabled, repairs are committed
+ * by atomically committing a new xattr structure and reaping the old
+ * attr fork. Reaping requires rmap and exchange-range to be enabled.
+ */
+ if (xfs_has_parent(sc->mp)) {
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+ }
+
+ error = xrep_parent_setup_scan(rp);
+ if (error)
+ return error;
+
+ if (xfs_has_parent(sc->mp))
+ error = xrep_parent_scan_dirtree(rp);
+ else
+ error = xrep_parent_find_dotdot(rp);
+ if (error)
+ goto out_teardown;
+
+ /* Last chance to abort before we start committing dotdot fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_teardown;
+
+ error = xrep_parent_rebuild_tree(rp);
+ if (error)
+ goto out_teardown;
+ if (xfs_has_parent(sc->mp) && !S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ error = xrep_parent_set_nondir_nlink(rp);
+ if (error)
+ goto out_teardown;
+ }
+
+ error = xrep_defer_finish(sc);
+
+out_teardown:
+ xrep_parent_teardown(rp);
+ return error;
+}
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index 0bab4c30cb85..90cd1512bba9 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -77,8 +77,6 @@ xrep_quota_item_fill_bmap_hole(
irec, &nmaps);
if (error)
return error;
- if (nmaps != 1)
- return -ENOSPC;
dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock);
@@ -444,10 +442,6 @@ xrep_quota_data_fork(
XFS_BMAPI_CONVERT, 0, &nrec, &nmap);
if (error)
goto out;
- if (nmap != 1) {
- error = -ENOSPC;
- goto out;
- }
ASSERT(nrec.br_startoff == irec.br_startoff);
ASSERT(nrec.br_blockcount == irec.br_blockcount);
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index dfdcb96b6c16..01c9a2dc0f2c 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_error.h"
#include "scrub/scrub.h"
+#include "scrub/common.h"
#include "scrub/readdir.h"
/* Call a function for every entry in a shortform directory. */
@@ -99,7 +100,7 @@ xchk_dir_walk_block(
unsigned int off, next_off, end;
int error;
- error = xfs_dir3_block_read(sc->tp, dp, &bp);
+ error = xfs_dir3_block_read(sc->tp, dp, dp->i_ino, &bp);
if (error)
return error;
@@ -175,7 +176,7 @@ xchk_read_leaf_dir_buf(
if (new_off > *curoff)
*curoff = new_off;
- return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp);
+ return xfs_dir3_data_read(tp, dp, dp->i_ino, map.br_startoff, 0, bpp);
}
/* Call a function for every entry in a leaf directory. */
@@ -273,8 +274,8 @@ xchk_dir_walk(
.dp = dp,
.geo = dp->i_mount->m_dir_geo,
.trans = sc->tp,
+ .owner = dp->i_ino,
};
- bool isblock;
int error;
if (xfs_is_shutdown(dp->i_mount))
@@ -283,22 +284,17 @@ xchk_dir_walk(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ switch (xfs_dir2_format(&args, &error)) {
+ case XFS_DIR2_FMT_SF:
return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
-
- /* dir2 functions require that the data fork is loaded */
- error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
- if (error)
- return error;
-
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- return error;
-
- if (isblock)
+ case XFS_DIR2_FMT_BLOCK:
return xchk_dir_walk_block(sc, dp, dirent_fn, priv);
-
- return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+ case XFS_DIR2_FMT_LEAF:
+ case XFS_DIR2_FMT_NODE:
+ return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+ default:
+ return error;
+ }
}
/*
@@ -324,50 +320,102 @@ xchk_dir_lookup(
.hashval = xfs_dir2_hashname(dp->i_mount, name),
.whichfork = XFS_DATA_FORK,
.op_flags = XFS_DA_OP_OKNOENT,
+ .owner = dp->i_ino,
};
- bool isblock, isleaf;
int error;
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ /*
+ * A temporary directory's block headers are written with the owner
+ * set to sc->ip, so we must switch the owner here for the lookup.
+ */
+ if (dp == sc->tempip)
+ args.owner = sc->ip->i_ino;
+
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- error = xfs_dir2_sf_lookup(&args);
- goto out_check_rval;
- }
+ error = xfs_dir_lookup_args(&args);
+ if (!error)
+ *ino = args.inumber;
+ return error;
+}
- /* dir2 functions require that the data fork is loaded */
- error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
- if (error)
- return error;
+/*
+ * Try to grab the IOLOCK and ILOCK of sc->ip and ip, returning @ip's lock
+ * state. The caller may have a transaction, so we must use trylock for both
+ * IOLOCKs.
+ */
+static inline unsigned int
+xchk_dir_trylock_both(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ return 0;
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- return error;
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ goto parent_iolock;
- if (isblock) {
- error = xfs_dir2_block_lookup(&args);
- goto out_check_rval;
- }
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ goto parent_ilock;
- error = xfs_dir2_isleaf(&args, &isleaf);
- if (error)
- return error;
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+
+parent_ilock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+parent_iolock:
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Try for a limited time to grab the IOLOCK and ILOCK of both the scrub target
+ * (@sc->ip) and the inode at the other end (@ip) of a directory or parent
+ * pointer link so that we can check that link.
+ *
+ * We do not know ahead of time that the directory tree is /not/ corrupt, so we
+ * cannot use the "lock two inode" functions because we do not know that there
+ * is not a racing thread trying to take the locks in opposite order. First
+ * take IOLOCK_EXCL of the scrub target, and then try to take IOLOCK_SHARED
+ * of @ip to synchronize with the VFS. Next, take ILOCK_EXCL of the scrub
+ * target and @ip to synchronize with XFS.
+ *
+ * If the trylocks succeed, *lockmode will be set to the locks held for @ip;
+ * @sc->ilock_flags will be set for the locks held for @sc->ip; and zero will
+ * be returned. If not, returns -EDEADLOCK to try again; or -ETIMEDOUT if
+ * XCHK_TRY_HARDER was set. Returns -EINTR if the process has been killed.
+ */
+int
+xchk_dir_trylock_for_pptrs(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int *lockmode)
+{
+ unsigned int nr;
+ int error = 0;
+
+ ASSERT(sc->ilock_flags == 0);
+
+ for (nr = 0; nr < HZ; nr++) {
+ *lockmode = xchk_dir_trylock_both(sc, ip);
+ if (*lockmode)
+ return 0;
- if (isleaf) {
- error = xfs_dir2_leaf_lookup(&args);
- goto out_check_rval;
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(1);
}
- error = xfs_dir2_node_lookup(&args);
+ if (sc->flags & XCHK_TRY_HARDER) {
+ xchk_set_incomplete(sc);
+ return -ETIMEDOUT;
+ }
-out_check_rval:
- if (error == -EEXIST)
- error = 0;
- if (!error)
- *ino = args.inumber;
- return error;
+ return -EDEADLOCK;
}
diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h
index 55787f4df123..da501877a64d 100644
--- a/fs/xfs/scrub/readdir.h
+++ b/fs/xfs/scrub/readdir.h
@@ -16,4 +16,7 @@ int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp,
int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t *ino);
+int xchk_dir_trylock_for_pptrs(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int *lockmode);
+
#endif /* __XFS_SCRUB_READDIR_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 0252a3b5b65a..be283153c254 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -211,6 +211,48 @@ static inline void xreap_defer_finish_reset(struct xreap_state *rs)
rs->force_roll = false;
}
+/*
+ * Compute the maximum length of a buffer cache scan (in units of sectors),
+ * given a quantity of fs blocks.
+ */
+xfs_daddr_t
+xrep_bufscan_max_sectors(
+ struct xfs_mount *mp,
+ xfs_extlen_t fsblocks)
+{
+ int max_fsbs;
+
+ /* Remote xattr values are the largest buffers that we support. */
+ max_fsbs = xfs_attr3_max_rmt_blocks(mp);
+
+ return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
+}
+
+/*
+ * Return an incore buffer from a sector scan, or NULL if there are no buffers
+ * left to return.
+ */
+struct xfs_buf *
+xrep_bufscan_advance(
+ struct xfs_mount *mp,
+ struct xrep_bufscan *scan)
+{
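+	/*
+	 * The buffer cache matches only on an exact (daddr, length) pair,
+	 * so probe each possible length in turn until we find a cached
+	 * buffer or run past the largest plausible buffer size.
+	 */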
+ scan->__sector_count += scan->daddr_step;
+ while (scan->__sector_count <= scan->max_sectors) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
+ scan->__sector_count, XBF_LIVESCAN, &bp);
+ if (!error)
+ return bp;
+
+ scan->__sector_count += scan->daddr_step;
+ }
+
+ return NULL;
+}
+
/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
@@ -241,28 +283,15 @@ xreap_agextent_binval(
* of any plausible size.
*/
while (bno < agbno_next) {
- xfs_agblock_t fsbcount;
- xfs_agblock_t max_fsbs;
-
- /*
- * Max buffer size is the max remote xattr buffer size, which
- * is one fs block larger than 64k.
- */
- max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
- xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
-
- for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
- struct xfs_buf *bp = NULL;
- xfs_daddr_t daddr;
- int error;
-
- daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
- error = xfs_buf_incore(mp->m_ddev_targp, daddr,
- XFS_FSB_TO_BB(mp, fsbcount),
- XBF_LIVESCAN, &bp);
- if (error)
- continue;
-
+ struct xrep_bufscan scan = {
+ .daddr = XFS_AGB_TO_DADDR(mp, agno, bno),
+ .max_sectors = xrep_bufscan_max_sectors(mp,
+ agbno_next - bno),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
rs->invalidated++;
@@ -646,3 +675,375 @@ xrep_reap_fsblocks(
return 0;
}
+
+/*
+ * Metadata files are not supposed to share blocks with anything else.
+ * If blocks are shared, we remove the reverse mapping (thus reducing the
+ * crosslink factor); if blocks are not shared, we also need to free them.
+ *
+ * This first step determines the longest subset of the passed-in imap
+ * (starting at its beginning) that is either crosslinked or not crosslinked.
+ * The blockcount will be adjusted down as needed.
+ */
+STATIC int
+xreap_bmapi_select(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap,
+ bool *crosslinked)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_btree_cur *cur;
+ xfs_filblks_t len = 1;
+ xfs_agblock_t bno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t agbno_next;
+ int error;
+
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+ agbno_next = agbno + imap->br_blockcount;
+
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+
+ xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
+ if (error)
+ goto out_cur;
+
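+	/*
+	 * Walk forward from the second block, extending the mapping for as
+	 * long as each block's crosslinked state matches the first block's.
+	 */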
+ bno = agbno + 1;
+ while (bno < agbno_next) {
+ bool also_crosslinked;
+
+ oinfo.oi_offset++;
+ error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (also_crosslinked != *crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+ imap->br_blockcount = len;
+ trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Decide if this buffer can be joined to a transaction. This is true for most
+ * buffers, but there are two cases that we want to catch: large remote xattr
+ * value buffers are not logged and can overflow the buffer log item dirty
+ * bitmap size; and oversized cached buffers if things have really gone
+ * haywire.
+ */
+static inline bool
+xreap_buf_loggable(
+ const struct xfs_buf *bp)
+{
+ int i;
+
+ for (i = 0; i < bp->b_map_count; i++) {
+ int chunks;
+ int map_size;
+
+ chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+ XFS_BLF_CHUNK);
+ map_size = DIV_ROUND_UP(chunks, NBWORD);
+ if (map_size > XFS_BLF_DATAMAP_SIZE)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Invalidate any buffers for this file mapping. The @imap blockcount may be
+ * adjusted downward if we need to roll the transaction.
+ */
+STATIC int
+xreap_bmapi_binval(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
+ int bmap_flags = xfs_bmapi_aflag(whichfork);
+ xfs_fileoff_t off;
+ xfs_fileoff_t max_off;
+ xfs_extlen_t scan_blocks;
+ xfs_agnumber_t agno = sc->sa.pag->pag_agno;
+ xfs_agblock_t bno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t agbno_next;
+ unsigned int invalidated = 0;
+ int error;
+
+ /*
+ * Avoid invalidating AG headers and post-EOFS blocks because we never
+ * own those.
+ */
+ agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+ agbno_next = agbno + imap->br_blockcount;
+ if (!xfs_verify_agbno(pag, agbno) ||
+ !xfs_verify_agbno(pag, agbno_next - 1))
+ return 0;
+
+ /*
+ * Buffers for file blocks can span multiple contiguous mappings. This
+ * means that for each block in the mapping, there could exist an
+ * xfs_buf indexed by that block with any length up to the maximum
+ * buffer size (remote xattr values) or to the next hole in the fork.
+ * To set up our binval scan, first we need to figure out the location
+ * of the next hole.
+ */
+ off = imap->br_startoff + imap->br_blockcount;
+ max_off = off + xfs_attr3_max_rmt_blocks(mp);
+ while (off < max_off) {
+ struct xfs_bmbt_irec hmap;
+ int nhmaps = 1;
+
+ error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
+ &nhmaps, bmap_flags);
+ if (error)
+ return error;
+ if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_bmap_is_real_extent(&hmap))
+ break;
+
+ off = hmap.br_startoff + hmap.br_blockcount;
+ }
+ scan_blocks = off - imap->br_startoff;
+
+ trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);
+
+ /*
+ * If there are incore buffers for these blocks, invalidate them. If
+ * we can't (try)lock the buffer we assume it's owned by someone else
+ * and leave it alone. The buffer cache cannot detect aliasing, so
+ * employ nested loops to detect incore buffers of any plausible size.
+ */
+ while (bno < agbno_next) {
+ struct xrep_bufscan scan = {
+ .daddr = XFS_AGB_TO_DADDR(mp, agno, bno),
+ .max_sectors = xrep_bufscan_max_sectors(mp,
+ scan_blocks),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
+ if (xreap_buf_loggable(bp)) {
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ } else {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+ invalidated++;
+
+ /*
+ * Stop invalidating if we've hit the limit; we should
+ * still have enough reservation left to free however
+ * much of the mapping we've seen so far.
+ */
+ if (invalidated > XREAP_MAX_BINVAL) {
+ imap->br_blockcount = agbno_next - bno;
+ goto out;
+ }
+ }
+
+ bno++;
+ scan_blocks--;
+ }
+
+out:
+ trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
+ return 0;
+}
+
+/*
+ * Dispose of as much of the beginning of this file fork mapping as possible.
+ * The number of blocks disposed of is returned in @imap->br_blockcount.
+ */
+STATIC int
+xrep_reap_bmapi_iter(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap,
+ bool crosslinked)
+{
+ int error;
+
+ if (crosslinked) {
+ /*
+ * If there are other rmappings, this block is cross linked and
+ * must not be freed. Remove the reverse mapping, leave the
+ * buffer cache in its possibly confused state, and move on.
+ * We don't want to risk discarding valid data buffers from
+ * anybody else who thinks they own the block, even though that
+ * runs the risk of stale buffer warnings in the future.
+ */
+ trace_xreap_dispose_unmap_extent(sc->sa.pag,
+ XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
+ imap->br_blockcount);
+
+ /*
+ * Schedule removal of the mapping from the fork. We use
+ * deferred log intents in this function to control the exact
+ * sequence of metadata updates.
+ */
+ xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -(int64_t)imap->br_blockcount);
+ xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ return 0;
+ }
+
+ /*
+ * If the block is not crosslinked, we can invalidate all the incore
+ * buffers for the extent, and then free the extent. This is a bit of
+ * a mess since we don't detect discontiguous buffers that are indexed
+ * by a block starting before the first block of the extent but overlap
+ * anyway.
+ */
+ trace_xreap_dispose_free_extent(sc->sa.pag,
+ XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
+ imap->br_blockcount);
+
+ /*
+ * Invalidate as many buffers as we can, starting at the beginning of
+ * this mapping. If this function sets blockcount to zero, the
+ * transaction is full of logged buffer invalidations, so we need to
+ * return early so that we can roll and retry.
+ */
+ error = xreap_bmapi_binval(sc, ip, whichfork, imap);
+ if (error || imap->br_blockcount == 0)
+ return error;
+
+ /*
+ * Schedule removal of the mapping from the fork. We use deferred log
+ * intents in this function to control the exact sequence of metadata
+ * updates.
+ */
+ xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -(int64_t)imap->br_blockcount);
+ return xfs_free_extent_later(sc->tp, imap->br_startblock,
+ imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true);
+}
+
+/*
+ * Dispose of as much of this file extent as we can. Upon successful return,
+ * the imap will reflect the mapping that was removed from the fork.
+ */
+STATIC int
+xreap_ifork_extent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap)
+{
+ xfs_agnumber_t agno;
+ bool crosslinked;
+ int error;
+
+ ASSERT(sc->sa.pag == NULL);
+
+ trace_xreap_ifork_extent(sc, ip, whichfork, imap);
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
+ sc->sa.pag = xfs_perag_get(sc->mp, agno);
+ if (!sc->sa.pag)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
+ if (error)
+ goto out_pag;
+
+ /*
+ * Decide the fate of the blocks at the beginning of the mapping, then
+ * update the mapping to use it with the unmap calls.
+ */
+ error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
+ if (error)
+ goto out_agf;
+
+ error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
+ if (error)
+ goto out_agf;
+
+out_agf:
+ xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
+ sc->sa.agf_bp = NULL;
+out_pag:
+ xfs_perag_put(sc->sa.pag);
+ sc->sa.pag = NULL;
+ return error;
+}
+
+/*
+ * Dispose of each block mapped to the given fork of the given file. Callers
+ * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork
+ * must not have any delalloc reservations.
+ */
+int
+xrep_reap_ifork(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ xfs_fileoff_t off = 0;
+ int bmap_flags = xfs_bmapi_aflag(whichfork);
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(ip == sc->ip || ip == sc->tempip);
+ ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
+
+ while (off < XFS_MAX_FILEOFF) {
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1;
+
+ /* Read the next extent, skip past holes and delalloc. */
+ error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
+ &nimaps, bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * If this is a real space mapping, reap as much of it as we
+ * can in a single transaction.
+ */
+ if (xfs_bmap_is_real_extent(&imap)) {
+ error = xreap_ifork_extent(sc, ip, whichfork, &imap);
+ if (error)
+ return error;
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ off = imap.br_startoff + imap.br_blockcount;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index 0b69f16dd98f..3f2f1775e29d 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -13,5 +13,26 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
const struct xfs_owner_info *oinfo);
+int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
+
+/* Buffer cache scan context. */
+struct xrep_bufscan {
+ /* Disk address for the buffers we want to scan. */
+ xfs_daddr_t daddr;
+
+ /* Maximum number of sectors to scan. */
+ xfs_daddr_t max_sectors;
+
+ /* Each round, increment the search length by this number of sectors. */
+ xfs_daddr_t daddr_step;
+
+ /* Internal scan state; initialize to zero. */
+ xfs_daddr_t __sector_count;
+};
+
+xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp,
+ xfs_extlen_t fsblocks);
+struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp,
+ struct xrep_bufscan *scan);
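+
+/*
+ * Example usage, following the pattern in xreap_agextent_binval()
+ * (mp, agno, agbno, and fsblocks are caller-provided values):
+ *
+ *	struct xrep_bufscan scan = {
+ *		.daddr = XFS_AGB_TO_DADDR(mp, agno, agbno),
+ *		.max_sectors = xrep_bufscan_max_sectors(mp, fsblocks),
+ *		.daddr_step = XFS_FSB_TO_BB(mp, 1),
+ *	};
+ *	struct xfs_buf *bp;
+ *
+ *	while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL)
+ *		...;	// invalidate or release bp
+ */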
#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index f43dce771cdd..67478294f11a 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -32,6 +32,10 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_dir2.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -39,6 +43,7 @@
#include "scrub/bitmap.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
+#include "scrub/attr_repair.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -290,7 +295,7 @@ xrep_calc_ag_resblks(
icount = pag->pagi_count;
} else {
/* Try to get the actual counters from disk. */
- error = xfs_ialloc_read_agi(pag, NULL, &bp);
+ error = xfs_ialloc_read_agi(pag, NULL, 0, &bp);
if (!error) {
icount = pag->pagi_count;
xfs_buf_relse(bp);
@@ -724,7 +729,7 @@ xrep_update_qflags(
xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
no_update:
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
}
/* Force a quotacheck the next time we mount. */
@@ -908,7 +913,7 @@ xrep_reinit_pagi(
ASSERT(xfs_perag_initialised_agi(pag));
clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
- error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &bp);
if (error)
return error;
@@ -934,7 +939,7 @@ xrep_ag_init(
ASSERT(!sa->pag);
- error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &sa->agi_bp);
if (error)
return error;
@@ -963,9 +968,7 @@ xrep_reset_perag_resv(
ASSERT(sc->tp);
sc->flags &= ~XREP_RESET_PERAG_RESV;
- error = xfs_ag_resv_free(sc->sa.pag);
- if (error)
- goto out;
+ xfs_ag_resv_free(sc->sa.pag);
error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
if (error == -ENOSPC) {
xfs_err(sc->mp,
@@ -974,7 +977,6 @@ xrep_reset_perag_resv(
error = 0;
}
-out:
return error;
}
@@ -1004,55 +1006,27 @@ xrep_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
- __u32 smtype = sc->sm->sm_type;
- __u32 smflags = sc->sm->sm_flags;
- unsigned int sick_mask = sc->sick_mask;
+ struct xfs_scrub_subord *sub;
int error;
/*
- * Let's see if the inode needs repair. We're going to open-code calls
- * to the scrub and repair functions so that we can hang on to the
+ * Let's see if the inode needs repair. Use a subordinate scrub context
+ * to call the scrub and repair functions so that we can hang on to the
* resources that we already acquired instead of using the standard
* setup/teardown routines.
*/
- sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
- sc->sm->sm_type = scrub_type;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xchk_bmap_attr(sc);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- }
+ sub = xchk_scrub_create_subord(sc, scrub_type);
+ error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
-
- if (!xrep_will_attempt(sc))
+ if (!xrep_will_attempt(&sub->sc))
goto out;
/*
* Repair some part of the inode. This will potentially join the inode
* to the transaction.
*/
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xrep_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xrep_bmap(sc, XFS_DATA_FORK, false);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xrep_bmap(sc, XFS_ATTR_FORK, false);
- break;
- }
+ error = sub->sc.ops->repair(&sub->sc);
if (error)
goto out;
@@ -1061,10 +1035,10 @@ xrep_metadata_inode_subtype(
* that the inode will not be joined to the transaction when we exit
* the function.
*/
- error = xfs_defer_finish(&sc->tp);
+ error = xfs_defer_finish(&sub->sc.tp);
if (error)
goto out;
- error = xfs_trans_roll(&sc->tp);
+ error = xfs_trans_roll(&sub->sc.tp);
if (error)
goto out;
@@ -1072,31 +1046,18 @@ xrep_metadata_inode_subtype(
* Clear the corruption flags and re-check the metadata that we just
* repaired.
*/
- sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xchk_bmap_attr(sc);
- break;
- }
+ sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
/* If corruption persists, the repair has failed. */
- if (xchk_needs_repair(sc->sm)) {
+ if (xchk_needs_repair(sub->sc.sm)) {
error = -EFSCORRUPTED;
goto out;
}
out:
- sc->sick_mask = sick_mask;
- sc->sm->sm_type = smtype;
- sc->sm->sm_flags = smflags;
+ xchk_scrub_free_subord(sub);
return error;
}
@@ -1136,6 +1097,17 @@ xrep_metadata_inode_forks(
return error;
}
+ /* Clear the attr fork, since metadata files shouldn't have one. */
+ if (xfs_inode_hasattr(sc->ip)) {
+ if (!dirty) {
+ dirty = true;
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ }
+ error = xrep_xattr_reset_fork(sc);
+ if (error)
+ return error;
+ }
+
/*
* If we modified the inode, roll the transaction but don't rejoin the
* inode to the new transaction because xrep_bmap_data can do that.
@@ -1201,3 +1173,34 @@ xrep_trans_cancel_hook_dummy(
current->journal_info = *cookiep;
*cookiep = NULL;
}
+
+/*
+ * See if this buffer can pass the given ->verify_struct() function.
+ *
+ * If the buffer already has ops attached and they're not the ones that were
+ * passed in, we reject the buffer. Otherwise, we perform the structure test
+ * (note that we do not check CRCs) and return the outcome of the test. The
+ * buffer ops and error state are left unchanged.
+ */
+bool
+xrep_buf_verify_struct(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ const struct xfs_buf_ops *old_ops = bp->b_ops;
+ xfs_failaddr_t fa;
+ int old_error;
+
+ if (old_ops) {
+ if (old_ops != ops)
+ return false;
+ }
+
+ old_error = bp->b_error;
+ bp->b_ops = ops;
+ fa = bp->b_ops->verify_struct(bp);
+ bp->b_ops = old_ops;
+ bp->b_error = old_error;
+
+ return fa == NULL;
+}
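
[Editor's note: a sketch of the intended call pattern for xrep_buf_verify_struct() — read the buffer, probe it against candidate ops, and release it with the verifier and error state untouched. The dir3 ops and surrounding variables are assumptions for illustration:]

	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, daddr,
			XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
	if (error)
		return error;

	/* Does this block pass the dir3 data structure checks? */
	if (xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) {
		/* ...salvage the contents... */
	}
	xfs_trans_brelse(sc->tp, bp);
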
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ce082d941459..0e0dc2bf985c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -90,6 +90,12 @@ int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
int xrep_metadata_inode_forks(struct xfs_scrub *sc);
int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
+int xrep_setup_xattr(struct xfs_scrub *sc);
+int xrep_setup_directory(struct xfs_scrub *sc);
+int xrep_setup_parent(struct xfs_scrub *sc);
+int xrep_setup_nlinks(struct xfs_scrub *sc);
+int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks);
+int xrep_setup_dirtree(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -123,11 +129,18 @@ int xrep_bmap_attr(struct xfs_scrub *sc);
int xrep_bmap_cow(struct xfs_scrub *sc);
int xrep_nlinks(struct xfs_scrub *sc);
int xrep_fscounters(struct xfs_scrub *sc);
+int xrep_xattr(struct xfs_scrub *sc);
+int xrep_directory(struct xfs_scrub *sc);
+int xrep_parent(struct xfs_scrub *sc);
+int xrep_symlink(struct xfs_scrub *sc);
+int xrep_dirtree(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xrep_rtbitmap(struct xfs_scrub *sc);
+int xrep_rtsummary(struct xfs_scrub *sc);
#else
# define xrep_rtbitmap xrep_notsupported
+# define xrep_rtsummary xrep_notsupported
#endif /* CONFIG_XFS_RT */
#ifdef CONFIG_XFS_QUOTA
@@ -145,6 +158,8 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
struct xfs_trans **tpp);
void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
+bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+
#else
#define xrep_ino_dqattach(sc) (0)
@@ -188,9 +203,19 @@ xrep_setup_nothing(
#define xrep_setup_ag_allocbt xrep_setup_nothing
#define xrep_setup_ag_rmapbt xrep_setup_nothing
#define xrep_setup_ag_refcountbt xrep_setup_nothing
+#define xrep_setup_xattr xrep_setup_nothing
+#define xrep_setup_directory xrep_setup_nothing
+#define xrep_setup_parent xrep_setup_nothing
+#define xrep_setup_nlinks xrep_setup_nothing
+#define xrep_setup_dirtree xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
+static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
+{
+ return 0;
+}
+
#define xrep_revalidate_allocbt (NULL)
#define xrep_revalidate_iallocbt (NULL)
@@ -212,6 +237,12 @@ xrep_setup_nothing(
#define xrep_quotacheck xrep_notsupported
#define xrep_nlinks xrep_notsupported
#define xrep_fscounters xrep_notsupported
+#define xrep_rtsummary xrep_notsupported
+#define xrep_xattr xrep_notsupported
+#define xrep_directory xrep_notsupported
+#define xrep_parent xrep_notsupported
+#define xrep_symlink xrep_notsupported
+#define xrep_dirtree xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index e8e07b683eab..e8080eba37d2 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -432,14 +432,6 @@ out:
return error;
}
-static inline bool
-is_rt_data_fork(
- struct xfs_inode *ip,
- int whichfork)
-{
- return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
-}
-
/*
* Iterate the block mapping btree to collect rmap records for anything in this
* fork that matches the AG. Sets @mappings_done to true if we've scanned the
@@ -578,23 +570,9 @@ xrep_rmap_scan_inode(
struct xrep_rmap *rr,
struct xfs_inode *ip)
{
- unsigned int lock_mode = 0;
+ unsigned int lock_mode = xrep_rmap_scan_ilock(ip);
int error;
- /*
- * Directory updates (create/link/unlink/rename) drop the directory's
- * ILOCK before finishing any rmapbt updates associated with directory
- * shape changes. For this scan to coordinate correctly with the live
- * update hook, we must take the only lock (i_rwsem) that is held all
- * the way to dir op completion. This will get fixed by the parent
- * pointer patchset.
- */
- if (S_ISDIR(VFS_I(ip)->i_mode)) {
- lock_mode = XFS_IOLOCK_SHARED;
- xfs_ilock(ip, lock_mode);
- }
- lock_mode |= xrep_rmap_scan_ilock(ip);
-
/* Check the data fork. */
error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
if (error)
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index 46f5d5f605c9..0fef98e9f834 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -108,8 +108,6 @@ xrep_rtbitmap_data_mappings(
0, &map, &nmaps);
if (error)
return error;
- if (nmaps != 1)
- return -EFSCORRUPTED;
/* Commit new extent and all deferred work. */
error = xrep_defer_finish(sc);
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 5055092bd9e8..3fee603f5244 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -17,10 +17,14 @@
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "xfs_sb.h"
+#include "xfs_exchmaps.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/xfile.h"
+#include "scrub/repair.h"
+#include "scrub/tempexch.h"
+#include "scrub/rtsummary.h"
/*
* Realtime Summary
@@ -32,18 +36,6 @@
* (potentially large) amount of data in pageable memory.
*/
-struct xchk_rtsummary {
- struct xfs_rtalloc_args args;
-
- uint64_t rextents;
- uint64_t rbmblocks;
- uint64_t rsumsize;
- unsigned int rsumlevels;
-
- /* Memory buffer for the summary comparison. */
- union xfs_suminfo_raw words[];
-};
-
/* Set us up to check the rtsummary file. */
int
xchk_setup_rtsummary(
@@ -60,6 +52,12 @@ xchk_setup_rtsummary(
return -ENOMEM;
sc->buf = rts;
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtsummary(sc, rts);
+ if (error)
+ return error;
+ }
+
/*
* Create an xfile to construct a new rtsummary file. The xfile allows
* us to avoid pinning kernel memory for this purpose.
@@ -70,7 +68,7 @@ xchk_setup_rtsummary(
if (error)
return error;
- error = xchk_trans_alloc(sc, 0);
+ error = xchk_trans_alloc(sc, rts->resblks);
if (error)
return error;
@@ -135,7 +133,7 @@ xfsum_store(
sumoff << XFS_WORDLOG);
}
-static inline int
+inline int
xfsum_copyout(
struct xfs_scrub *sc,
xfs_rtsumoff_t sumoff,
@@ -362,7 +360,12 @@ xchk_rtsummary(
error = xchk_rtsum_compare(sc);
out_rbm:
- /* Unlock the rtbitmap since we're done with it. */
+ /*
+ * Unlock the rtbitmap since we're done with it. All other writers of
+ * the rt free space metadata grab the bitmap and summary ILOCKs in
+ * that order, so we're still protected against allocation activities
+ * even if we continue on to the repair function.
+ */
xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
return error;
}
diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h
new file mode 100644
index 000000000000..e1d50304d8d4
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTSUMMARY_H__
+#define __XFS_SCRUB_RTSUMMARY_H__
+
+struct xchk_rtsummary {
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ struct xrep_tempexch tempexch;
+#endif
+ struct xfs_rtalloc_args args;
+
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ uint64_t rsumsize;
+ unsigned int rsumlevels;
+ unsigned int resblks;
+
+ /* suminfo position of xfile as we write buffers to disk. */
+ xfs_rtsumoff_t prep_wordoff;
+
+ /* Memory buffer for the summary comparison. */
+ union xfs_suminfo_raw words[];
+};
+
+int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo, unsigned int nr_words);
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts);
+#else
+# define xrep_setup_rtsummary(sc, rts) (0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_RTSUMMARY_H__ */
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
new file mode 100644
index 000000000000..d9e971c4c79f
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
+#include "scrub/xfile.h"
+#include "scrub/rtsummary.h"
+
+/* Set us up to repair the rtsummary file. */
+int
+xrep_setup_rtsummary(
+ struct xfs_scrub *sc,
+ struct xchk_rtsummary *rts)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ /*
+ * If we're doing a repair, we reserve enough blocks to write out a
+ * completely new summary file, plus twice as many blocks as we would
+ * need if we can only allocate one block per data fork mapping. This
+ * should cover the preallocation of the temporary file and exchanging
+ * the extent mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement rtsummary and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * rtsummary ILOCK) and cannot ask for more reservation.
+ */
+ blocks = XFS_B_TO_FSB(mp, mp->m_rsumsize);
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ rts->resblks += blocks;
+ return 0;
+}
+
+static int
+xrep_rtsummary_prep_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ void *data)
+{
+ struct xchk_rtsummary *rts = data;
+ struct xfs_mount *mp = sc->mp;
+ union xfs_suminfo_raw *ondisk;
+ int error;
+
+ rts->args.mp = sc->mp;
+ rts->args.tp = sc->tp;
+ rts->args.sumbp = bp;
+ ondisk = xfs_rsumblock_infoptr(&rts->args, 0);
+ rts->args.sumbp = NULL;
+
+ bp->b_ops = &xfs_rtbuf_ops;
+
+ error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize);
+ if (error)
+ return error;
+
+ rts->prep_wordoff += mp->m_blockwsize;
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF);
+ return 0;
+}
+
+/* Repair the realtime summary. */
+int
+xrep_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xchk_rtsummary *rts = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ xfs_filblks_t rsumblocks;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(mp))
+ return -EOPNOTSUPP;
+
+ /* Walk away if we disagree on the size of the rt bitmap. */
+ if (rts->rbmblocks != mp->m_sb.sb_rbmblocks)
+ return 0;
+
+ /* Make sure any problems with the fork are fixed. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ /*
+ * Try to take ILOCK_EXCL of the temporary file. We had better be the
+ * only ones holding onto this inode, but we can't block while holding
+ * the rtsummary file's ILOCK_EXCL.
+ */
+ while (!xrep_tempfile_ilock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ /* Make sure we have space allocated for the entire summary file. */
+ rsumblocks = XFS_B_TO_FSB(mp, rts->rsumsize);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ error = xrep_tempfile_prealloc(sc, 0, rsumblocks);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Copy the rtsummary file that we generated. */
+ error = xrep_tempfile_copyin(sc, 0, rsumblocks,
+ xrep_rtsummary_prep_buf, rts);
+ if (error)
+ return error;
+ error = xrep_tempfile_set_isize(sc, rts->rsumsize);
+ if (error)
+ return error;
+
+ /*
+ * Now exchange the contents. Nothing in repair uses the temporary
+ * buffer, so we can reuse it for the tempfile exchange information.
+ */
+ error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_contents(sc, &rts->tempexch);
+ if (error)
+ return error;
+
+ /* Reset incore state and blow out the summary cache. */
+ if (mp->m_rsum_cache)
+ memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks);
+
+ mp->m_rsumlevels = rts->rsumlevels;
+ mp->m_rsumsize = rts->rsumsize;
+
+ /* Free the old rtsummary blocks if they're not in use. */
+ return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+}
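
[Editor's note: to make the reservation estimate in xrep_setup_rtsummary() concrete, a back-of-envelope sketch with purely illustrative numbers:]

	/*
	 * Hypothetical example: 4 KiB blocks, 256 KiB rtsummary file.
	 *
	 *   XFS_B_TO_FSB(mp, rsumsize)  = 64 blocks of new summary data
	 *   xfs_bmbt_calc_size(mp, 64)  = worst-case bmbt size if every one
	 *                                 of those blocks maps to its own
	 *                                 extent
	 *
	 * Doubling the bmbt term covers both the tempfile preallocation and
	 * the mapping exchange, per the comment in the setup function.
	 */
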
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 20fac9723c08..c013f0ba4f36 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -17,6 +17,11 @@
#include "xfs_scrub.h"
#include "xfs_buf_mem.h"
#include "xfs_rmap.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
+#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -24,6 +29,8 @@
#include "scrub/health.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
+#include "scrub/tempfile.h"
+#include "scrub/orphanage.h"
/*
* Online Scrub and Repair
@@ -171,6 +178,39 @@ xchk_fsgates_disable(
sc->flags &= ~XCHK_FSGATES_ALL;
}
+/* Free the resources associated with a scrub subtype. */
+void
+xchk_scrub_free_subord(
+ struct xfs_scrub_subord *sub)
+{
+ struct xfs_scrub *sc = sub->parent_sc;
+
+ ASSERT(sc->ip == sub->sc.ip);
+ ASSERT(sc->orphanage == sub->sc.orphanage);
+ ASSERT(sc->tempip == sub->sc.tempip);
+
+ sc->sm->sm_type = sub->old_smtype;
+ sc->sm->sm_flags = sub->old_smflags |
+ (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
+ sc->tp = sub->sc.tp;
+
+ if (sub->sc.buf) {
+ if (sub->sc.buf_cleanup)
+ sub->sc.buf_cleanup(sub->sc.buf);
+ kvfree(sub->sc.buf);
+ }
+ if (sub->sc.xmbtp)
+ xmbuf_free(sub->sc.xmbtp);
+ if (sub->sc.xfile)
+ xfile_destroy(sub->sc.xfile);
+
+ sc->ilock_flags = sub->sc.ilock_flags;
+ sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
+ sc->temp_ilock_flags = sub->sc.temp_ilock_flags;
+
+ kfree(sub);
+}
+
/* Free all the resources and finish the transactions. */
STATIC int
xchk_teardown(
@@ -211,6 +251,8 @@ xchk_teardown(
sc->buf = NULL;
}
+ xrep_tempfile_rele(sc);
+ xrep_orphanage_rele(sc);
xchk_fsgates_disable(sc);
return error;
}
@@ -319,25 +361,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xchk_setup_directory,
.scrub = xchk_directory,
- .repair = xrep_notsupported,
+ .repair = xrep_directory,
},
[XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
.type = ST_INODE,
.setup = xchk_setup_xattr,
.scrub = xchk_xattr,
- .repair = xrep_notsupported,
+ .repair = xrep_xattr,
},
[XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
.type = ST_INODE,
.setup = xchk_setup_symlink,
.scrub = xchk_symlink,
- .repair = xrep_notsupported,
+ .repair = xrep_symlink,
},
[XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
.type = ST_INODE,
.setup = xchk_setup_parent,
.scrub = xchk_parent,
- .repair = xrep_notsupported,
+ .repair = xrep_parent,
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
@@ -349,7 +391,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
- .repair = xrep_notsupported,
+ .repair = xrep_rtsummary,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
@@ -393,6 +435,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.scrub = xchk_health_record,
.repair = xrep_notsupported,
},
+ [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */
+ .type = ST_INODE,
+ .setup = xchk_setup_dirtree,
+ .scrub = xchk_dirtree,
+ .has = xfs_has_parent,
+ .repair = xrep_dirtree,
+ },
};
static int
@@ -497,8 +546,38 @@ static inline void xchk_postmortem(struct xfs_scrub *sc)
}
#endif /* CONFIG_XFS_ONLINE_REPAIR */
+/*
+ * Create a new scrub context from an existing one, but with a different scrub
+ * type.
+ */
+struct xfs_scrub_subord *
+xchk_scrub_create_subord(
+ struct xfs_scrub *sc,
+ unsigned int subtype)
+{
+ struct xfs_scrub_subord *sub;
+
+ sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
+ if (!sub)
+ return ERR_PTR(-ENOMEM);
+
+ sub->old_smtype = sc->sm->sm_type;
+ sub->old_smflags = sc->sm->sm_flags;
+ sub->parent_sc = sc;
+ memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
+ sub->sc.ops = &meta_scrub_ops[subtype];
+ sub->sc.sm->sm_type = subtype;
+ sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ sub->sc.buf = NULL;
+ sub->sc.buf_cleanup = NULL;
+ sub->sc.xfile = NULL;
+ sub->sc.xmbtp = NULL;
+
+ return sub;
+}
+
/* Dispatch metadata scrubbing. */
-int
+STATIC int
xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
@@ -540,6 +619,7 @@ xfs_scrub_metadata(
sc->sm = sm;
sc->ops = &meta_scrub_ops[sm->sm_type];
sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+ sc->relax = INIT_XCHK_RELAX;
retry_op:
/*
* When repairs are allowed, prevent freezing or readonly remount while
@@ -643,3 +723,221 @@ try_harder:
run.retries++;
goto retry_op;
}
+
+/* Scrub one aspect of one piece of metadata. */
+int
+xfs_ioc_scrub_metadata(
+ struct file *file,
+ void __user *arg)
+{
+ struct xfs_scrub_metadata scrub;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&scrub, arg, sizeof(scrub)))
+ return -EFAULT;
+
+ error = xfs_scrub_metadata(file, &scrub);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &scrub, sizeof(scrub)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Decide if there have been any scrub failures up to this point. */
+static inline int
+xfs_scrubv_check_barrier(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_vec *vectors,
+ const struct xfs_scrub_vec *stop_vec)
+{
+ const struct xfs_scrub_vec *v;
+ __u32 failmask;
+
+ failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
+
+ for (v = vectors; v < stop_vec; v++) {
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
+ continue;
+
+ /*
+ * Runtime errors count as a previous failure, except the ones
+ * used to ask userspace to retry.
+ */
+ switch (v->sv_ret) {
+ case -EBUSY:
+ case -ENOENT:
+ case -EUSERS:
+ case 0:
+ break;
+ default:
+ return -ECANCELED;
+ }
+
+ /*
+ * If any of the out-flags on the scrub vector match the mask
+ * that was set on the barrier vector, that's a previous fail.
+ */
+ if (v->sv_flags & failmask)
+ return -ECANCELED;
+ }
+
+ return 0;
+}
+
+/*
+ * If the caller provided us with a nonzero inode number that isn't the ioctl
+ * file, try to grab a reference to it to eliminate all further untrusted inode
+ * lookups. If we can't get the inode, let each scrub function try again.
+ */
+STATIC struct xfs_inode *
+xchk_scrubv_open_by_handle(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_vec_head *head)
+{
+ struct xfs_trans *tp;
+ struct xfs_inode *ip;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return NULL;
+
+ error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
+ xfs_trans_cancel(tp);
+ if (error)
+ return NULL;
+
+ if (VFS_I(ip)->i_generation != head->svh_gen) {
+ xfs_irele(ip);
+ return NULL;
+ }
+
+ return ip;
+}
+
+/* Vectored scrub implementation to reduce ioctl calls. */
+int
+xfs_ioc_scrubv_metadata(
+ struct file *file,
+ void __user *arg)
+{
+ struct xfs_scrub_vec_head head;
+ struct xfs_scrub_vec_head __user *uhead = arg;
+ struct xfs_scrub_vec *vectors;
+ struct xfs_scrub_vec __user *uvectors;
+ struct xfs_inode *ip_in = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip_in->i_mount;
+ struct xfs_inode *handle_ip = NULL;
+ struct xfs_scrub_vec *v;
+ size_t vec_bytes;
+ unsigned int i;
+ int error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&head, uhead, sizeof(head)))
+ return -EFAULT;
+
+ if (head.svh_reserved)
+ return -EINVAL;
+ if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
+ return -EINVAL;
+ if (head.svh_nr == 0)
+ return 0;
+
+ vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
+ if (vec_bytes > PAGE_SIZE)
+ return -ENOMEM;
+
+ uvectors = (void __user *)(uintptr_t)head.svh_vectors;
+ vectors = memdup_user(uvectors, vec_bytes);
+ if (IS_ERR(vectors))
+ return PTR_ERR(vectors);
+
+ trace_xchk_scrubv_start(ip_in, &head);
+
+ for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
+ if (v->sv_reserved) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
+ (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ trace_xchk_scrubv_item(mp, &head, i, v);
+ }
+
+ /*
+ * If the caller wants us to do a scrub-by-handle and the file used to
+ * call the ioctl is not the same file, load the incore inode and pin
+ * it across all the scrubv actions to avoid repeated UNTRUSTED
+ * lookups. The reference is not passed to deeper layers of scrub
+ * because each scrubber gets to decide its own strategy and return
+ * values for getting an inode.
+ */
+ if (head.svh_ino && head.svh_ino != ip_in->i_ino)
+ handle_ip = xchk_scrubv_open_by_handle(mp, &head);
+
+ /* Run all the scrubbers. */
+ for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
+ struct xfs_scrub_metadata sm = {
+ .sm_type = v->sv_type,
+ .sm_flags = v->sv_flags,
+ .sm_ino = head.svh_ino,
+ .sm_gen = head.svh_gen,
+ .sm_agno = head.svh_agno,
+ };
+
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
+ v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
+ if (v->sv_ret) {
+ trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
+ break;
+ }
+
+ continue;
+ }
+
+ v->sv_ret = xfs_scrub_metadata(file, &sm);
+ v->sv_flags = sm.sm_flags;
+
+ trace_xchk_scrubv_outcome(mp, &head, i, v);
+
+ if (head.svh_rest_us) {
+ ktime_t expires;
+
+ expires = ktime_add_ns(ktime_get(),
+ head.svh_rest_us * 1000);
+ set_current_state(TASK_KILLABLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
+
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto out_free;
+ }
+ }
+
+ if (copy_to_user(uvectors, vectors, vec_bytes) ||
+ copy_to_user(uhead, &head, sizeof(head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ if (handle_ip)
+ xfs_irele(handle_ip);
+ kfree(vectors);
+ return error;
+}
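
[Editor's note: for reference, the userspace side of the vectored call might look like the following sketch. The ioctl name and struct fields mirror the kernel definitions above, but treat the whole fragment as illustrative rather than a documented ABI example:]

	struct xfs_scrub_vec		vecs[] = {
		{ .sv_type = XFS_SCRUB_TYPE_INODE },
		{ .sv_type = XFS_SCRUB_TYPE_BMBTD },
		/* Stop here if either prior vector reported corruption. */
		{ .sv_type  = XFS_SCRUB_TYPE_BARRIER,
		  .sv_flags = XFS_SCRUB_OFLAG_CORRUPT },
		{ .sv_type = XFS_SCRUB_TYPE_XATTR },
	};
	struct xfs_scrub_vec_head	head = {
		.svh_ino	= ino,
		.svh_gen	= gen,
		.svh_nr		= sizeof(vecs) / sizeof(vecs[0]),
		.svh_vectors	= (uintptr_t)vecs,
	};
	int			error;

	error = ioctl(fd, XFS_IOC_SCRUBV_METADATA, &head);
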
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 9ad65b604fe1..1bc33f010d0e 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -8,6 +8,49 @@
struct xfs_scrub;
+struct xchk_relax {
+ unsigned long next_resched;
+ unsigned int resched_nr;
+ bool interruptible;
+};
+
+/* Yield to the scheduler at most 10x per second. */
+#define XCHK_RELAX_NEXT (jiffies + (HZ / 10))
+
+#define INIT_XCHK_RELAX \
+ (struct xchk_relax){ \
+ .next_resched = XCHK_RELAX_NEXT, \
+ .resched_nr = 0, \
+ .interruptible = true, \
+ }
+
+/*
+ * Relax during a scrub operation and exit if there's a fatal signal pending.
+ *
+ * If preemption is disabled, we need to yield to the scheduler every now and
+ * then so that we don't run afoul of the soft lockup watchdog or RCU stall
+ * detector. cond_resched calls are somewhat expensive (~5ns) so we want to
+ * ratelimit this to 10x per second. Amortize the cost of the other checks by
+ * only doing it once every 100 calls.
+ */
+static inline int xchk_maybe_relax(struct xchk_relax *widget)
+{
+ /* Amortize the cost of scheduling and checking signals. */
+ if (likely(++widget->resched_nr < 100))
+ return 0;
+ widget->resched_nr = 0;
+
+ if (unlikely(widget->next_resched <= jiffies)) {
+ cond_resched();
+ widget->next_resched = XCHK_RELAX_NEXT;
+ }
+
+ if (widget->interruptible && fatal_signal_pending(current))
+ return -EINTR;
+
+ return 0;
+}
+
/*
* Standard flags for allocating memory within scrub. NOFS context is
* configured by the process allocation scope. Scrub and repair must be able
@@ -17,6 +60,13 @@ struct xfs_scrub;
#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
__GFP_RETRY_MAYFAIL))
+/*
+ * For opening files by handle for fsck operations, we don't trust the inumber
+ * or the allocation state; therefore, perform an untrusted lookup. We don't
+ * want these inodes to pollute the cache, so mark them for immediate removal.
+ */
+#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE)
+
/* Type info and names for the scrub types. */
enum xchk_type {
ST_NONE = 1, /* disabled */
@@ -105,6 +155,14 @@ struct xfs_scrub {
/* Lock flags for @ip. */
uint ilock_flags;
+ /* The orphanage, for stashing files that have lost their parent. */
+ uint orphanage_ilock_flags;
+ struct xfs_inode *orphanage;
+
+ /* A temporary file on this filesystem, for staging new metadata. */
+ struct xfs_inode *tempip;
+ uint temp_ilock_flags;
+
/* See the XCHK/XREP state flags below. */
unsigned int flags;
@@ -115,6 +173,9 @@ struct xfs_scrub {
*/
unsigned int sick_mask;
+ /* next time we want to cond_resched() */
+ struct xchk_relax relax;
+
/* State tracking for single-AG operations. */
struct xchk_ag sa;
};
@@ -141,6 +202,35 @@ struct xfs_scrub {
XCHK_FSGATES_DIRENTS | \
XCHK_FSGATES_RMAP)
+struct xfs_scrub_subord {
+ struct xfs_scrub sc;
+ struct xfs_scrub *parent_sc;
+ unsigned int old_smtype;
+ unsigned int old_smflags;
+};
+
+struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc,
+ unsigned int subtype);
+void xchk_scrub_free_subord(struct xfs_scrub_subord *sub);
+
+/*
+ * We /could/ terminate a scrub/repair operation early. If we're not
+ * in a good place to continue (fatal signal, etc.) then bail out.
+ * Note that we're careful not to make any judgements about *error.
+ */
+static inline bool
+xchk_should_terminate(
+ struct xfs_scrub *sc,
+ int *error)
+{
+ if (xchk_maybe_relax(&sc->relax)) {
+ if (*error == 0)
+ *error = -EINTR;
+ return true;
+ }
+ return false;
+}
+
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
@@ -159,6 +249,7 @@ int xchk_directory(struct xfs_scrub *sc);
int xchk_xattr(struct xfs_scrub *sc);
int xchk_symlink(struct xfs_scrub *sc);
int xchk_parent(struct xfs_scrub *sc);
+int xchk_dirtree(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
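
[Editor's note: the relax widget above is meant to be polled from long-running scan loops via xchk_should_terminate(); the canonical shape, matching the polling loops earlier in this patch, is sketched below. The per-record helper is hypothetical:]

	/* Illustrative scan loop; the per-record work is a placeholder. */
	for (i = 0; i < nr_records; i++) {
		if (xchk_should_terminate(sc, &error))
			return error;

		error = xchk_process_record(sc, i);	/* hypothetical */
		if (error)
			return error;
	}
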
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index 42cafbed94ac..7996c2335476 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -79,6 +79,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
[XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck",
[XFS_SCRUB_TYPE_NLINKS] = "nlinks",
+ [XFS_SCRUB_TYPE_DIRTREE] = "dirtree",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index d77d8a9598f6..c848bcc07cd5 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_symlink.h"
#include "xfs_health.h"
@@ -17,18 +18,28 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/health.h"
+#include "scrub/repair.h"
/* Set us up to scrub a symbolic link. */
int
xchk_setup_symlink(
struct xfs_scrub *sc)
{
+ unsigned int resblks = 0;
+ int error;
+
/* Allocate the buffer without the inode lock held. */
sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
- return xchk_setup_inode_contents(sc, 0);
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_symlink(sc, &resblks);
+ if (error)
+ return error;
+ }
+
+ return xchk_setup_inode_contents(sc, resblks);
}
/* Symbolic links. */
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
new file mode 100644
index 000000000000..d015a86ef460
--- /dev/null
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -0,0 +1,509 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_symlink.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_symlink_remote.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
+
+/*
+ * Symbolic Link Repair
+ * ====================
+ *
+ * We repair symbolic links by reading whatever target data we can find, up to
+ * the first NULL byte. If the recovered target strlen matches i_size, then
+ * we rewrite the target. In all other cases, we replace the target with an
+ * overly long string that cannot possibly resolve. The new target is written
+ * into a private hidden temporary file, and then a file contents exchange
+ * commits the new symlink target to the file being repaired.
+ */
+
+/* Set us up to repair the symlink file. */
+int
+xrep_setup_symlink(
+ struct xfs_scrub *sc,
+ unsigned int *resblks)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFLNK);
+ if (error)
+ return error;
+
+ /*
+ * If we're doing a repair, we reserve enough blocks to write out a
+ * completely new symlink file, plus twice as many blocks as we would
+ * need if we can only allocate one block per data fork mapping. This
+ * should cover the preallocation of the temporary file and exchanging
+ * the extent mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement symlink and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * symlink ILOCK) and cannot ask for more reservation.
+ */
+ blocks = xfs_symlink_blocks(sc->mp, XFS_SYMLINK_MAXLEN);
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ *resblks += blocks;
+ return 0;
+}
+
+/*
+ * Try to salvage the pathname from remote blocks. Returns the number of bytes
+ * salvaged or a negative errno.
+ */
+STATIC ssize_t
+xrep_symlink_salvage_remote(
+ struct xfs_scrub *sc)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_buf *bp;
+ char *target_buf = sc->buf;
+ xfs_failaddr_t fa;
+ xfs_filblks_t fsblocks;
+ xfs_daddr_t d;
+ loff_t len;
+ loff_t offset = 0;
+ unsigned int byte_cnt;
+ bool magic_ok;
+ bool hdr_ok;
+ int n;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int error;
+
+ /* We'll only read until the buffer is full. */
+ len = min_t(loff_t, ip->i_disk_size, XFS_SYMLINK_MAXLEN);
+ fsblocks = xfs_symlink_blocks(sc->mp, len);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ return error;
+
+ for (n = 0; n < nmaps; n++) {
+ struct xfs_dsymlink_hdr *dsl;
+
+ d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock);
+
+ /* Read the rmt block. We'll run the verifiers manually. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+ d, XFS_FSB_TO_BB(sc->mp, mval[n].br_blockcount),
+ 0, &bp, NULL);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ /* How many bytes do we expect to get out of this buffer? */
+ byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount);
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt);
+ byte_cnt = min_t(unsigned int, byte_cnt, len);
+
+ /*
+ * See if the verifiers accept this block. We're willing to
+ * salvage if the offset/byte/ino are ok and either the
+ * verifier passed or the magic is ok. Anything else and we
+ * stop dead in our tracks.
+ */
+ fa = bp->b_ops->verify_struct(bp);
+ dsl = bp->b_addr;
+ magic_ok = dsl->sl_magic == cpu_to_be32(XFS_SYMLINK_MAGIC);
+ hdr_ok = xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp);
+ if (!hdr_ok || (fa != NULL && !magic_ok))
+ break;
+
+ memcpy(target_buf + offset, dsl + 1, byte_cnt);
+
+ len -= byte_cnt;
+ offset += byte_cnt;
+ }
+ return offset;
+}
+
+/*
+ * Try to salvage an inline symlink's contents. Returns the number of bytes
+ * salvaged or a negative errno.
+ */
+STATIC ssize_t
+xrep_symlink_salvage_inline(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ char *target_buf = sc->buf;
+ char *old_target;
+ struct xfs_ifork *ifp;
+ unsigned int nr;
+
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ if (!ifp->if_data)
+ return 0;
+
+ /*
+ * If inode repair zapped the link target, pretend that we didn't find
+ * any bytes at all so that we can replace the (now totally lost) link
+ * target with a warning message.
+ */
+ old_target = ifp->if_data;
+ if (xfs_inode_has_sickness(sc->ip, XFS_SICK_INO_SYMLINK_ZAPPED) &&
+ sc->ip->i_disk_size == 1 && old_target[0] == '?')
+ return 0;
+
+ nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
+ strncpy(target_buf, ifp->if_data, nr);
+ return nr;
+}
+
+#define DUMMY_TARGET \
+ "The target of this symbolic link could not be recovered at all and " \
+ "has been replaced with this explanatory message. To avoid " \
+ "accidentally pointing to an existing file path, this message is " \
+ "longer than the maximum supported file name length. That is an " \
+ "acceptable length for a symlink target on XFS but will produce " \
+ "File Name Too Long errors if resolved."
+
+/* Salvage whatever we can of the target. */
+STATIC int
+xrep_symlink_salvage(
+ struct xfs_scrub *sc)
+{
+ char *target_buf = sc->buf;
+ ssize_t buflen = 0;
+
+ BUILD_BUG_ON(sizeof(DUMMY_TARGET) - 1 <= NAME_MAX);
+
+ /*
+ * Salvage the target if there weren't any corruption problems observed
+ * while scanning it.
+ */
+ if (!(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ buflen = xrep_symlink_salvage_inline(sc);
+ else
+ buflen = xrep_symlink_salvage_remote(sc);
+ if (buflen < 0)
+ return buflen;
+
+ /*
+ * NULL-terminate the buffer because the ondisk target does not
+ * do that for us. If salvage didn't find the exact amount of
+ * data that we expected to find, don't salvage anything.
+ */
+ target_buf[buflen] = 0;
+ if (strlen(target_buf) != sc->ip->i_disk_size)
+ buflen = 0;
+ }
+
+ /*
+ * Change an empty target into a dummy target and clear the symlink
+ * target zapped flag.
+ */
+ if (buflen == 0) {
+ sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
+ sprintf(target_buf, DUMMY_TARGET);
+ }
+
+ trace_xrep_symlink_salvage_target(sc->ip, target_buf,
+ strlen(target_buf));
+ return 0;
+}
+
+STATIC void
+xrep_symlink_local_to_remote(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ xfs_symlink_local_to_remote(tp, bp, ip, ifp, NULL);
+
+ if (!xfs_has_crc(sc->mp))
+ return;
+
+ dsl->sl_owner = cpu_to_be64(sc->ip->i_ino);
+ xfs_trans_log_buf(tp, bp, 0,
+ sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1);
+}
+
+/*
+ * Prepare both links' data forks for an exchange. Promote the tempfile from
+ * local format to extents format, and if the file being repaired has a short
+ * format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_symlink_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the temp link is in shortform format, convert that to a remote
+ * target so that we can use the atomic mapping exchange.
+ */
+ if (temp_local) {
+ int logflags = XFS_ILOG_CORE;
+
+ error = xfs_bmap_local_to_extents(sc->tp, sc->tempip, 1,
+ &logflags, XFS_DATA_FORK,
+ xrep_symlink_local_to_remote,
+ sc);
+ if (error)
+ return error;
+
+ xfs_trans_log_inode(sc->tp, sc->ip, 0);
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform data fork, convert that
+ * to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ }
+
+ return 0;
+}
+
+/* Exchange the temporary symlink's data fork with the one being repaired. */
+STATIC int
+xrep_symlink_swap(
+ struct xfs_scrub *sc)
+{
+ struct xrep_tempexch *tx = sc->buf;
+ bool ip_local, temp_local;
+ int error;
+
+ ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If both links have a local format data fork and the rebuilt
+ * remote data would fit in the repaired file's data fork, copy the
+ * contents from the tempfile and declare ourselves done.
+ */
+ if (ip_local && temp_local &&
+ sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+ xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+ return 0;
+ }
+
+ /* Otherwise, make sure both data forks are in block-mapping mode. */
+ error = xrep_symlink_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Free all the remote blocks and reset the data fork. The caller must join
+ * the inode to the transaction. This function returns with the inode joined
+ * to a clean scrub transaction.
+ */
+STATIC int
+xrep_symlink_reset_fork(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
+ int error;
+
+ /* Unmap all the remote target buffers. */
+ if (xfs_ifork_has_extents(ifp)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ trace_xrep_symlink_reset_fork(sc->tempip);
+
+ /* Reset the temp symlink target to dummy content. */
+ xfs_idestroy_fork(ifp);
+ return xfs_symlink_write_target(sc->tp, sc->tempip, sc->tempip->i_ino,
+ "?", 1, 0, 0);
+}
+
+/*
+ * Reinitialize a link target. Caller must ensure the inode is joined to
+ * the transaction.
+ */
+STATIC int
+xrep_symlink_rebuild(
+ struct xfs_scrub *sc)
+{
+ struct xrep_tempexch *tx;
+ char *target_buf = sc->buf;
+ xfs_fsblock_t fs_blocks;
+ unsigned int target_len;
+ unsigned int resblks;
+ int error;
+
+ /* How many blocks do we need? */
+ target_len = strlen(target_buf);
+ ASSERT(target_len != 0);
+ if (target_len == 0 || target_len > XFS_SYMLINK_MAXLEN)
+ return -EFSCORRUPTED;
+
+ trace_xrep_symlink_rebuild(sc->ip);
+
+ /*
+ * In preparation to write the new symlink target to the temporary
+ * file, drop the ILOCK of the file being repaired (it shouldn't be
+ * joined) and take the ILOCK of the temporary file.
+ *
+ * The VFS does not take the IOLOCK while reading a symlink (and new
+ * symlinks are hidden with INEW until they've been written) so it's
+ * possible that a readlink() could see the old corrupted contents
+ * while we're doing this.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_tempfile_ilock(sc);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /*
+ * Reserve resources to reinitialize the target. We're allowed to
+ * exceed file quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ fs_blocks = xfs_symlink_blocks(sc->mp, target_len);
+ resblks = xfs_symlink_space_res(sc->mp, target_len, fs_blocks);
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->tempip, resblks, 0,
+ true);
+ if (error)
+ return error;
+
+ /* Erase the dummy target set up by the tempfile initialization. */
+ xfs_idestroy_fork(&sc->tempip->i_df);
+ sc->tempip->i_df.if_bytes = 0;
+ sc->tempip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+
+ /* Write the salvaged target to the temporary link. */
+ error = xfs_symlink_write_target(sc->tp, sc->tempip, sc->ip->i_ino,
+ target_buf, target_len, fs_blocks, resblks);
+ if (error)
+ return error;
+
+ /*
+ * Commit the repair transaction so that we can use the atomic mapping
+ * exchange functions to compute the correct block reservations and
+ * re-lock the inodes.
+ */
+ target_buf = NULL;
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+
+ /*
+ * We're done with the temporary buffer, so we can reuse it for the
+ * tempfile contents exchange information.
+ */
+ tx = sc->buf;
+ error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, tx);
+ if (error)
+ return error;
+
+ /*
+ * Exchange the temp link's data fork with the file being repaired.
+ * This recreates the transaction and takes the ILOCKs of the file
+ * being repaired and the temporary file.
+ */
+ error = xrep_symlink_swap(sc);
+ if (error)
+ return error;
+
+ /*
+ * Release the old symlink blocks and reset the data fork of the temp
+ * link to an empty shortform link. This is the last repair action we
+ * perform on the symlink, so we don't need to clean the transaction.
+ */
+ return xrep_symlink_reset_fork(sc);
+}
+
+/* Repair a symbolic link. */
+int
+xrep_symlink(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* The rmapbt is required to reap the old data fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xrep_symlink_salvage(sc);
+ if (error)
+ return error;
+
+ /* Now reset the target. */
+ error = xrep_symlink_rebuild(sc);
+ if (error)
+ return error;
+
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h
new file mode 100644
index 000000000000..995ba187c5aa
--- /dev/null
+++ b/fs/xfs/scrub/tempexch.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_TEMPEXCH_H__
+#define __XFS_SCRUB_TEMPEXCH_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+struct xrep_tempexch {
+ struct xfs_exchmaps_req req;
+};
+
+int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork,
+ struct xrep_tempexch *ti);
+int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork,
+ struct xrep_tempexch *ti);
+
+int xrep_tempexch_contents(struct xfs_scrub *sc, struct xrep_tempexch *ti);
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_TEMPEXCH_H__ */
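
[Editor's note: tying the tempexch API together — once the tempfile holds the rebuilt contents, the repair functions in this series follow roughly the sequence below (a sketch modeled on the rtsummary repair above, not a verbatim excerpt):]

	struct xrep_tempexch	tx;
	int			error;

	/* Reserve log space for the mapping exchange. */
	error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &tx);
	if (error)
		return error;

	/* Atomically exchange the data fork mappings of the two files. */
	error = xrep_tempexch_contents(sc, &tx);
	if (error)
		return error;

	/* The old, bad blocks now belong to the tempfile; reap them. */
	error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
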
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
new file mode 100644
index 000000000000..b747b625c5ee
--- /dev/null
+++ b/fs/xfs/scrub/tempfile.c
@@ -0,0 +1,851 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_defer.h"
+#include "xfs_symlink_remote.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+
+/*
+ * Create a temporary file for reconstructing metadata, with the intention of
+ * atomically exchanging the temporary file's contents with the file that's
+ * being repaired.
+ */
+int
+xrep_tempfile_create(
+ struct xfs_scrub *sc,
+ uint16_t mode)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = NULL;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res *tres;
+ struct xfs_inode *dp = mp->m_rootip;
+ xfs_ino_t ino;
+ unsigned int resblks;
+ bool is_dir = S_ISDIR(mode);
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->tempip == NULL);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk. The temporary
+ * inode should be completely root owned so that we don't fail due to
+ * quota limits.
+ */
+ error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+ XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ if (is_dir) {
+ resblks = xfs_mkdir_space_res(mp, 0);
+ tres = &M_RES(mp)->tr_mkdir;
+ } else {
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ tres = &M_RES(mp)->tr_create_tmpfile;
+ }
+
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
+ if (error)
+ goto out_release_dquots;
+
+ /* Allocate inode, set up directory. */
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (error)
+ goto out_trans_cancel;
+ error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
+ 0, false, &sc->tempip);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Change the ownership of the inode to root. */
+ VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
+ VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
+ sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
+ xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
+
+ /*
+ * Mark our temporary file as private so that LSMs and the ACL code
+ * don't try to add their own metadata or reason about these files.
+ * The file should never be exposed to userspace.
+ */
+ VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
+ VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;
+
+ if (is_dir) {
+ error = xfs_dir_init(tp, sc->tempip, dp);
+ if (error)
+ goto out_trans_cancel;
+ } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) {
+ /*
+ * Initialize the temporary symlink with a meaningless target
+ * that won't trip the verifiers. Repair must rewrite the
+ * target with meaningful content before swapping with the file
+ * being repaired. A single-byte target will not write a
+ * remote target block, so the owner is irrelevant.
+ */
+ error = xfs_symlink_write_target(tp, sc->tempip,
+ sc->tempip->i_ino, ".", 1, 0, 0);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * The ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);
+
+ /*
+ * Put our temp file on the unlinked list so it's purged automatically.
+ * All file-based metadata being reconstructed using this file must be
+ * atomically exchanged with the original file because the contents
+ * here will be purged when the inode is dropped or log recovery cleans
+ * out the unlinked list.
+ */
+ error = xfs_iunlink(tp, sc->tempip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_release_inode;
+
+ trace_xrep_tempfile_create(sc);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ /* Finish setting up the incore / vfs context. */
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ xfs_setup_iops(sc->tempip);
+ xfs_finish_inode_setup(sc->tempip);
+
+ sc->temp_ilock_flags = 0;
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
+ */
+ if (sc->tempip) {
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ xfs_finish_inode_setup(sc->tempip);
+ xchk_irele(sc, sc->tempip);
+ }
+out_release_dquots:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
+}
+
+/* Take IOLOCK_EXCL on the temporary file, maybe. */
+bool
+xrep_tempfile_iolock_nowait(
+ struct xfs_scrub *sc)
+{
+ if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
+ sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
+ * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
+ * to avoid deadlocks and lockdep complaints.
+ */
+int
+xrep_tempfile_iolock_polled(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ while (!xrep_tempfile_iolock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ return 0;
+}
+
+/* Release IOLOCK_EXCL on the temporary file. */
+void
+xrep_tempfile_iounlock(
+ struct xfs_scrub *sc)
+{
+ xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
+ sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
+}
+
+/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
+void
+xrep_tempfile_ilock(
+ struct xfs_scrub *sc)
+{
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
+}
+
+/* Try to grab ILOCK_EXCL on the temporary file. */
+bool
+xrep_tempfile_ilock_nowait(
+ struct xfs_scrub *sc)
+{
+ if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ return true;
+ }
+
+ return false;
+}
+
+/* Unlock ILOCK_EXCL on the temporary file after an update. */
+void
+xrep_tempfile_iunlock(
+ struct xfs_scrub *sc)
+{
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
+}
+
+/*
+ * Begin the process of making changes to both the file being scrubbed and
+ * the temporary file by taking ILOCK_EXCL on both.
+ */
+void
+xrep_tempfile_ilock_both(
+ struct xfs_scrub *sc)
+{
+ xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL);
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+}
+
+/* Unlock ILOCK_EXCL on both files. */
+void
+xrep_tempfile_iunlock_both(
+ struct xfs_scrub *sc)
+{
+ xrep_tempfile_iunlock(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+}
+
+/* Release the temporary file. */
+void
+xrep_tempfile_rele(
+ struct xfs_scrub *sc)
+{
+ if (!sc->tempip)
+ return;
+
+ if (sc->temp_ilock_flags) {
+ xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
+ sc->temp_ilock_flags = 0;
+ }
+
+ xchk_irele(sc, sc->tempip);
+ sc->tempip = NULL;
+}
+
+/*
+ * Make sure that the given range of the data fork of the temporary file is
+ * mapped to written blocks. The caller must ensure that both inodes are
+ * joined to the transaction.
+ */
+int
+xrep_tempfile_prealloc(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t off,
+ xfs_filblks_t len)
+{
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t end = off + len;
+ int error;
+
+ ASSERT(sc->tempip != NULL);
+ ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));
+
+ for (; off < end; off = map.br_startoff + map.br_blockcount) {
+ int nmaps = 1;
+
+ /*
+ * If we have a real extent mapping this block then we're
+ * in ok shape.
+ */
+ error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+ if (nmaps == 0) {
+ ASSERT(nmaps != 0);
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_bmap_is_written_extent(&map))
+ continue;
+
+ /*
+ * If we find a delalloc reservation then something is very
+ * very wrong. Bail out.
+ */
+ if (map.br_startblock == DELAYSTARTBLOCK)
+ return -EFSCORRUPTED;
+
+ /*
+ * Make sure this block has a real zeroed extent allocated to
+ * it.
+ */
+ nmaps = 1;
+ error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
+ &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -EFSCORRUPTED;
+
+ trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);
+
+ /* Commit new extent and all deferred work. */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Write data to each block of a file. The given range of the tempfile's data
+ * fork must already be populated with written extents.
+ */
+int
+xrep_tempfile_copyin(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xrep_tempfile_copyin_fn prep_fn,
+ void *data)
+{
+ LIST_HEAD(buffers_list);
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ xfs_fileoff_t flush_mask;
+ xfs_fileoff_t end = off + len;
+ loff_t pos = XFS_FSB_TO_B(mp, off);
+ int error = 0;
+
+ ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
+
+ /* Flush buffers to disk every 512K */
+ flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
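+ /* e.g. with 4k blocks, flush_mask is 127, so we flush every 128 blocks */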
+
+ for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
+ struct xfs_bmbt_irec map;
+ int nmaps = 1;
+
+ /* Read block mapping for this file block. */
+ error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
+ if (error)
+ goto out_err;
+ if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
+ error = -EFSCORRUPTED;
+ goto out_err;
+ }
+
+ /* Get the metadata buffer for this offset in the file. */
+ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ mp->m_bsize, 0, &bp);
+ if (error)
+ goto out_err;
+
+ trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
+
+ /* Read in a block's worth of data from the xfile. */
+ error = prep_fn(sc, bp, data);
+ if (error) {
+ xfs_trans_brelse(sc->tp, bp);
+ goto out_err;
+ }
+
+ /* Queue buffer, and flush if we have too much dirty data. */
+ xfs_buf_delwri_queue_here(bp, &buffers_list);
+ xfs_trans_brelse(sc->tp, bp);
+
+ if (!(off & flush_mask)) {
+ error = xfs_buf_delwri_submit(&buffers_list);
+ if (error)
+ goto out_err;
+ }
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ error = xfs_buf_delwri_submit(&buffers_list);
+ if (error)
+ goto out_err;
+
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ error = -EIO;
+ goto out_err;
+ }
+
+ return 0;
+
+out_err:
+ xfs_buf_delwri_cancel(&buffers_list);
+ return error;
+}
+
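+/*
+ * Example usage (a sketch; xrep_foo_prep_block and nr_blocks are
+ * hypothetical):
+ *
+ *     error = xrep_tempfile_prealloc(sc, 0, nr_blocks);
+ *     if (error)
+ *             return error;
+ *     error = xrep_tempfile_copyin(sc, 0, nr_blocks,
+ *                     xrep_foo_prep_block, NULL);
+ *
+ * where xrep_foo_prep_block formats one block's worth of new contents
+ * into the buffer it is handed.
+ */
+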
+/*
+ * Set the temporary file's size. Caller must join the tempfile to the scrub
+ * transaction and is responsible for adjusting block mappings as needed.
+ */
+int
+xrep_tempfile_set_isize(
+ struct xfs_scrub *sc,
+ unsigned long long isize)
+{
+ if (sc->tempip->i_disk_size == isize)
+ return 0;
+
+ sc->tempip->i_disk_size = isize;
+ i_size_write(VFS_I(sc->tempip), isize);
+ return xrep_tempfile_roll_trans(sc);
+}
+
+/*
+ * Roll a repair transaction involving the temporary file. Caller must join
+ * both the temporary file and the file being scrubbed to the transaction.
+ * This function returns with both inodes joined to a new scrub transaction,
+ * or the usual negative errno.
+ */
+int
+xrep_tempfile_roll_trans(
+ struct xfs_scrub *sc)
+{
+ int error;
+
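+ /* Log the tempfile's inode core so the roll commits our updates. */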
+ xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
+ error = xrep_roll_trans(sc);
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ return 0;
+}
+
+/*
+ * Fill out the mapping exchange request in preparation for atomically
+ * committing the contents of a metadata file that we've rebuilt in the temp
+ * file.
+ */
+STATIC int
+xrep_tempexch_prep_request(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xrep_tempexch *tx)
+{
+ struct xfs_exchmaps_req *req = &tx->req;
+
+ memset(tx, 0, sizeof(struct xrep_tempexch));
+
+ /* COW forks don't exist on disk. */
+ if (whichfork == XFS_COW_FORK) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /* Both files should have the relevant forks. */
+ if (!xfs_ifork_ptr(sc->ip, whichfork) ||
+ !xfs_ifork_ptr(sc->tempip, whichfork)) {
+ ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+ ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
+ return -EINVAL;
+ }
+
+ /* Exchange all mappings in both forks. */
+ req->ip1 = sc->tempip;
+ req->ip2 = sc->ip;
+ req->startoff1 = 0;
+ req->startoff2 = 0;
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ req->flags |= XFS_EXCHMAPS_ATTR_FORK;
+ break;
+ case XFS_DATA_FORK:
+ /* Always exchange sizes when exchanging data fork mappings. */
+ req->flags |= XFS_EXCHMAPS_SET_SIZES;
+ break;
+ }
+ req->blockcount = XFS_MAX_FILEOFF;
+
+ return 0;
+}
+
+/*
+ * Fill out the mapping exchange resource estimation structures in preparation
+ * for exchanging the contents of a metadata file that we've rebuilt in the
+ * temp file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files.
+ */
+STATIC int
+xrep_tempexch_estimate(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ struct xfs_exchmaps_req *req = &tx->req;
+ struct xfs_ifork *ifp;
+ struct xfs_ifork *tifp;
+ int whichfork = xfs_exchmaps_reqfork(req);
+ int state = 0;
+
+ /*
+ * The exchmaps code only knows how to exchange file fork space
+ * mappings. Any fork data in local format must be promoted to a
+ * single block before the exchange can take place.
+ */
+ ifp = xfs_ifork_ptr(sc->ip, whichfork);
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
+ state |= 1;
+
+ tifp = xfs_ifork_ptr(sc->tempip, whichfork);
+ if (tifp->if_format == XFS_DINODE_FMT_LOCAL)
+ state |= 2;
+
+ switch (state) {
+ case 0:
+ /* Both files have mapped extents; use the regular estimate. */
+ return xfs_exchrange_estimate(req);
+ case 1:
+ /*
+ * The file being repaired is in local format, but the temp
+ * file has mapped extents. To perform the exchange, the file
+ * being repaired must have its shortform data converted to an
+ * ondisk block so that the forks will be in extents format.
+ * We need one resblk for the conversion; the number of
+ * exchanges is (worst case) the temporary file's extent count
+ * plus the block we converted.
+ */
+ req->ip1_bcount = sc->tempip->i_nblocks;
+ req->ip2_bcount = 1;
+ req->nr_exchanges = 1 + tifp->if_nextents;
+ req->resblks = 1;
+ break;
+ case 2:
+ /*
+ * The temporary file is in local format, but the file being
+ * repaired has mapped extents. To perform the exchange, the
+ * temp file must have its shortform data converted to an
+ * ondisk block, and the fork changed to extents format. We
+ * need one resblk for the conversion; the number of exchanges
+ * is (worst case) the extent count of the file being repaired
+ * plus the block we converted.
+ */
+ req->ip1_bcount = 1;
+ req->ip2_bcount = sc->ip->i_nblocks;
+ req->nr_exchanges = 1 + ifp->if_nextents;
+ req->resblks = 1;
+ break;
+ case 3:
+ /*
+ * Both forks are in local format. To perform the exchange,
+ * both files must have their shortform data converted to
+ * fsblocks, and both forks must be converted to extents
+ * format. We need two resblks for the two conversions, and
+ * the number of exchanges is 1 since there's only one block at
+ * fileoff 0. Presumably, the caller could not exchange the
+ * two inode fork areas directly.
+ */
+ req->ip1_bcount = 1;
+ req->ip2_bcount = 1;
+ req->nr_exchanges = 1;
+ req->resblks = 2;
+ break;
+ }
+
+ return xfs_exchmaps_estimate_overhead(req);
+}
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same.
+ */
+STATIC int
+xrep_tempexch_reserve_quota(
+ struct xfs_scrub *sc,
+ const struct xrep_tempexch *tx)
+{
+ struct xfs_trans *tp = sc->tp;
+ const struct xfs_exchmaps_req *req = &tx->req;
+ int64_t ddelta, rdelta;
+ int error;
+
+ /*
+ * Don't bother with a quota reservation if we're not enforcing them
+ * or the two inodes have the same dquots.
+ */
+ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+ (req->ip1->i_udquot == req->ip2->i_udquot &&
+ req->ip1->i_gdquot == req->ip2->i_gdquot &&
+ req->ip1->i_pdquot == req->ip2->i_pdquot))
+ return 0;
+
+ /*
+ * Quota reservation for each file comes from two sources. First, we
+ * need to account for any net gain in mapped blocks during the
+ * exchange. Second, we need reservation for the gross gain in mapped
+ * blocks so that we don't trip over any quota block reservation
+ * assertions. We must reserve the gross gain because the quota code
+ * subtracts from bcount the number of blocks that we unmap; it does
+ * not add that quantity back to the quota block reservation.
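+ *
+ * For example: if ip1 maps 10 blocks and ip2 maps 4, each inode gets a
+ * 10-block reservation, since ip1 unmaps 10 blocks gross with no net
+ * gain, and ip2 unmaps 4 blocks gross plus a net gain of 6.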
+ */
+ ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
+ rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+ ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
+ true);
+ if (error)
+ return error;
+
+ ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
+ rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
+ return xfs_trans_reserve_quota_nblks(tp, req->ip2,
+ ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
+ true);
+}
+
+/*
+ * Prepare an existing transaction for an atomic file contents exchange.
+ *
+ * This function fills out the mapping exchange request and resource estimation
+ * structures in preparation for exchanging the contents of a metadata file
+ * that has been rebuilt in the temp file. Next, it reserves space and quota
+ * for the transaction.
+ *
+ * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
+ * file. The caller must join both inodes to the transaction with no unlock
+ * flags, and is responsible for dropping both ILOCKs when appropriate. Only
+ * use this when those ILOCKs cannot be dropped.
+ */
+int
+xrep_tempexch_trans_reserve(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xrep_tempexch *tx)
+{
+ int error;
+
+ ASSERT(sc->tp != NULL);
+ xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
+
+ error = xrep_tempexch_prep_request(sc, whichfork, tx);
+ if (error)
+ return error;
+
+ error = xfs_exchmaps_estimate(&tx->req);
+ if (error)
+ return error;
+
+ error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
+ if (error)
+ return error;
+
+ return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Create a new transaction for a file contents exchange.
+ *
+ * This function fills out the mapping exchange request and resource
+ * estimation structures in preparation for exchanging the contents of a
+ * metadata file that has been rebuilt in the temp file. Next, it reserves
+ * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
+ * reserves quota for the transaction.
+ *
+ * The caller is responsible for dropping both ILOCKs when appropriate.
+ */
+int
+xrep_tempexch_trans_alloc(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xrep_tempexch *tx)
+{
+ unsigned int flags = 0;
+ int error;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(xfs_has_exchange_range(sc->mp));
+
+ error = xrep_tempexch_prep_request(sc, whichfork, tx);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_estimate(sc, tx);
+ if (error)
+ return error;
+
+ if (xfs_has_lazysbcount(sc->mp))
+ flags |= XFS_TRANS_RES_FDBLKS;
+
+ error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ tx->req.resblks, 0, flags, &sc->tp);
+ if (error)
+ return error;
+
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip);
+
+ return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Exchange file mappings (and hence file contents) between the file being
+ * repaired and the temporary file. Returns with both inodes locked and joined
+ * to a clean scrub transaction.
+ */
+int
+xrep_tempexch_contents(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ int error;
+
+ ASSERT(xfs_has_exchange_range(sc->mp));
+
+ xfs_exchange_mappings(sc->tp, &tx->req);
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /*
+ * If we exchanged the ondisk sizes of two metadata files, we must
+ * exchange the incore sizes as well.
+ */
+ if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
+ loff_t temp;
+
+ temp = i_size_read(VFS_I(sc->ip));
+ i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
+ i_size_write(VFS_I(sc->tempip), temp);
+ }
+
+ return 0;
+}
+
+/*
+ * Write local format data from one of the temporary file's forks into the same
+ * fork of the file being repaired, and exchange the file sizes, if appropriate.
+ * Caller must ensure that the file being repaired has enough fork space to
+ * hold all the bytes.
+ */
+void
+xrep_tempfile_copyout_local(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_ifork *temp_ifp;
+ struct xfs_ifork *ifp;
+ unsigned int ilog_flags = XFS_ILOG_CORE;
+
+ temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork);
+ ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ ASSERT(temp_ifp != NULL);
+ ASSERT(ifp != NULL);
+ ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ ASSERT(sc->tempip->i_disk_size <=
+ xfs_inode_data_fork_size(sc->ip));
+ break;
+ case XFS_ATTR_FORK:
+ ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff);
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */
+ xfs_idestroy_fork(ifp);
+ xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data,
+ temp_ifp->if_bytes);
+
+ if (whichfork == XFS_DATA_FORK) {
+ i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
+ sc->ip->i_disk_size = sc->tempip->i_disk_size;
+ }
+
+ ilog_flags |= xfs_ilog_fdata(whichfork);
+ xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags);
+}
+
+/* Decide if a given XFS inode is a temporary file for a repair. */
+bool
+xrep_is_tempfile(
+ const struct xfs_inode *ip)
+{
+ const struct inode *inode = &ip->i_vnode;
+
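+ /*
+ * Tempfiles are presumably created with S_PRIVATE set and xattr
+ * operations disabled, a combination that regular files never have.
+ */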
+ if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
+ return true;
+
+ return false;
+}
diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h
new file mode 100644
index 000000000000..e51399f595fe
--- /dev/null
+++ b/fs/xfs/scrub/tempfile.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_TEMPFILE_H__
+#define __XFS_SCRUB_TEMPFILE_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode);
+void xrep_tempfile_rele(struct xfs_scrub *sc);
+
+bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc);
+int xrep_tempfile_iolock_polled(struct xfs_scrub *sc);
+void xrep_tempfile_iounlock(struct xfs_scrub *sc);
+
+void xrep_tempfile_ilock(struct xfs_scrub *sc);
+bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc);
+void xrep_tempfile_iunlock(struct xfs_scrub *sc);
+void xrep_tempfile_iunlock_both(struct xfs_scrub *sc);
+void xrep_tempfile_ilock_both(struct xfs_scrub *sc);
+
+int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off,
+ xfs_filblks_t len);
+
+enum xfs_blft;
+
+typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc,
+ struct xfs_buf *bp, void *data);
+
+int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off,
+ xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data);
+
+int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize);
+
+int xrep_tempfile_roll_trans(struct xfs_scrub *sc);
+void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork);
+bool xrep_is_tempfile(const struct xfs_inode *ip);
+#else
+static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc)
+{
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+}
+# define xrep_is_tempfile(ip) (false)
+# define xrep_tempfile_rele(sc)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_TEMPFILE_H__ */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 3dd281d6d185..4470ad0533b8 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -19,13 +19,19 @@
#include "xfs_da_format.h"
#include "xfs_dir2.h"
#include "xfs_rmap.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/quota.h"
#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
#include "scrub/nlinks.h"
#include "scrub/fscounters.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfblob.h"
+#include "scrub/dirtree.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 5b294be52c55..92ef4cdc486e 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -26,6 +26,10 @@ struct xchk_iscan;
struct xchk_nlink;
struct xchk_fscounters;
struct xfs_rmap_update_params;
+struct xfs_parent_rec;
+enum xchk_dirpath_outcome;
+struct xchk_dirtree;
+struct xchk_dirtree_outcomes;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -64,6 +68,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -93,7 +99,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
{ XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \
{ XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \
{ XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
- { XFS_SCRUB_TYPE_HEALTHY, "healthy" }
+ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \
+ { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \
+ { XFS_SCRUB_TYPE_BARRIER, "barrier" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -169,6 +177,8 @@ DEFINE_EVENT(xchk_class, name, \
DEFINE_SCRUB_EVENT(xchk_start);
DEFINE_SCRUB_EVENT(xchk_done);
DEFINE_SCRUB_EVENT(xchk_deadlock_retry);
+DEFINE_SCRUB_EVENT(xchk_dirtree_start);
+DEFINE_SCRUB_EVENT(xchk_dirtree_done);
DEFINE_SCRUB_EVENT(xrep_attempt);
DEFINE_SCRUB_EVENT(xrep_done);
@@ -199,6 +209,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable);
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable);
+DECLARE_EVENT_CLASS(xchk_vector_head_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead),
+ TP_ARGS(ip, vhead),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(unsigned short, rest_us)
+ __field(unsigned short, nr_vecs)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = vhead->svh_agno;
+ __entry->inum = vhead->svh_ino;
+ __entry->gen = vhead->svh_gen;
+ __entry->flags = vhead->svh_flags;
+ __entry->rest_us = vhead->svh_rest_us;
+ __entry->nr_vecs = vhead->svh_nr;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->rest_us,
+ __entry->nr_vecs)
+)
+#define DEFINE_SCRUBV_HEAD_EVENT(name) \
+DEFINE_EVENT(xchk_vector_head_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \
+ TP_ARGS(ip, vhead))
+
+DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start);
+
+DECLARE_EVENT_CLASS(xchk_vector_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead,
+ unsigned int vec_nr, struct xfs_scrub_vec *v),
+ TP_ARGS(mp, vhead, vec_nr, v),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, vec_nr)
+ __field(unsigned int, vec_type)
+ __field(unsigned int, vec_flags)
+ __field(int, vec_ret)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->vec_nr = vec_nr;
+ __entry->vec_type = v->sv_type;
+ __entry->vec_flags = v->sv_flags;
+ __entry->vec_ret = v->sv_ret;
+ ),
+ TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->vec_nr,
+ __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS),
+ __print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS),
+ __entry->vec_ret)
+)
+#define DEFINE_SCRUBV_EVENT(name) \
+DEFINE_EVENT(xchk_vector_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \
+ unsigned int vec_nr, struct xfs_scrub_vec *v), \
+ TP_ARGS(mp, vhead, vec_nr, v))
+
+DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail);
+DEFINE_SCRUBV_EVENT(xchk_scrubv_item);
+DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome);
+
TRACE_EVENT(xchk_op_error,
TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int error, void *ret_ip),
@@ -364,6 +449,7 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error);
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_preen);
#ifdef CONFIG_XFS_QUOTA
DECLARE_EVENT_CLASS(xchk_dqiter_class,
@@ -475,7 +561,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -518,7 +604,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -558,7 +644,7 @@ TRACE_EVENT(xchk_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -598,7 +684,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -637,7 +723,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
@@ -947,6 +1033,7 @@ DEFINE_XFILE_EVENT(xfile_store);
DEFINE_XFILE_EVENT(xfile_seek_data);
DEFINE_XFILE_EVENT(xfile_get_folio);
DEFINE_XFILE_EVENT(xfile_put_folio);
+DEFINE_XFILE_EVENT(xfile_discard);
TRACE_EVENT(xfarray_create,
TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
@@ -1300,7 +1387,7 @@ TRACE_EVENT(xchk_iscan_iget_batch,
__entry->unavail)
);
-TRACE_EVENT(xchk_iscan_iget_retry_wait,
+DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class,
TP_PROTO(struct xchk_iscan *iscan),
TP_ARGS(iscan),
TP_STRUCT__entry(
@@ -1326,7 +1413,13 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait,
__entry->remaining,
__entry->iget_timeout,
__entry->retry_delay)
-);
+)
+#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait);
+DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait);
TRACE_EVENT(xchk_nlinks_collect_dirent,
TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
@@ -1354,6 +1447,33 @@ TRACE_EVENT(xchk_nlinks_collect_dirent,
__get_str(name))
);
+TRACE_EVENT(xchk_nlinks_collect_pptr,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(mp, dp, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp->i_ino;
+ __entry->ino = be64_to_cpu(pptr->p_ino);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
TRACE_EVENT(xchk_nlinks_collect_metafile,
TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino),
TP_ARGS(mp, ino),
@@ -1502,6 +1622,300 @@ DEFINE_EVENT(xchk_nlinks_diff_class, name, \
TP_ARGS(mp, ip, live))
DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode);
+DECLARE_EVENT_CLASS(xchk_pptr_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name,
+ xfs_ino_t far_ino),
+ TP_ARGS(ip, name, far_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ __field(xfs_ino_t, far_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ __entry->far_ino = far_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx name '%.*s' far_ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name),
+ __entry->far_ino)
+)
+#define DEFINE_XCHK_PPTR_EVENT(name) \
+DEFINE_EVENT(xchk_pptr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+ xfs_ino_t far_ino), \
+ TP_ARGS(ip, name, far_ino))
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_ultraslowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_ultraslowpath);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int path_nr, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(sc, ip, path_nr, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(unsigned int, child_gen)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->child_gen = VFS_I(ip)->i_generation;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d path %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->child_ino,
+ __entry->child_gen,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_XCHK_DIRTREE_EVENT(name) \
+DEFINE_EVENT(xchk_dirtree_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+ unsigned int path_nr, const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(sc, ip, path_nr, name, pptr))
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirtree_create_path);
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirpath_walk_upwards);
+
+DECLARE_EVENT_CLASS(xchk_dirpath_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int path_nr, unsigned int step_nr,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(sc, ip, path_nr, step_nr, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(unsigned int, step_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(unsigned int, child_gen)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->step_nr = step_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->child_gen = VFS_I(ip)->i_generation;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d path %u step %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->step_nr,
+ __entry->child_ino,
+ __entry->child_gen,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_XCHK_DIRPATH_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+ unsigned int path_nr, unsigned int step_nr, \
+ const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(sc, ip, path_nr, step_nr, name, pptr))
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_disappeared);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step);
+
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_CORRUPT);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_LOOP);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_STALE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_OK);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETED);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTED);
+
+#define XCHK_DIRPATH_OUTCOME_STRINGS \
+ { XCHK_DIRPATH_SCANNING, "scanning" }, \
+ { XCHK_DIRPATH_DELETE, "delete" }, \
+ { XCHK_DIRPATH_CORRUPT, "corrupt" }, \
+ { XCHK_DIRPATH_LOOP, "loop" }, \
+ { XCHK_DIRPATH_STALE, "stale" }, \
+ { XCHK_DIRPATH_OK, "ok" }, \
+ { XREP_DIRPATH_DELETING, "deleting" }, \
+ { XREP_DIRPATH_DELETED, "deleted" }, \
+ { XREP_DIRPATH_ADOPTING, "adopting" }, \
+ { XREP_DIRPATH_ADOPTED, "adopted" }
+
+DECLARE_EVENT_CLASS(xchk_dirpath_outcome_class,
+ TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr,
+ unsigned int nr_steps,
+ unsigned int outcome),
+ TP_ARGS(sc, path_nr, nr_steps, outcome),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, path_nr)
+ __field(unsigned int, nr_steps)
+ __field(unsigned int, outcome)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->nr_steps = nr_steps;
+ __entry->outcome = outcome;
+ ),
+ TP_printk("dev %d:%d path %llu steps %u outcome %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->nr_steps,
+ __print_symbolic(__entry->outcome, XCHK_DIRPATH_OUTCOME_STRINGS))
+);
+#define DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_outcome_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, \
+ unsigned int nr_steps, \
+ unsigned int outcome), \
+ TP_ARGS(sc, path_nr, nr_steps, outcome))
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_set_outcome);
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_evaluate_path);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_evaluate_class,
+ TP_PROTO(const struct xchk_dirtree *dl,
+ const struct xchk_dirtree_outcomes *oc),
+ TP_ARGS(dl, oc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, rootino)
+ __field(unsigned int, nr_paths)
+ __field(unsigned int, bad)
+ __field(unsigned int, suspect)
+ __field(unsigned int, good)
+ __field(bool, needs_adoption)
+ ),
+ TP_fast_assign(
+ __entry->dev = dl->sc->mp->m_super->s_dev;
+ __entry->ino = dl->sc->ip->i_ino;
+ __entry->rootino = dl->root_ino;
+ __entry->nr_paths = dl->nr_paths;
+ __entry->bad = oc->bad;
+ __entry->suspect = oc->suspect;
+ __entry->good = oc->good;
+ __entry->needs_adoption = oc->needs_adoption ? 1 : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx rootino 0x%llx nr_paths %u bad %u suspect %u good %u adopt? %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->rootino,
+ __entry->nr_paths,
+ __entry->bad,
+ __entry->suspect,
+ __entry->good,
+ __entry->needs_adoption)
+);
+#define DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(name) \
+DEFINE_EVENT(xchk_dirtree_evaluate_class, name, \
+ TP_PROTO(const struct xchk_dirtree *dl, \
+ const struct xchk_dirtree_outcomes *oc), \
+ TP_ARGS(dl, oc))
+DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xchk_dirtree_evaluate);
+
+TRACE_EVENT(xchk_dirpath_changed,
+ TP_PROTO(struct xfs_scrub *sc, unsigned int path_nr,
+ unsigned int step_nr, const struct xfs_inode *dp,
+ const struct xfs_inode *ip, const struct xfs_name *xname),
+ TP_ARGS(sc, path_nr, step_nr, dp, ip, xname),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(unsigned int, step_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, xname->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->step_nr = step_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->namelen = xname->len;
+ memcpy(__get_str(name), xname->name, xname->len);
+ ),
+ TP_printk("dev %d:%d path %u step %u child_ino 0x%llx parent_ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->step_nr,
+ __entry->child_ino,
+ __entry->parent_ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_dirtree_live_update,
+ TP_PROTO(struct xfs_scrub *sc, const struct xfs_inode *dp,
+ int action, const struct xfs_inode *ip, int delta,
+ const struct xfs_name *xname),
+ TP_ARGS(sc, dp, action, ip, delta, xname),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, parent_ino)
+ __field(int, action)
+ __field(xfs_ino_t, child_ino)
+ __field(int, delta)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, xname->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->parent_ino = dp->i_ino;
+ __entry->action = action;
+ __entry->child_ino = ip->i_ino;
+ __entry->delta = delta;
+ __entry->namelen = xname->len;
+ memcpy(__get_str(name), xname->name, xname->len);
+ ),
+ TP_printk("dev %d:%d parent_ino 0x%llx child_ino 0x%llx nlink_delta %d name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->parent_ino,
+ __entry->child_ino,
+ __entry->delta,
+ __entry->namelen,
+ __get_str(name))
+);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -1533,6 +1947,7 @@ DEFINE_EVENT(xrep_extent_class, name, \
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
@@ -1566,6 +1981,7 @@ DEFINE_EVENT(xrep_reap_find_class, name, \
bool crosslinked), \
TP_ARGS(pag, agbno, len, crosslinked))
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
+DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
DECLARE_EVENT_CLASS(xrep_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -2273,6 +2689,891 @@ TRACE_EVENT(xrep_rmap_live_update,
__entry->flags)
);
+TRACE_EVENT(xrep_tempfile_create,
+ TP_PROTO(struct xfs_scrub *sc),
+ TP_ARGS(sc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(xfs_ino_t, temp_inum)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->file ? XFS_I(file_inode(sc->file))->i_ino : 0;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = sc->sm->sm_agno;
+ __entry->inum = sc->sm->sm_ino;
+ __entry->gen = sc->sm->sm_gen;
+ __entry->flags = sc->sm->sm_flags;
+ __entry->temp_inum = sc->tempip->i_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->temp_inum)
+);
+
+DECLARE_EVENT_CLASS(xrep_tempfile_class,
+ TP_PROTO(struct xfs_scrub *sc, int whichfork,
+ struct xfs_bmbt_irec *irec),
+ TP_ARGS(sc, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_fsblock_t, pblk)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->tempip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk,
+ __entry->state)
+);
+#define DEFINE_XREP_TEMPFILE_EVENT(name) \
+DEFINE_EVENT(xrep_tempfile_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, int whichfork, \
+ struct xfs_bmbt_irec *irec), \
+ TP_ARGS(sc, whichfork, irec))
+DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc);
+DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin);
+
+TRACE_EVENT(xreap_ifork_extent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork,
+ const struct xfs_bmbt_irec *irec),
+ TP_ARGS(sc, ip, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, fileoff)
+ __field(xfs_filblks_t, len)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->fileoff = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->agno,
+ __entry->agbno,
+ __entry->fileoff,
+ __entry->len,
+ __entry->state)
+);
+
+TRACE_EVENT(xreap_bmapi_binval_scan,
+ TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec,
+ xfs_extlen_t scan_blocks),
+ TP_ARGS(sc, irec, scan_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_filblks_t, len)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, scan_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->len = irec->br_blockcount;
+ __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+ __entry->scan_blocks = scan_blocks;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->scan_blocks)
+);
+
+TRACE_EVENT(xrep_xattr_recover_leafblock,
+ TP_PROTO(struct xfs_inode *ip, xfs_dablk_t dabno, uint16_t magic),
+ TP_ARGS(ip, dabno, magic),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_dablk_t, dabno)
+ __field(uint16_t, magic)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->dabno = dabno;
+ __entry->magic = magic;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx dablk 0x%x magic 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->dabno,
+ __entry->magic)
+);
+
+DECLARE_EVENT_CLASS(xrep_xattr_salvage_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name,
+ unsigned int namelen, unsigned int valuelen),
+ TP_ARGS(ip, flags, name, namelen, valuelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, flags)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ __field(unsigned int, valuelen)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->flags = flags;
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ __entry->valuelen = valuelen;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags %s name '%.*s' valuelen 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->flags, "|", XFS_ATTR_NAMESPACE_STR),
+ __entry->namelen,
+ __get_str(name),
+ __entry->valuelen)
+);
+#define DEFINE_XREP_XATTR_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, \
+ unsigned int namelen, unsigned int valuelen), \
+ TP_ARGS(ip, flags, name, namelen, valuelen))
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_salvage_rec);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_insert_rec);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_stash_xattr);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_insert_xattr);
+
+DECLARE_EVENT_CLASS(xrep_pptr_salvage_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name,
+ unsigned int namelen, const void *value, unsigned int valuelen),
+ TP_ARGS(ip, flags, name, namelen, value, valuelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ ),
+ TP_fast_assign(
+ const struct xfs_parent_rec *rec = value;
+
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = be64_to_cpu(rec->p_ino);
+ __entry->parent_gen = be32_to_cpu(rec->p_gen);
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, \
+ unsigned int namelen, const void *value, unsigned int valuelen), \
+ TP_ARGS(ip, flags, name, namelen, value, valuelen))
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr);
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr);
+
+DECLARE_EVENT_CLASS(xrep_xattr_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip),
+ TP_ARGS(ip, arg_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, src_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->src_ino = arg_ip->i_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx src 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->src_ino)
+)
+#define DEFINE_XREP_XATTR_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), \
+ TP_ARGS(ip, arg_ip))
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_full_reset);
+
+DECLARE_EVENT_CLASS(xrep_xattr_pptr_scan_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+ const struct xfs_name *name),
+ TP_ARGS(ip, dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->parent_gen = VFS_IC(dp)->i_generation;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_pptr_scan_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+ const struct xfs_name *name), \
+ TP_ARGS(ip, dp, name))
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentadd);
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentremove);
+
+TRACE_EVENT(xrep_dir_recover_dirblock,
+ TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic,
+ uint32_t magic_guess),
+ TP_ARGS(dp, dabno, magic, magic_guess),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_dablk_t, dabno)
+ __field(uint32_t, magic)
+ __field(uint32_t, magic_guess)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->dabno = dabno;
+ __entry->magic = magic;
+ __entry->magic_guess = magic_guess;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->dabno,
+ __entry->magic,
+ __entry->magic_guess)
+);
+
+DECLARE_EVENT_CLASS(xrep_dir_class,
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino),
+ TP_ARGS(dp, parent_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, parent_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->parent_ino = parent_ino;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->parent_ino)
+)
+#define DEFINE_XREP_DIR_EVENT(name) \
+DEFINE_EVENT(xrep_dir_class, name, \
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \
+ TP_ARGS(dp, parent_ino))
+DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree);
+DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork);
+DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot);
+
+DECLARE_EVENT_CLASS(xrep_dirent_class,
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name,
+ xfs_ino_t ino),
+ TP_ARGS(dp, name, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ __field(xfs_ino_t, ino)
+ __field(uint8_t, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ __entry->ino = ino;
+ __entry->ftype = name->type;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->namelen,
+ __get_str(name),
+ __entry->ino)
+)
+#define DEFINE_XREP_DIRENT_EVENT(name) \
+DEFINE_EVENT(xrep_dirent_class, name, \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \
+ xfs_ino_t ino), \
+ TP_ARGS(dp, name, ino))
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_adoption_reparent);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_removename);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_removename);
+
+DECLARE_EVENT_CLASS(xrep_adoption_class,
+ TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved),
+ TP_ARGS(dp, ip, moved),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, child_ino)
+ __field(bool, moved)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->child_ino = ip->i_ino;
+ __entry->moved = moved;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx child 0x%llx moved? %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->child_ino,
+ __entry->moved)
+);
+#define DEFINE_XREP_ADOPTION_EVENT(name) \
+DEFINE_EVENT(xrep_adoption_class, name, \
+ TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), \
+ TP_ARGS(dp, ip, moved))
+DEFINE_XREP_ADOPTION_EVENT(xrep_adoption_trans_roll);
+
+DECLARE_EVENT_CLASS(xrep_parent_salvage_class,
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino),
+ TP_ARGS(dp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->ino)
+)
+#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_parent_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \
+ TP_ARGS(dp, ino))
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache);
+
+DECLARE_EVENT_CLASS(xrep_pptr_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(ip, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(ip, name, pptr))
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentremove);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentremove);
+
+DECLARE_EVENT_CLASS(xrep_pptr_scan_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+ const struct xfs_name *name),
+ TP_ARGS(ip, dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->parent_gen = VFS_IC(dp)->i_generation;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_scan_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+ const struct xfs_name *name), \
+ TP_ARGS(ip, dp, name))
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentadd);
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentremove);
+
+TRACE_EVENT(xrep_nlinks_set_record,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *obs),
+ TP_ARGS(mp, ino, obs),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = obs->parents;
+ __entry->backrefs = obs->backrefs;
+ __entry->children = obs->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xrep_dentry_class,
+ TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry),
+ TP_ARGS(mp, dentry),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ __field(unsigned long, ino)
+ __field(bool, positive)
+ __field(unsigned long, parent_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, dentry->d_name.len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = dentry->d_flags;
+ __entry->positive = d_is_positive(dentry);
+ if (dentry->d_parent && d_inode(dentry->d_parent))
+ __entry->parent_ino = d_inode(dentry->d_parent)->i_ino;
+ else
+ __entry->parent_ino = -1UL;
+ __entry->ino = d_inode(dentry) ? d_inode(dentry)->i_ino : 0;
+ __entry->namelen = dentry->d_name.len;
+ memcpy(__get_str(name), dentry->d_name.name, dentry->d_name.len);
+ ),
+ TP_printk("dev %d:%d flags 0x%x positive? %d parent_ino 0x%lx ino 0x%lx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags,
+ __entry->positive,
+ __entry->parent_ino,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_REPAIR_DENTRY_EVENT(name) \
+DEFINE_EVENT(xrep_dentry_class, name, \
+ TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), \
+ TP_ARGS(mp, dentry))
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_check_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_invalidate_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_dirtree_delete_child);
+
+TRACE_EVENT(xrep_symlink_salvage_target,
+ TP_PROTO(struct xfs_inode *ip, char *target, unsigned int targetlen),
+ TP_ARGS(ip, target, targetlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, targetlen)
+ __dynamic_array(char, target, targetlen + 1)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->targetlen = targetlen;
+ memcpy(__get_str(target), target, targetlen);
+ __get_str(target)[targetlen] = 0;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx target '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->targetlen,
+ __get_str(target))
+);
+
+DECLARE_EVENT_CLASS(xrep_symlink_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+);
+
+#define DEFINE_XREP_SYMLINK_EVENT(name) \
+DEFINE_EVENT(xrep_symlink_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild);
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork);
+
+TRACE_EVENT(xrep_iunlink_visit,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t bucket_agino, struct xfs_inode *ip),
+ TP_ARGS(pag, bucket, bucket_agino, ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, bucket_agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino);
+ __entry->bucket = bucket;
+ __entry->bucket_agino = bucket_agino;
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x bucket_agino 0x%x prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->agino,
+ __entry->bucket_agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_reload_next,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino),
+ TP_ARGS(ip, prev_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_prev_agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ __field(unsigned int, nlink)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->old_prev_agino = ip->i_prev_unlinked;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = ip->i_next_unlinked;
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u old_prev_agino %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->nlink,
+ __entry->old_prev_agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_reload_ondisk,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, nlink)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->nlink,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t prev_agino, xfs_agino_t next_agino),
+ TP_ARGS(pag, bucket, prev_agino, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t prev_agino, xfs_agino_t next_agino),
+ TP_ARGS(pag, bucket, prev_agino, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \
+DEFINE_EVENT(xrep_iunlink_resolve_class, name, \
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \
+ xfs_agino_t prev_agino, xfs_agino_t next_agino), \
+ TP_ARGS(pag, bucket, prev_agino, next_agino))
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_wronglist);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_nolist);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_ok);
+
+TRACE_EVENT(xrep_iunlink_relink_next,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t next_agino),
+ TP_ARGS(ip, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, next_agino)
+ __field(xfs_agino_t, new_next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->next_agino = ip->i_next_unlinked;
+ __entry->new_next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->next_agino,
+ __entry->new_next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_relink_prev,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino),
+ TP_ARGS(ip, prev_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, new_prev_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->new_prev_agino = prev_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x prev_agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->prev_agino,
+ __entry->new_prev_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_add_to_bucket,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t agino, xfs_agino_t curr_head),
+ TP_ARGS(pag, bucket, agino, curr_head),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->agino = agino;
+ __entry->next_agino = curr_head;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_commit_bucket,
+ TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t old_agino, xfs_agino_t agino),
+ TP_ARGS(pag, bucket, old_agino, agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_agino)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->bucket = bucket;
+ __entry->old_agino = old_agino;
+ __entry->agino = agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_agino,
+ __entry->agino)
+);
+
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xrep_dirpath_set_outcome);
+DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path);
+DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption);
+DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
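A minimal sketch of the bucket math used by several of the iunlink events above, which print the bucket as the inode number modulo the AGI hash size rather than logging it separately; the helper name is hypothetical and only XFS_AGI_UNLINKED_BUCKETS (64, from xfs_format.h) is assumed:

/* Hypothetical helper: recover the AGI bucket an agino hashes into. */
static inline unsigned int
example_iunlink_bucket(
	xfs_agino_t	agino)
{
	return agino % XFS_AGI_UNLINKED_BUCKETS;
}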
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index 17c982a4821d..9185ae7088d4 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -7,9 +7,9 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
-#include "scrub/scrub.h"
#include "scrub/trace.h"
/*
@@ -486,6 +486,9 @@ xfarray_sortinfo_alloc(
xfarray_sortinfo_lo(si)[0] = 0;
xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+ si->relax = INIT_XCHK_RELAX;
+ if (flags & XFARRAY_SORT_KILLABLE)
+ si->relax.interruptible = false;
trace_xfarray_sort(si, nr_bytes);
*infop = si;
@@ -503,10 +506,7 @@ xfarray_sort_terminated(
* few seconds so that we don't run afoul of the soft lockup watchdog
* or RCU stall detector.
*/
- cond_resched();
-
- if ((si->flags & XFARRAY_SORT_KILLABLE) &&
- fatal_signal_pending(current)) {
+ if (xchk_maybe_relax(&si->relax)) {
if (*error == 0)
*error = -EINTR;
return true;
@@ -1051,3 +1051,20 @@ out_free:
kvfree(si);
return error;
}
+
+/* How many bytes is this array consuming? */
+unsigned long long
+xfarray_bytes(
+ struct xfarray *array)
+{
+ return xfile_bytes(array->xfile);
+}
+
+/* Empty the entire array. */
+void
+xfarray_truncate(
+ struct xfarray *array)
+{
+ xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE);
+ array->nr = 0;
+}
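The two helpers just added expose an xfarray's backing-store footprint and a way to reset it. A hedged sketch of how a caller might combine them to cap staged memory; the function name and the -EAGAIN restart convention are illustrative, not part of this patch:

/*
 * Hypothetical sketch: drop staged records once the backing xfile
 * outgrows a caller-chosen byte budget, telling the caller to restart.
 */
static int
xrep_example_check_budget(
	struct xfarray		*array,
	unsigned long long	budget)
{
	if (xfarray_bytes(array) <= budget)
		return 0;

	xfarray_truncate(array);
	return -EAGAIN;
}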
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index acb2f94c56c1..5eeeeed13ae2 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -8,6 +8,7 @@
/* xfile array index type, along with cursor initialization */
typedef uint64_t xfarray_idx_t;
+#define XFARRAY_NULLIDX ((__force xfarray_idx_t)-1ULL)
#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0)
/* Iterate each index of an xfile array. */
@@ -44,6 +45,8 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx);
int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+void xfarray_truncate(struct xfarray *array);
+unsigned long long xfarray_bytes(struct xfarray *array);
/*
* Load an array element, but zero the buffer if there's no data because we
@@ -124,6 +127,9 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
+ /* next time we want to cond_resched() */
+ struct xchk_relax relax;
+
/* Cache a folio here for faster scanning for pivots */
struct folio *folio;
diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c
new file mode 100644
index 000000000000..6ef2a9637f16
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/scrub.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+
+/*
+ * XFS Blob Storage
+ * ================
+ * Stores and retrieves blobs using an xfile. Objects are appended to the file
+ * and the offset is returned as a magic cookie for retrieval.
+ */
+
+#define XB_KEY_MAGIC 0xABAADDAD
+struct xb_key {
+ uint32_t xb_magic; /* XB_KEY_MAGIC */
+ uint32_t xb_size; /* size of the blob, in bytes */
+ loff_t xb_offset; /* byte offset of this key */
+ /* blob comes after here */
+} __packed;
+
+/* Initialize a blob storage object. */
+int
+xfblob_create(
+ const char *description,
+ struct xfblob **blobp)
+{
+ struct xfblob *blob;
+ struct xfile *xfile;
+ int error;
+
+ error = xfile_create(description, 0, &xfile);
+ if (error)
+ return error;
+
+ blob = kmalloc(sizeof(struct xfblob), XCHK_GFP_FLAGS);
+ if (!blob) {
+ error = -ENOMEM;
+ goto out_xfile;
+ }
+
+ blob->xfile = xfile;
+ blob->last_offset = PAGE_SIZE;
+
+ *blobp = blob;
+ return 0;
+
+out_xfile:
+ xfile_destroy(xfile);
+ return error;
+}
+
+/* Destroy a blob storage object. */
+void
+xfblob_destroy(
+ struct xfblob *blob)
+{
+ xfile_destroy(blob->xfile);
+ kfree(blob);
+}
+
+/* Retrieve a blob. */
+int
+xfblob_load(
+ struct xfblob *blob,
+ xfblob_cookie cookie,
+ void *ptr,
+ uint32_t size)
+{
+ struct xb_key key;
+ int error;
+
+ error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+ if (error)
+ return error;
+
+ if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+ if (size < key.xb_size) {
+ ASSERT(0);
+ return -EFBIG;
+ }
+
+ return xfile_load(blob->xfile, ptr, key.xb_size,
+ cookie + sizeof(key));
+}
+
+/* Store a blob. */
+int
+xfblob_store(
+ struct xfblob *blob,
+ xfblob_cookie *cookie,
+ const void *ptr,
+ uint32_t size)
+{
+ struct xb_key key = {
+ .xb_offset = blob->last_offset,
+ .xb_magic = XB_KEY_MAGIC,
+ .xb_size = size,
+ };
+ loff_t pos = blob->last_offset;
+ int error;
+
+ error = xfile_store(blob->xfile, &key, sizeof(key), pos);
+ if (error)
+ return error;
+
+ pos += sizeof(key);
+ error = xfile_store(blob->xfile, ptr, size, pos);
+ if (error)
+ goto out_err;
+
+ *cookie = blob->last_offset;
+ blob->last_offset += sizeof(key) + size;
+ return 0;
+out_err:
+ xfile_discard(blob->xfile, blob->last_offset, sizeof(key));
+ return error;
+}
+
+/* Free a blob. */
+int
+xfblob_free(
+ struct xfblob *blob,
+ xfblob_cookie cookie)
+{
+ struct xb_key key;
+ int error;
+
+ error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+ if (error)
+ return error;
+
+ if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+
+ xfile_discard(blob->xfile, cookie, sizeof(key) + key.xb_size);
+ return 0;
+}
+
+/* How many bytes is this blob storage object consuming? */
+unsigned long long
+xfblob_bytes(
+ struct xfblob *blob)
+{
+ return xfile_bytes(blob->xfile);
+}
+
+/* Drop all the blobs. */
+void
+xfblob_truncate(
+ struct xfblob *blob)
+{
+ xfile_discard(blob->xfile, PAGE_SIZE, MAX_LFS_FILESIZE - PAGE_SIZE);
+ blob->last_offset = PAGE_SIZE;
+}
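The store/load pair above implements the cookie protocol from the header comment: xfblob_store() hands back the record's byte offset, and xfblob_load() refuses any cookie whose key magic or offset does not match. A minimal round-trip sketch, assuming xfblob_create() can succeed in the caller's context; the description string, payload, and function name are illustrative:

/* Hypothetical round trip through the blob store. */
static int
xfblob_example_roundtrip(void)
{
	struct xfblob	*blob;
	xfblob_cookie	cookie;
	char		buf[16];
	int		error;

	error = xfblob_create("example blobs", &blob);
	if (error)
		return error;

	/* Append five bytes; the cookie is their file offset. */
	error = xfblob_store(blob, &cookie, "hello", 5);
	if (error)
		goto out;

	/* buf must be at least xb_size bytes or load returns -EFBIG. */
	error = xfblob_load(blob, cookie, buf, sizeof(buf));
out:
	xfblob_destroy(blob);
	return error;
}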
diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h
new file mode 100644
index 000000000000..ae78322613ca
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFBLOB_H__
+#define __XFS_SCRUB_XFBLOB_H__
+
+struct xfblob {
+ struct xfile *xfile;
+ loff_t last_offset;
+};
+
+typedef loff_t xfblob_cookie;
+
+int xfblob_create(const char *descr, struct xfblob **blobp);
+void xfblob_destroy(struct xfblob *blob);
+int xfblob_load(struct xfblob *blob, xfblob_cookie cookie, void *ptr,
+ uint32_t size);
+int xfblob_store(struct xfblob *blob, xfblob_cookie *cookie, const void *ptr,
+ uint32_t size);
+int xfblob_free(struct xfblob *blob, xfblob_cookie cookie);
+unsigned long long xfblob_bytes(struct xfblob *blob);
+void xfblob_truncate(struct xfblob *blob);
+
+static inline int
+xfblob_storename(
+ struct xfblob *blob,
+ xfblob_cookie *cookie,
+ const struct xfs_name *xname)
+{
+ return xfblob_store(blob, cookie, xname->name, xname->len);
+}
+
+static inline int
+xfblob_loadname(
+ struct xfblob *blob,
+ xfblob_cookie cookie,
+ struct xfs_name *xname,
+ uint32_t size)
+{
+ int ret = xfblob_load(blob, cookie, (void *)xname->name, size);
+ if (ret)
+ return ret;
+
+ xname->len = size;
+ return 0;
+}
+
+#endif /* __XFS_SCRUB_XFBLOB_H__ */
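xfblob_loadname() casts away the const on xname->name because the caller owns that buffer and is asking for it to be filled in. A sketch of the convention, assuming the usual XFS MAXNAMELEN bound; the function name is hypothetical:

/* Hypothetical example: stash a name, then read it back. */
static int
xfblob_example_name(
	struct xfblob		*blob,
	const struct xfs_name	*name)
{
	unsigned char		buf[MAXNAMELEN];
	struct xfs_name		xname = { .name = buf };
	xfblob_cookie		cookie;
	int			error;

	error = xfblob_storename(blob, &cookie, name);
	if (error)
		return error;

	/* Fills buf through the const pointer and sets xname.len. */
	return xfblob_loadname(blob, cookie, &xname, name->len);
}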
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 8cdd863db585..d848222f802b 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -10,9 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
-#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>
@@ -310,3 +310,15 @@ xfile_put_folio(
folio_unlock(folio);
folio_put(folio);
}
+
+/* Discard the page cache that's backing a range of the xfile. */
+void
+xfile_discard(
+ struct xfile *xf,
+ loff_t pos,
+ u64 count)
+{
+ trace_xfile_discard(xf, pos, count);
+
+ shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1);
+}
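Because the backing store is shmem, xfile_discard() punches the pages out of the page cache rather than zeroing them in place; later loads of the range read back zeroes and xfile_bytes() shrinks accordingly. A one-line sketch mirroring how xfarray_truncate() uses it; the wrapper name is hypothetical:

/* Hypothetical wrapper: release every page backing the xfile. */
static void
xfile_example_reset(
	struct xfile	*xf)
{
	xfile_discard(xf, 0, MAX_LFS_FILESIZE);
}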
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index 76d78dba7e34..cc2cc1714cd4 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -17,6 +17,7 @@ int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
int xfile_store(struct xfile *xf, const void *buf, size_t count,
loff_t pos);
+void xfile_discard(struct xfile *xf, loff_t pos, u64 count);
loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER)
@@ -26,4 +27,9 @@ struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
unsigned int flags);
void xfile_put_folio(struct xfile *xf, struct folio *folio);
+static inline unsigned long long xfile_bytes(struct xfile *xf)
+{
+ return file_inode(xf->file)->i_blocks << SECTOR_SHIFT;
+}
+
#endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index a39befa743ce..f17173b83e6f 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -7,9 +7,11 @@
#define __XFS_SCRUB_H__
#ifndef CONFIG_XFS_ONLINE_SCRUB
-# define xfs_scrub_metadata(file, sm) (-ENOTTY)
+# define xfs_ioc_scrub_metadata(f, a) (-ENOTTY)
+# define xfs_ioc_scrubv_metadata(f, a) (-ENOTTY)
#else
-int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
+int xfs_ioc_scrub_metadata(struct file *file, void __user *arg);
+int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg);
#endif /* CONFIG_XFS_ONLINE_SCRUB */
#endif /* __XFS_SCRUB_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4bf69c9c088e..c7c3dcfa2718 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -201,16 +201,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (!args.value)
return -ENOMEM;
xfs_acl_to_disk(args.value, acl);
+ error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT);
+ kvfree(args.value);
+ } else {
+ error = xfs_attr_change(&args, XFS_ATTRUPDATE_REMOVE);
+ /*
+ * If the attribute didn't exist to start with, that's fine.
+ */
+ if (error == -ENOATTR)
+ error = 0;
}
- error = xfs_attr_change(&args);
- kvfree(args.value);
-
- /*
- * If the attribute didn't exist to start with that's fine.
- */
- if (!acl && error == -ENOATTR)
- error = 0;
if (!error)
set_cached_acl(inode, type, acl);
return error;
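The restructured __xfs_set_acl() now picks the xattr update mode up front instead of inferring removal from a NULL acl afterwards. A condensed sketch of the two paths, assuming an already-populated xfs_da_args; the helper name is hypothetical:

/* Hypothetical condensation of the set/clear paths above. */
static int
example_acl_change(
	struct xfs_da_args	*args,
	bool			set)
{
	int			error;

	if (set)
		return xfs_attr_change(args, XFS_ATTRUPDATE_UPSERT);

	error = xfs_attr_change(args, XFS_ATTRUPDATE_REMOVE);
	return error == -ENOATTR ? 0 : error;
}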
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3f428620ebf2..6dead20338e2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -233,45 +233,6 @@ xfs_imap_valid(
return true;
}
-/*
- * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->iomap.
- *
- * The current page is held locked so nothing could have removed the block
- * backing offset_fsb, although it could have moved from the COW to the data
- * fork by another thread.
- */
-static int
-xfs_convert_blocks(
- struct iomap_writepage_ctx *wpc,
- struct xfs_inode *ip,
- int whichfork,
- loff_t offset)
-{
- int error;
- unsigned *seq;
-
- if (whichfork == XFS_COW_FORK)
- seq = &XFS_WPC(wpc)->cow_seq;
- else
- seq = &XFS_WPC(wpc)->data_seq;
-
- /*
- * Attempt to allocate whatever delalloc extent currently backs offset
- * and put the result into wpc->iomap. Allocate in a loop because it
- * may take several attempts to allocate real blocks for a contiguous
- * delalloc extent if free space is sufficiently fragmented.
- */
- do {
- error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
- &wpc->iomap, seq);
- if (error)
- return error;
- } while (wpc->iomap.offset + wpc->iomap.length <= offset);
-
- return 0;
-}
-
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
@@ -290,6 +251,7 @@ xfs_map_blocks(
struct xfs_iext_cursor icur;
int retries = 0;
int error = 0;
+ unsigned int *seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -387,7 +349,19 @@ retry:
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+ /*
+ * Convert a delalloc extent to a real one. The current page is held
+ * locked so nothing could have removed the block backing offset_fsb,
+ * although it could have moved from the COW to the data fork by another
+ * thread.
+ */
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
+
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
@@ -469,7 +443,6 @@ xfs_discard_folio(
{
struct xfs_inode *ip = XFS_I(folio->mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error;
if (xfs_is_shutdown(mp))
return;
@@ -483,11 +456,8 @@ xfs_discard_folio(
* byte of the next folio. Hence the end offset is only dependent on the
* folio itself and not the start offset that is passed in.
*/
- error = xfs_bmap_punch_delalloc_range(ip, pos,
+ xfs_bmap_punch_delalloc_range(ip, pos,
folio_pos(folio) + folio_size(folio));
-
- if (error && !xfs_is_shutdown(mp))
- xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 9b4c61e1c22e..2b10ac4c5fce 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -27,6 +27,7 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_parent.h"
struct kmem_cache *xfs_attri_cache;
struct kmem_cache *xfs_attrd_cache;
@@ -73,8 +74,12 @@ static inline struct xfs_attri_log_nameval *
xfs_attri_log_nameval_alloc(
const void *name,
unsigned int name_len,
+ const void *new_name,
+ unsigned int new_name_len,
const void *value,
- unsigned int value_len)
+ unsigned int value_len,
+ const void *new_value,
+ unsigned int new_value_len)
{
struct xfs_attri_log_nameval *nv;
@@ -83,15 +88,26 @@ xfs_attri_log_nameval_alloc(
* this. But kvmalloc() utterly sucks, so we use our own version.
*/
nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
- name_len + value_len);
+ name_len + new_name_len + value_len +
+ new_value_len);
nv->name.i_addr = nv + 1;
nv->name.i_len = name_len;
nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
memcpy(nv->name.i_addr, name, name_len);
+ if (new_name_len) {
+ nv->new_name.i_addr = nv->name.i_addr + name_len;
+ nv->new_name.i_len = new_name_len;
+ memcpy(nv->new_name.i_addr, new_name, new_name_len);
+ } else {
+ nv->new_name.i_addr = NULL;
+ nv->new_name.i_len = 0;
+ }
+ nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME;
+
if (value_len) {
- nv->value.i_addr = nv->name.i_addr + name_len;
+ nv->value.i_addr = nv->name.i_addr + name_len + new_name_len;
nv->value.i_len = value_len;
memcpy(nv->value.i_addr, value, value_len);
} else {
@@ -100,6 +116,17 @@ xfs_attri_log_nameval_alloc(
}
nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
+ if (new_value_len) {
+ nv->new_value.i_addr = nv->name.i_addr + name_len +
+ new_name_len + value_len;
+ nv->new_value.i_len = new_value_len;
+ memcpy(nv->new_value.i_addr, new_value, new_value_len);
+ } else {
+ nv->new_value.i_addr = NULL;
+ nv->new_value.i_len = 0;
+ }
+ nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE;
+
refcount_set(&nv->refcount, 1);
return nv;
}
@@ -145,11 +172,20 @@ xfs_attri_item_size(
*nbytes += sizeof(struct xfs_attri_log_format) +
xlog_calc_iovec_len(nv->name.i_len);
- if (!nv->value.i_len)
- return;
+ if (nv->new_name.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->new_name.i_len);
+ }
- *nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ if (nv->value.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ }
+
+ if (nv->new_value.i_len) {
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->new_value.i_len);
+ }
}
/*
@@ -179,15 +215,28 @@ xfs_attri_item_format(
ASSERT(nv->name.i_len > 0);
attrip->attri_format.alfi_size++;
+ if (nv->new_name.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
if (nv->value.i_len > 0)
attrip->attri_format.alfi_size++;
+ if (nv->new_value.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
&attrip->attri_format,
sizeof(struct xfs_attri_log_format));
xlog_copy_from_iovec(lv, &vecp, &nv->name);
+
+ if (nv->new_name.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->new_name);
+
if (nv->value.i_len > 0)
xlog_copy_from_iovec(lv, &vecp, &nv->value);
+
+ if (nv->new_value.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->new_value);
}
/*
@@ -308,6 +357,12 @@ xfs_attrd_item_intent(
return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
}
+static inline unsigned int
+xfs_attr_log_item_op(const struct xfs_attri_log_format *attrp)
+{
+ return attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+}
+
/* Log an attr to the intent item. */
STATIC void
xfs_attr_log_item(
@@ -316,6 +371,8 @@ xfs_attr_log_item(
const struct xfs_attr_intent *attr)
{
struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attr->xattri_nameval;
+ struct xfs_da_args *args = attr->xattri_da_args;
/*
* At this point the xfs_attr_intent has been constructed, and we've
@@ -323,13 +380,30 @@ xfs_attr_log_item(
* structure with fields from this xfs_attr_intent
*/
attrp = &attrip->attri_format;
- attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ attrp->alfi_ino = args->dp->i_ino;
ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
attrp->alfi_op_flags = attr->xattri_op_flags;
- attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
- attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
- ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
- attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
+ attrp->alfi_value_len = nv->value.i_len;
+
+ switch (xfs_attr_log_item_op(attrp)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ ASSERT(nv->value.i_len == nv->new_value.i_len);
+
+ attrp->alfi_igen = VFS_I(args->dp)->i_generation;
+ attrp->alfi_old_name_len = nv->name.i_len;
+ attrp->alfi_new_name_len = nv->new_name.i_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ attrp->alfi_igen = VFS_I(args->dp)->i_generation;
+ fallthrough;
+ default:
+ attrp->alfi_name_len = nv->name.i_len;
+ break;
+ }
+
+ ASSERT(!(args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
+ attrp->alfi_attr_filter = args->attr_filter;
}
/* Get an ATTRI. */
@@ -368,8 +442,11 @@ xfs_attr_create_intent(
* Transfer our reference to the name/value buffer to the
* deferred work state structure.
*/
- attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
- args->namelen, args->value, args->valuelen);
+ attr->xattri_nameval = xfs_attri_log_nameval_alloc(
+ args->name, args->namelen,
+ args->new_name, args->new_namelen,
+ args->value, args->valuelen,
+ args->new_value, args->new_valuelen);
}
attrip = xfs_attri_init(mp, attr->xattri_nameval);
@@ -460,17 +537,19 @@ xfs_attri_item_match(
return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
}
+static inline bool
+xfs_attri_validate_namelen(unsigned int namelen)
+{
+ return namelen > 0 && namelen <= XATTR_NAME_MAX;
+}
+
/* Is this recovered ATTRI format ok? */
static inline bool
xfs_attri_validate(
struct xfs_mount *mp,
struct xfs_attri_log_format *attrp)
{
- unsigned int op = attrp->alfi_op_flags &
- XFS_ATTRI_OP_FLAGS_TYPE_MASK;
-
- if (attrp->__pad != 0)
- return false;
+ unsigned int op = xfs_attr_log_item_op(attrp);
if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)
return false;
@@ -478,24 +557,75 @@ xfs_attri_validate(
if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
return false;
- /* alfi_op_flags should be either a set or remove */
+ if (!xfs_attr_check_namespace(attrp->alfi_attr_filter &
+ XFS_ATTR_NSP_ONDISK_MASK))
+ return false;
+
switch (op) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ if (!xfs_has_parent(mp))
+ return false;
+ if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec))
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_name_len))
+ return false;
+ if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT))
+ return false;
+ break;
case XFS_ATTRI_OP_FLAGS_SET:
case XFS_ATTRI_OP_FLAGS_REPLACE:
+ if (!xfs_is_using_logged_xattrs(mp))
+ return false;
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_name_len))
+ return false;
+ break;
case XFS_ATTRI_OP_FLAGS_REMOVE:
+ if (!xfs_is_using_logged_xattrs(mp))
+ return false;
+ if (attrp->alfi_value_len != 0)
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_name_len))
+ return false;
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ if (!xfs_has_parent(mp))
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_old_name_len))
+ return false;
+ if (!xfs_attri_validate_namelen(attrp->alfi_new_name_len))
+ return false;
+ if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec))
+ return false;
+ if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT))
+ return false;
break;
default:
return false;
}
- if (attrp->alfi_value_len > XATTR_SIZE_MAX)
- return false;
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
- if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
- (attrp->alfi_name_len == 0))
- return false;
+static int
+xfs_attri_iread_extents(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
- return xfs_verify_ino(mp, attrp->alfi_ino);
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp);
+
+ return error;
}
static inline struct xfs_attr_intent *
@@ -508,20 +638,46 @@ xfs_attri_recover_work(
{
struct xfs_attr_intent *attr;
struct xfs_da_args *args;
+ struct xfs_inode *ip;
int local;
int error;
- error = xlog_recover_iget(mp, attrp->alfi_ino, ipp);
- if (error)
- return ERR_PTR(error);
+ /*
+ * Parent pointer attr items record the generation but regular logged
+ * xattrs do not; select the right iget function.
+ */
+ switch (xfs_attr_log_item_op(attrp)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ error = xlog_recover_iget_handle(mp, attrp->alfi_ino,
+ attrp->alfi_igen, &ip);
+ break;
+ default:
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ break;
+ }
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attrp,
+ sizeof(*attrp));
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ if (xfs_inode_has_attr_fork(ip)) {
+ error = xfs_attri_iread_extents(ip);
+ if (error) {
+ xfs_irele(ip);
+ return ERR_PTR(error);
+ }
+ }
attr = kzalloc(sizeof(struct xfs_attr_intent) +
sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL);
args = (struct xfs_da_args *)(attr + 1);
attr->xattri_da_args = args;
- attr->xattri_op_flags = attrp->alfi_op_flags &
- XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+ attr->xattri_op_flags = xfs_attr_log_item_op(attrp);
/*
* We're reconstructing the deferred work state structure from the
@@ -531,35 +687,42 @@ xfs_attri_recover_work(
attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
ASSERT(attr->xattri_nameval);
- args->dp = *ipp;
+ args->dp = ip;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
args->name = nv->name.i_addr;
args->namelen = nv->name.i_len;
- args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->new_name = nv->new_name.i_addr;
+ args->new_namelen = nv->new_name.i_len;
+ args->value = nv->value.i_addr;
+ args->valuelen = nv->value.i_len;
+ args->new_value = nv->new_value.i_addr;
+ args->new_valuelen = nv->new_value.i_len;
args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
XFS_DA_OP_LOGGED;
+ args->owner = args->dp->i_ino;
+ xfs_attr_sethash(args);
- ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
-
- switch (attr->xattri_op_flags) {
+ switch (xfs_attr_intent_op(attr)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
case XFS_ATTRI_OP_FLAGS_SET:
case XFS_ATTRI_OP_FLAGS_REPLACE:
- args->value = nv->value.i_addr;
- args->valuelen = nv->value.i_len;
args->total = xfs_attr_calc_size(args, &local);
if (xfs_inode_hasattr(args->dp))
attr->xattri_dela_state = xfs_attr_init_replace_state(args);
else
attr->xattri_dela_state = xfs_attr_init_add_state(args);
break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
case XFS_ATTRI_OP_FLAGS_REMOVE:
attr->xattri_dela_state = xfs_attr_init_remove_state(args);
break;
}
xfs_defer_add_item(dfp, &attr->xattri_list);
+ *ipp = ip;
return attr;
}
@@ -591,7 +754,8 @@ xfs_attr_recover_work(
*/
attrp = &attrip->attri_format;
if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr,
+ nv->name.i_len))
return -EFSCORRUPTED;
attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
@@ -614,16 +778,17 @@ xfs_attr_recover_work(
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&attrip->attri_format,
sizeof(attrip->attri_format));
- if (error) {
- xfs_trans_cancel(tp);
- goto out_unlock;
- }
+ if (error)
+ goto out_cancel;
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
return error;
+out_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
}
/* Re-log an intent item to push the log tail forward. */
@@ -649,9 +814,20 @@ xfs_attr_relog_intent(
new_attrp = &new_attrip->attri_format;
new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_igen = old_attrp->alfi_igen;
new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
new_attrp->alfi_value_len = old_attrp->alfi_value_len;
- new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+
+ switch (xfs_attr_log_item_op(old_attrp)) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ new_attrp->alfi_new_name_len = old_attrp->alfi_new_name_len;
+ new_attrp->alfi_old_name_len = old_attrp->alfi_old_name_len;
+ break;
+ default:
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ break;
+ }
+
new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
return &new_attrip->attri_item;
@@ -679,6 +855,75 @@ xfs_attr_create_done(
return &attrdp->attrd_item;
}
+void
+xfs_attr_defer_add(
+ struct xfs_da_args *args,
+ enum xfs_attr_defer_op op)
+{
+ struct xfs_attr_intent *new;
+ unsigned int log_op = 0;
+ bool is_pptr = args->attr_filter & XFS_ATTR_PARENT;
+
+ if (is_pptr) {
+ ASSERT(xfs_has_parent(args->dp->i_mount));
+ ASSERT((args->attr_filter & ~XFS_ATTR_PARENT) == 0);
+ ASSERT(args->op_flags & XFS_DA_OP_LOGGED);
+ ASSERT(args->valuelen == sizeof(struct xfs_parent_rec));
+ }
+
+ new = kmem_cache_zalloc(xfs_attr_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ new->xattri_da_args = args;
+
+ /* Compute log operation from the higher level op and namespace. */
+ switch (op) {
+ case XFS_ATTR_DEFER_SET:
+ if (is_pptr)
+ log_op = XFS_ATTRI_OP_FLAGS_PPTR_SET;
+ else
+ log_op = XFS_ATTRI_OP_FLAGS_SET;
+ break;
+ case XFS_ATTR_DEFER_REPLACE:
+ if (is_pptr)
+ log_op = XFS_ATTRI_OP_FLAGS_PPTR_REPLACE;
+ else
+ log_op = XFS_ATTRI_OP_FLAGS_REPLACE;
+ break;
+ case XFS_ATTR_DEFER_REMOVE:
+ if (is_pptr)
+ log_op = XFS_ATTRI_OP_FLAGS_PPTR_REMOVE;
+ else
+ log_op = XFS_ATTRI_OP_FLAGS_REMOVE;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ new->xattri_op_flags = log_op;
+
+ /* Set up initial attr operation state. */
+ switch (log_op) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_SET:
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ ASSERT(args->new_valuelen == args->valuelen);
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ }
+
+ xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+}
+
const struct xfs_defer_op_type xfs_attr_defer_type = {
.name = "attr",
.max_items = 1,
@@ -691,6 +936,56 @@ const struct xfs_defer_op_type xfs_attr_defer_type = {
.relog_intent = xfs_attr_relog_intent,
};
+static inline void *
+xfs_attri_validate_name_iovec(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attri_formatp,
+ const struct xfs_log_iovec *iovec,
+ unsigned int name_len)
+{
+ if (iovec->i_len != xlog_calc_iovec_len(name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ return NULL;
+ }
+
+ if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr,
+ name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ iovec->i_addr, iovec->i_len);
+ return NULL;
+ }
+
+ return iovec->i_addr;
+}
+
+static inline void *
+xfs_attri_validate_value_iovec(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attri_formatp,
+ const struct xfs_log_iovec *iovec,
+ unsigned int value_len)
+{
+ if (iovec->i_len != xlog_calc_iovec_len(value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ return NULL;
+ }
+
+ if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) &&
+ !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, sizeof(*attri_formatp));
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ iovec->i_addr, iovec->i_len);
+ return NULL;
+ }
+
+ return iovec->i_addr;
+}
+
STATIC int
xlog_recover_attri_commit_pass2(
struct xlog *log,
@@ -702,51 +997,177 @@ xlog_recover_attri_commit_pass2(
struct xfs_attri_log_item *attrip;
struct xfs_attri_log_format *attri_formatp;
struct xfs_attri_log_nameval *nv;
- const void *attr_value = NULL;
const void *attr_name;
+ const void *attr_value = NULL;
+ const void *attr_new_name = NULL;
+ const void *attr_new_value = NULL;
size_t len;
-
- attri_formatp = item->ri_buf[0].i_addr;
- attr_name = item->ri_buf[1].i_addr;
+ unsigned int name_len = 0;
+ unsigned int value_len = 0;
+ unsigned int new_name_len = 0;
+ unsigned int new_value_len = 0;
+ unsigned int op, i = 0;
/* Validate xfs_attri_log_format before the large memory allocation */
len = sizeof(struct xfs_attri_log_format);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[i].i_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
+ attri_formatp = item->ri_buf[i].i_addr;
if (!xfs_attri_validate(mp, attri_formatp)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
- /* Validate the attr name */
- if (item->ri_buf[1].i_len !=
- xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
+ /* Check the number of log iovecs makes sense for the op code. */
+ op = xfs_attr_log_item_op(attri_formatp);
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ /* Log item, attr name, attr value */
+ if (item->ri_total != 3) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_name_len;
+ value_len = attri_formatp->alfi_value_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ /* Log item, attr name, attr value */
+ if (item->ri_total != 3) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_name_len;
+ value_len = attri_formatp->alfi_value_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ /* Log item, attr name */
+ if (item->ri_total != 2) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_name_len;
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ /*
+ * Log item, attr name, new attr name, attr value, new attr
+ * value
+ */
+ if (item->ri_total != 5) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ name_len = attri_formatp->alfi_old_name_len;
+ new_name_len = attri_formatp->alfi_new_name_len;
+ new_value_len = value_len = attri_formatp->alfi_value_len;
+ break;
+ default:
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
+ i++;
- if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
+ /* Validate the attr name */
+ attr_name = xfs_attri_validate_name_iovec(mp, attri_formatp,
+ &item->ri_buf[i], name_len);
+ if (!attr_name)
return -EFSCORRUPTED;
+ i++;
+
+ /* Validate the new attr name */
+ if (new_name_len > 0) {
+ attr_new_name = xfs_attri_validate_name_iovec(mp,
+ attri_formatp, &item->ri_buf[i],
+ new_name_len);
+ if (!attr_new_name)
+ return -EFSCORRUPTED;
+ i++;
}
/* Validate the attr value, if present */
- if (attri_formatp->alfi_value_len != 0) {
- if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ if (value_len != 0) {
+ attr_value = xfs_attri_validate_value_iovec(mp, attri_formatp,
+ &item->ri_buf[i], value_len);
+ if (!attr_value)
+ return -EFSCORRUPTED;
+ i++;
+ }
+
+ /* Validate the new attr value, if present */
+ if (new_value_len != 0) {
+ attr_new_value = xfs_attri_validate_value_iovec(mp,
+ attri_formatp, &item->ri_buf[i],
+ new_value_len);
+ if (!attr_new_value)
+ return -EFSCORRUPTED;
+ i++;
+ }
+
+ /*
+ * Make sure we got the correct number of buffers for the operation
+ * that we just loaded.
+ */
+ if (i != item->ri_total) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ /* Regular remove operations operate only on names. */
+ if (attr_value != NULL || value_len != 0) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr,
- item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
-
- attr_value = item->ri_buf[2].i_addr;
+ fallthrough;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
+ case XFS_ATTRI_OP_FLAGS_PPTR_SET:
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ /*
+ * Regular xattr set/remove/replace operations require a name
+ * and do not take a newname. Values are optional for set and
+ * replace.
+ *
+ * Name-value set/remove operations must have a name, do not
+ * take a newname, and can take a value.
+ */
+ if (attr_name == NULL || name_len == 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
+ case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
+ /*
+ * Name-value replace operations require the caller to
+ * specify the old and new names and values explicitly.
+ * Both values were already validated above.
+ */
+ if (attr_name == NULL || name_len == 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ if (attr_new_name == NULL || new_name_len == 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
}
/*
@@ -754,9 +1175,10 @@ xlog_recover_attri_commit_pass2(
* name/value buffer to the recovered incore log item and drop our
* reference.
*/
- nv = xfs_attri_log_nameval_alloc(attr_name,
- attri_formatp->alfi_name_len, attr_value,
- attri_formatp->alfi_value_len);
+ nv = xfs_attri_log_nameval_alloc(attr_name, name_len,
+ attr_new_name, new_name_len,
+ attr_value, value_len,
+ attr_new_value, new_value_len);
attrip = xfs_attri_init(mp, nv);
memcpy(&attrip->attri_format, attri_formatp, len);
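xfs_attri_log_nameval_alloc() packs name, new_name, value, and new_value into one allocation, laid out in copy order after the header, so each i_addr is just base plus the lengths that precede it. A sketch of that offset math, assuming base is nv->name.i_addr (i.e. nv + 1); the helper is illustrative only:

/* Hypothetical: where new_value lands in the shared allocation. */
static inline void *
example_new_value_ptr(
	void		*base,
	unsigned int	name_len,
	unsigned int	new_name_len,
	unsigned int	value_len)
{
	return base + name_len + new_name_len + value_len;
}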
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
index 3280a7930287..e74128cbb722 100644
--- a/fs/xfs/xfs_attr_item.h
+++ b/fs/xfs/xfs_attr_item.h
@@ -13,7 +13,9 @@ struct kmem_zone;
struct xfs_attri_log_nameval {
struct xfs_log_iovec name;
+ struct xfs_log_iovec new_name; /* PPTR_REPLACE only */
struct xfs_log_iovec value;
+ struct xfs_log_iovec new_value; /* PPTR_REPLACE only */
refcount_t refcount;
/* name and value follow the end of this struct */
@@ -51,4 +53,12 @@ struct xfs_attrd_log_item {
extern struct kmem_cache *xfs_attri_cache;
extern struct kmem_cache *xfs_attrd_cache;
+enum xfs_attr_defer_op {
+ XFS_ATTR_DEFER_SET,
+ XFS_ATTR_DEFER_REMOVE,
+ XFS_ATTR_DEFER_REPLACE,
+};
+
+void xfs_attr_defer_add(struct xfs_da_args *args, enum xfs_attr_defer_op op);
+
#endif /* __XFS_ATTR_ITEM_H__ */
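Callers now pick a logical operation and let xfs_attr_defer_add() translate it into the right intent opcode for the namespace. A hedged sketch of queueing a parent pointer set, assuming args->value already holds a struct xfs_parent_rec as the asserts require; the function name is hypothetical:

/* Hypothetical caller: queue a logged parent pointer set. */
static void
example_defer_pptr_set(
	struct xfs_da_args	*args)
{
	args->attr_filter = XFS_ATTR_PARENT;
	args->op_flags |= XFS_DA_OP_LOGGED;
	xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET);
}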
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a6819a642cc0..5c947e5ce8b8 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -82,7 +82,8 @@ xfs_attr_shortform_list(
(dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) {
for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(sfe->nameval,
+ !xfs_attr_namecheck(sfe->flags,
+ sfe->nameval,
sfe->namelen))) {
xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
@@ -91,6 +92,7 @@ xfs_attr_shortform_list(
sfe->flags,
sfe->nameval,
(int)sfe->namelen,
+ &sfe->nameval[sfe->namelen],
(int)sfe->valuelen);
/*
* Either search callback finished early or
@@ -122,7 +124,8 @@ xfs_attr_shortform_list(
for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
+ ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)) ||
+ !xfs_attr_check_namespace(sfe->flags))) {
XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
@@ -133,12 +136,16 @@ xfs_attr_shortform_list(
}
sbp->entno = i;
- sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
sbp->name = sfe->nameval;
sbp->namelen = sfe->namelen;
/* These are bytes, and both on-disk, don't endian-flip */
+ sbp->value = &sfe->nameval[sfe->namelen];
sbp->valuelen = sfe->valuelen;
sbp->flags = sfe->flags;
+ sbp->hash = xfs_attr_hashval(dp->i_mount, sfe->flags,
+ sfe->nameval, sfe->namelen,
+ sfe->nameval + sfe->namelen,
+ sfe->valuelen);
sfe = xfs_attr_sf_nextentry(sfe);
sbp++;
nsbuf++;
@@ -177,7 +184,7 @@ xfs_attr_shortform_list(
cursor->offset = 0;
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(sbp->name,
+ !xfs_attr_namecheck(sbp->flags, sbp->name,
sbp->namelen))) {
xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
@@ -187,6 +194,7 @@ xfs_attr_shortform_list(
sbp->flags,
sbp->name,
sbp->namelen,
+ sbp->value,
sbp->valuelen);
if (context->seen_enough)
break;
@@ -214,6 +222,7 @@ xfs_attr_node_list_lookup(
struct xfs_mount *mp = dp->i_mount;
struct xfs_trans *tp = context->tp;
struct xfs_buf *bp;
+ xfs_failaddr_t fa;
int i;
int error = 0;
unsigned int expected_level = 0;
@@ -238,6 +247,10 @@ xfs_attr_node_list_lookup(
goto out_corruptbuf;
}
+ fa = xfs_da3_node_header_check(bp, dp->i_ino);
+ if (fa)
+ goto out_corruptbuf;
+
xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
/* Tree taller than we can handle; bail out! */
@@ -273,6 +286,12 @@ xfs_attr_node_list_lookup(
}
}
+ fa = xfs_attr3_leaf_header_check(bp, dp->i_ino);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ goto out_releasebuf;
+ }
+
if (expected_level != 0)
goto out_corruptbuf;
@@ -281,6 +300,7 @@ xfs_attr_node_list_lookup(
out_corruptbuf:
xfs_buf_mark_corrupt(bp);
+out_releasebuf:
xfs_trans_brelse(tp, bp);
xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
@@ -297,6 +317,7 @@ xfs_attr_node_list(
struct xfs_buf *bp;
struct xfs_inode *dp = context->dp;
struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
int error = 0;
trace_xfs_attr_node_list(context);
@@ -310,46 +331,60 @@ xfs_attr_node_list(
*/
bp = NULL;
if (cursor->blkno > 0) {
+ struct xfs_attr_leaf_entry *entries;
+
error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
XFS_ATTR_FORK);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
- if ((error != 0) && (error != -EFSCORRUPTED))
+ if (error != 0 && error != -EFSCORRUPTED)
return error;
- if (bp) {
- struct xfs_attr_leaf_entry *entries;
+ if (!bp)
+ goto need_lookup;
- node = bp->b_addr;
- switch (be16_to_cpu(node->hdr.info.magic)) {
- case XFS_DA_NODE_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- trace_xfs_attr_list_wrong_blk(context);
+ node = bp->b_addr;
+ switch (be16_to_cpu(node->hdr.info.magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ trace_xfs_attr_list_wrong_blk(context);
+ fa = xfs_da3_node_header_check(bp, dp->i_ino);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
+ }
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ leaf = bp->b_addr;
+ fa = xfs_attr3_leaf_header_check(bp, dp->i_ino);
+ if (fa) {
+ __xfs_buf_mark_corrupt(bp, fa);
xfs_trans_brelse(context->tp, bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
bp = NULL;
break;
- case XFS_ATTR_LEAF_MAGIC:
- case XFS_ATTR3_LEAF_MAGIC:
- leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
- &leafhdr, leaf);
- entries = xfs_attr3_leaf_entryp(leaf);
- if (cursor->hashval > be32_to_cpu(
- entries[leafhdr.count - 1].hashval)) {
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(context->tp, bp);
- bp = NULL;
- } else if (cursor->hashval <= be32_to_cpu(
- entries[0].hashval)) {
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(context->tp, bp);
- bp = NULL;
- }
- break;
- default:
+ }
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
+ &leafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (cursor->hashval > be32_to_cpu(
+ entries[leafhdr.count - 1].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
+ } else if (cursor->hashval <= be32_to_cpu(
+ entries[0].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
xfs_trans_brelse(context->tp, bp);
bp = NULL;
}
+ break;
+ default:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(context->tp, bp);
+ bp = NULL;
}
}
@@ -359,6 +394,7 @@ xfs_attr_node_list(
* Note that start of node block is same as start of leaf block.
*/
if (bp == NULL) {
+need_lookup:
error = xfs_attr_node_list_lookup(context, cursor, &bp);
if (error || !bp)
return error;
@@ -380,8 +416,8 @@ xfs_attr_node_list(
break;
cursor->blkno = leafhdr.forw;
xfs_trans_brelse(context->tp, bp);
- error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno,
- &bp);
+ error = xfs_attr3_leaf_read(context->tp, dp, dp->i_ino,
+ cursor->blkno, &bp);
if (error)
return error;
}
@@ -446,6 +482,7 @@ xfs_attr3_leaf_list_int(
*/
for (; i < ichdr.count; entry++, i++) {
char *name;
+ void *value;
int namelen, valuelen;
if (be32_to_cpu(entry->hashval) != cursor->hashval) {
@@ -463,6 +500,7 @@ xfs_attr3_leaf_list_int(
name_loc = xfs_attr3_leaf_name_local(leaf, i);
name = name_loc->nameval;
namelen = name_loc->namelen;
+ value = &name_loc->nameval[name_loc->namelen];
valuelen = be16_to_cpu(name_loc->valuelen);
} else {
xfs_attr_leaf_name_remote_t *name_rmt;
@@ -470,16 +508,18 @@ xfs_attr3_leaf_list_int(
name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
name = name_rmt->name;
namelen = name_rmt->namelen;
+ value = NULL;
valuelen = be32_to_cpu(name_rmt->valuelen);
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(name, namelen))) {
+ !xfs_attr_namecheck(entry->flags, name,
+ namelen))) {
xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
context->put_listent(context, entry->flags,
- name, namelen, valuelen);
+ name, namelen, value, valuelen);
if (context->seen_enough)
break;
cursor->offset++;
@@ -501,7 +541,8 @@ xfs_attr_leaf_list(
trace_xfs_attr_leaf_list(context);
context->cursor.blkno = 0;
- error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp);
+ error = xfs_attr3_leaf_read(context->tp, context->dp,
+ context->dp->i_ino, 0, &bp);
if (error)
return error;
@@ -515,6 +556,7 @@ xfs_attr_list_ilocked(
struct xfs_attr_list_context *context)
{
struct xfs_inode *dp = context->dp;
+ int error;
xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
@@ -525,6 +567,12 @@ xfs_attr_list_ilocked(
return 0;
if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_list(context);
+
+ /* Prerequisite for xfs_attr_is_leaf */
+ error = xfs_iread_extents(NULL, dp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
if (xfs_attr_is_leaf(dp))
return xfs_attr_leaf_list(context);
return xfs_attr_node_list(context);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index d27859a684aa..a19d62e78aa1 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -524,9 +524,7 @@ xfs_bmap_recover_work(
else
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
- error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ error = xfs_iext_count_extend(tp, ip, work->bi_whichfork, iext_delta);
if (error)
goto err_cancel;
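For contrast with the hunk above, this is the open-coded pattern that xfs_iext_count_extend() replaces at every call site: probe for extent-count overflow, then attempt the nrext64 upgrade only on -EFBIG. Sketch only; the wrapper name is hypothetical:

/* The pre-patch idiom, folded into xfs_iext_count_extend(). */
static int
example_old_iext_pattern(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	int			nr_to_add)
{
	int			error;

	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_to_add);
	if (error == -EFBIG)
		error = xfs_iext_count_upgrade(tp, ip, nr_to_add);
	return error;
}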
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 19e11d1da660..ac2e77ebb54c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -440,7 +440,7 @@ out_unlock_iolock:
* if the ranges only partially overlap them, so it is up to the caller to
* ensure that partial blocks are not passed in.
*/
-int
+void
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
xfs_off_t start_byte,
@@ -452,7 +452,6 @@ xfs_bmap_punch_delalloc_range(
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
- int error = 0;
ASSERT(!xfs_need_iread_extents(ifp));
@@ -476,15 +475,13 @@ xfs_bmap_punch_delalloc_range(
continue;
}
- error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
- &got, &del);
- if (error || !xfs_iext_get_extent(ifp, &icur, &got))
+ xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del);
+ if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
}
/*
@@ -542,7 +539,7 @@ xfs_can_free_eofblocks(
* forever.
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
- if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
+ if (xfs_inode_has_bigrtalloc(ip))
end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
@@ -713,41 +710,37 @@ xfs_alloc_file_space(
if (error)
break;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error)
- goto error;
-
- error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
- &nimaps);
if (error)
goto error;
- ip->i_diflags |= XFS_DIFLAG_PREALLOC;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- break;
-
/*
* If the allocator cannot find a single free extent large
* enough to cover the start block of the requested range,
- * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+ * xfs_bmapi_write will return -ENOSR.
*
* In that case we simply need to keep looping with the same
* startoffset_fsb so that one of the following allocations
* will eventually reach the requested range.
*/
- if (nimaps) {
+ error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+ allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+ &nimaps);
+ if (error) {
+ if (error != -ENOSR)
+ goto error;
+ error = 0;
+ } else {
startoffset_fsb += imapp->br_blockcount;
allocatesize_fsb -= imapp->br_blockcount;
}
+
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
return error;
@@ -775,10 +768,8 @@ xfs_unmap_extent(
if (error)
return error;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -843,7 +834,7 @@ xfs_free_file_space(
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/* We can only free complete realtime extents. */
- if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ if (xfs_inode_has_bigrtalloc(ip)) {
startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
}
@@ -1054,10 +1045,8 @@ xfs_insert_file_space(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1283,23 +1272,17 @@ xfs_swap_extent_rmap(
trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
if (xfs_bmap_is_real_extent(&uirec)) {
- error = xfs_iext_count_may_overflow(ip,
+ error = xfs_iext_count_extend(tp, ip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
if (xfs_bmap_is_real_extent(&irec)) {
- error = xfs_iext_count_may_overflow(tip,
+ error = xfs_iext_count_extend(tp, tip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 77ecbb753ef2..51f84d8ff372 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
}
#endif /* CONFIG_XFS_RT */
-int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_off_t start_byte, xfs_off_t end_byte);
struct kgetbmap {
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f0fa02264eda..aa4dbda7b536 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -494,6 +494,9 @@ _xfs_buf_obj_cmp(
* it stale has not yet committed. i.e. we are
* reallocating a busy extent. Skip this buffer and
* continue searching for an exact match.
+ *
+ * Note: If we're scanning for incore buffers to stale, don't
+ * complain if we find non-stale buffers.
*/
if (!(map->bm_flags & XBM_LIVESCAN))
ASSERT(bp->b_flags & XBF_STALE);
@@ -2043,7 +2046,7 @@ xfs_setsize_buftarg(
btp->bt_meta_sectorsize = sectorsize;
btp->bt_meta_sectormask = sectorsize - 1;
- if (set_blocksize(btp->bt_bdev, sectorsize)) {
+ if (set_blocksize(btp->bt_bdev_file, sectorsize)) {
xfs_warn(btp->bt_mount,
"Cannot set_blocksize to %u on device %pg",
sectorsize, btp->bt_bdev);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index cf9296b7e06f..06ac5a7de60a 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -157,7 +157,7 @@ xfs_dir2_block_getdents(
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
- error = xfs_dir3_block_read(args->trans, dp, &bp);
+ error = xfs_dir3_block_read(args->trans, dp, args->owner, &bp);
if (error)
return error;
@@ -282,7 +282,8 @@ xfs_dir2_leaf_readbuf(
new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
if (new_off > *cur_off)
*cur_off = new_off;
- error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp);
+ error = xfs_dir3_data_read(args->trans, dp, args->owner,
+ map.br_startoff, 0, &bp);
if (error)
goto out;
@@ -515,7 +516,6 @@ xfs_readdir(
{
struct xfs_da_args args = { NULL };
unsigned int lock_mode;
- bool isblock;
int error;
trace_xfs_readdir(dp);
@@ -532,23 +532,24 @@ xfs_readdir(
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
args.trans = tp;
+ args.owner = dp->i_ino;
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_dir2_sf_getdents(&args, ctx);
lock_mode = xfs_ilock_data_map_shared(dp);
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- goto out_unlock;
-
- if (isblock) {
+ switch (xfs_dir2_format(&args, &error)) {
+ case XFS_DIR2_FMT_BLOCK:
error = xfs_dir2_block_getdents(&args, ctx, &lock_mode);
- goto out_unlock;
+ break;
+ case XFS_DIR2_FMT_LEAF:
+ case XFS_DIR2_FMT_NODE:
+ error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
+ break;
+ default:
+ break;
}
- error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
-
-out_unlock:
if (lock_mode)
xfs_iunlock(dp, lock_mode);
return error;
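
The readdir path now dispatches on a single format query instead of
probing with xfs_dir2_isblock(). A plausible shape for the helper the
switch above assumes (the actual xfs_dir2_format() is defined
elsewhere in the series; the leaf/node split is elided here because
this caller treats both the same way):

	enum xfs_dir2_fmt {
		XFS_DIR2_FMT_SF,
		XFS_DIR2_FMT_BLOCK,
		XFS_DIR2_FMT_LEAF,
		XFS_DIR2_FMT_NODE,
		XFS_DIR2_FMT_ERROR,
	};

	/* Hypothetical sketch built on the old boolean probe. */
	enum xfs_dir2_fmt
	xfs_dir2_format(
		struct xfs_da_args	*args,
		int			*error)
	{
		bool			isblock;

		*error = xfs_dir2_isblock(args, &isblock);
		if (*error)
			return XFS_DIR2_FMT_ERROR;
		return isblock ? XFS_DIR2_FMT_BLOCK : XFS_DIR2_FMT_LEAF;
	}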
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 268bb734dc0a..25fe3b932b5a 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -145,14 +145,18 @@ xfs_discard_extents(
return error;
}
+struct xfs_trim_cur {
+ xfs_agblock_t start;
+ xfs_extlen_t count;
+ xfs_agblock_t end;
+ xfs_extlen_t minlen;
+ bool by_bno;
+};
static int
xfs_trim_gather_extents(
struct xfs_perag *pag,
- xfs_daddr_t start,
- xfs_daddr_t end,
- xfs_daddr_t minlen,
- struct xfs_alloc_rec_incore *tcur,
+ struct xfs_trim_cur *tcur,
struct xfs_busy_extents *extents,
uint64_t *blocks_trimmed)
{
@@ -179,21 +183,26 @@ xfs_trim_gather_extents(
if (error)
goto out_trans_cancel;
- cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
-
- /*
- * Look up the extent length requested in the AGF and start with it.
- */
- if (tcur->ar_startblock == NULLAGBLOCK)
- error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
- else
- error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
- tcur->ar_blockcount, &i);
+ if (tcur->by_bno) {
+ /* sub-AG discard request always starts at tcur->start */
+ cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
+ if (!error && !i)
+ error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
+ } else if (tcur->start == 0) {
+ /* first time through a by-len starts with max length */
+ cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
+ } else {
+ /* nth time through a by-len starts where we left off */
+ cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
+ }
if (error)
goto out_del_cursor;
if (i == 0) {
/* nothing of that length left in the AG, we are done */
- tcur->ar_blockcount = 0;
+ tcur->count = 0;
goto out_del_cursor;
}
@@ -204,8 +213,6 @@ xfs_trim_gather_extents(
while (i) {
xfs_agblock_t fbno;
xfs_extlen_t flen;
- xfs_daddr_t dbno;
- xfs_extlen_t dlen;
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
@@ -221,38 +228,46 @@ xfs_trim_gather_extents(
* Update the cursor to point at this extent so we
* restart the next batch from this extent.
*/
- tcur->ar_startblock = fbno;
- tcur->ar_blockcount = flen;
- break;
- }
-
- /*
- * use daddr format for all range/len calculations as that is
- * the format the range/len variables are supplied in by
- * userspace.
- */
- dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno);
- dlen = XFS_FSB_TO_BB(mp, flen);
-
- /*
- * Too small? Give up.
- */
- if (dlen < minlen) {
- trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
- tcur->ar_blockcount = 0;
+ tcur->start = fbno;
+ tcur->count = flen;
break;
}
/*
* If the extent is entirely outside of the range we are
- * supposed to discard skip it. Do not bother to trim
- * down partially overlapping ranges for now.
+	 * supposed to discard, skip it. Do not bother to trim down partially
+ * overlapping ranges for now.
*/
- if (dbno + dlen < start || dbno > end) {
+ if (fbno + flen < tcur->start) {
+ trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
+ goto next_extent;
+ }
+ if (fbno > tcur->end) {
trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
+ if (tcur->by_bno) {
+ tcur->count = 0;
+ break;
+ }
goto next_extent;
}
+ /* Trim the extent returned to the range we want. */
+ if (fbno < tcur->start) {
+ flen -= tcur->start - fbno;
+ fbno = tcur->start;
+ }
+ if (fbno + flen > tcur->end + 1)
+ flen = tcur->end - fbno + 1;
+
+ /* Too small? Give up. */
+ if (flen < tcur->minlen) {
+ trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+ if (tcur->by_bno)
+ goto next_extent;
+ tcur->count = 0;
+ break;
+ }
+
/*
* If any blocks in the range are still busy, skip the
* discard and try again the next time.
@@ -266,7 +281,10 @@ xfs_trim_gather_extents(
&extents->extent_list);
*blocks_trimmed += flen;
next_extent:
- error = xfs_btree_decrement(cur, 0, &i);
+ if (tcur->by_bno)
+ error = xfs_btree_increment(cur, 0, &i);
+ else
+ error = xfs_btree_decrement(cur, 0, &i);
if (error)
break;
@@ -276,7 +294,7 @@ next_extent:
* is no more extents to search.
*/
if (i == 0)
- tcur->ar_blockcount = 0;
+ tcur->count = 0;
}
/*
@@ -306,17 +324,22 @@ xfs_trim_should_stop(void)
static int
xfs_trim_extents(
struct xfs_perag *pag,
- xfs_daddr_t start,
- xfs_daddr_t end,
- xfs_daddr_t minlen,
+ xfs_agblock_t start,
+ xfs_agblock_t end,
+ xfs_extlen_t minlen,
uint64_t *blocks_trimmed)
{
- struct xfs_alloc_rec_incore tcur = {
- .ar_blockcount = pag->pagf_longest,
- .ar_startblock = NULLAGBLOCK,
+ struct xfs_trim_cur tcur = {
+ .start = start,
+ .count = pag->pagf_longest,
+ .end = end,
+ .minlen = minlen,
};
int error = 0;
+ if (start != 0 || end != pag->block_count)
+ tcur.by_bno = true;
+
do {
struct xfs_busy_extents *extents;
@@ -330,8 +353,8 @@ xfs_trim_extents(
extents->owner = extents;
INIT_LIST_HEAD(&extents->extent_list);
- error = xfs_trim_gather_extents(pag, start, end, minlen,
- &tcur, extents, blocks_trimmed);
+ error = xfs_trim_gather_extents(pag, &tcur, extents,
+ blocks_trimmed);
if (error) {
kfree(extents);
break;
@@ -354,7 +377,7 @@ xfs_trim_extents(
if (xfs_trim_should_stop())
break;
- } while (tcur.ar_blockcount != 0);
+ } while (tcur.count != 0);
return error;
@@ -378,8 +401,10 @@ xfs_ioc_trim(
unsigned int granularity =
bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
struct fstrim_range range;
- xfs_daddr_t start, end, minlen;
- xfs_agnumber_t agno;
+ xfs_daddr_t start, end;
+ xfs_extlen_t minlen;
+ xfs_agnumber_t start_agno, end_agno;
+ xfs_agblock_t start_agbno, end_agbno;
uint64_t blocks_trimmed = 0;
int error, last_error = 0;
@@ -399,7 +424,8 @@ xfs_ioc_trim(
return -EFAULT;
range.minlen = max_t(u64, granularity, range.minlen);
- minlen = BTOBB(range.minlen);
+ minlen = XFS_B_TO_FSB(mp, range.minlen);
+
/*
* Truncating down the len isn't actually quite correct, but using
* BBTOB would mean we trivially get overflows for values
@@ -413,15 +439,21 @@ xfs_ioc_trim(
return -EINVAL;
start = BTOBB(range.start);
- end = start + BTOBBT(range.len) - 1;
+ end = min_t(xfs_daddr_t, start + BTOBBT(range.len),
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1;
+
+ start_agno = xfs_daddr_to_agno(mp, start);
+ start_agbno = xfs_daddr_to_agbno(mp, start);
+ end_agno = xfs_daddr_to_agno(mp, end);
+ end_agbno = xfs_daddr_to_agbno(mp, end);
- if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
- end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1;
+ for_each_perag_range(mp, start_agno, end_agno, pag) {
+ xfs_agblock_t agend = pag->block_count;
- agno = xfs_daddr_to_agno(mp, start);
- for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
- error = xfs_trim_extents(pag, start, end, minlen,
- &blocks_trimmed);
+ if (start_agno == end_agno)
+ agend = end_agbno;
+ error = xfs_trim_extents(pag, start_agbno, agend, minlen,
+ &blocks_trimmed);
if (error)
last_error = error;
@@ -429,6 +461,7 @@ xfs_ioc_trim(
xfs_perag_rele(pag);
break;
}
+ start_agbno = 0;
}
if (last_error)
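
A worked example of the per-AG clamping above, with assumed geometry
(4096-byte blocks, sb_agblocks = 1048576, sb_dblocks = 4194304, so
four 4 GiB AGs):

	/*
	 * fstrim_range: start = 6 GiB, len = 4 GiB.
	 *   start daddr -> fsblock 1572864 -> start_agno 1, start_agbno 524288
	 *   end daddr   -> fsblock 2621439 -> end_agno 2,   end_agbno 524287
	 *
	 * Loop: AG 1 trims agbnos [524288, block_count) since
	 * start_agno != end_agno, AG 2 trims [0, 524287] since agend is
	 * clamped to end_agbno, and start_agbno resets to 0 between AGs.
	 * Both passes take the by_bno cursor path because neither covers
	 * a whole AG.
	 */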
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index c98cb468c357..c1b211c260a9 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -341,11 +341,8 @@ xfs_dquot_disk_alloc(
goto err_cancel;
}
- error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, quotip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto err_cancel;
@@ -357,7 +354,6 @@ xfs_dquot_disk_alloc(
goto err_cancel;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
- ASSERT(nmaps == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -1371,6 +1367,47 @@ xfs_dqlock2(
}
}
+static int
+xfs_dqtrx_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_dqtrx *qa = a;
+ const struct xfs_dqtrx *qb = b;
+
+ if (qa->qt_dquot->q_id > qb->qt_dquot->q_id)
+ return 1;
+ if (qa->qt_dquot->q_id < qb->qt_dquot->q_id)
+ return -1;
+ return 0;
+}
+
+void
+xfs_dqlockn(
+ struct xfs_dqtrx *q)
+{
+ unsigned int i;
+
+ BUILD_BUG_ON(XFS_QM_TRANS_MAXDQS > MAX_LOCKDEP_SUBCLASSES);
+
+ /* Sort in order of dquot id, do not allow duplicates */
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) {
+ unsigned int j;
+
+ for (j = 0; j < i; j++)
+ ASSERT(q[i].qt_dquot != q[j].qt_dquot);
+ }
+ if (i == 0)
+ return;
+
+ sort(q, i, sizeof(struct xfs_dqtrx), xfs_dqtrx_cmp, NULL);
+
+ mutex_lock(&q[0].qt_dquot->q_qlock);
+ for (i = 1; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++)
+ mutex_lock_nested(&q[i].qt_dquot->q_qlock,
+ XFS_QLOCK_NESTED + i - 1);
+}
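+
+A hedged usage sketch for the new helper: instead of pairwise
+xfs_dqlock2() calls, a transaction with up to XFS_QM_TRANS_MAXDQS
+dquots locks them all in one shot, relying on the id sort for a
+stable lock order:
+
+	/*
+	 * Illustrative only; qa would be the per-transaction dqtrx
+	 * array a caller in the quota trans code passes in, with
+	 * qt_dquot filled for each dquot the transaction touches and
+	 * NULL-terminated before XFS_QM_TRANS_MAXDQS entries.
+	 */
+	struct xfs_dqtrx	qa[XFS_QM_TRANS_MAXDQS] = { };
+
+	qa[0].qt_dquot = udq;	/* user dquot, hypothetical locals */
+	qa[1].qt_dquot = gdq;	/* group dquot */
+	xfs_dqlockn(qa);	/* sorts by q_id, locks with rising subclass */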
+
int __init
xfs_qm_init(void)
{
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 956272d9b302..677bb2dc9ac9 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -223,6 +223,7 @@ int xfs_qm_dqget_uncached(struct xfs_mount *mp,
void xfs_qm_dqput(struct xfs_dquot *dqp);
void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+void xfs_dqlockn(struct xfs_dqtrx *q);
void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 7ad0e92c6b5b..78cdc5064a8c 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -62,6 +62,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_ATTR_LEAF_TO_NODE,
XFS_RANDOM_WB_DELAY_MS,
XFS_RANDOM_WRITE_DELAY_MS,
+ XFS_RANDOM_EXCHMAPS_FINISH_ONE,
};
struct xfs_errortag_attr {
@@ -179,6 +180,7 @@ XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
+XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -224,6 +226,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
+ XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c
new file mode 100644
index 000000000000..264a121c5e16
--- /dev/null
+++ b/fs/xfs/xfs_exchmaps_item.c
@@ -0,0 +1,614 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_exchmaps_item.h"
+#include "xfs_exchmaps.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_exchrange.h"
+#include "xfs_trace.h"
+
+struct kmem_cache *xfs_xmi_cache;
+struct kmem_cache *xfs_xmd_cache;
+
+static const struct xfs_item_ops xfs_xmi_item_ops;
+
+static inline struct xfs_xmi_log_item *XMI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_xmi_log_item, xmi_item);
+}
+
+STATIC void
+xfs_xmi_item_free(
+ struct xfs_xmi_log_item *xmi_lip)
+{
+ kvfree(xmi_lip->xmi_item.li_lv_shadow);
+ kmem_cache_free(xfs_xmi_cache, xmi_lip);
+}
+
+/*
+ * Freeing the XMI requires that we remove it from the AIL if it has already
+ * been placed there. However, the XMI may not yet have been placed in the AIL
+ * when xfs_xmi_release() is called from XMD processing, due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the XMI.
+ */
+STATIC void
+xfs_xmi_release(
+ struct xfs_xmi_log_item *xmi_lip)
+{
+ ASSERT(atomic_read(&xmi_lip->xmi_refcount) > 0);
+ if (atomic_dec_and_test(&xmi_lip->xmi_refcount)) {
+ xfs_trans_ail_delete(&xmi_lip->xmi_item, 0);
+ xfs_xmi_item_free(xmi_lip);
+ }
+}
+
+STATIC void
+xfs_xmi_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_xmi_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the given xmi log
+ * item. We use only 1 iovec, and we point that at the xmi_log_format structure
+ * embedded in the xmi item.
+ */
+STATIC void
+xfs_xmi_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ xmi_lip->xmi_format.xmi_type = XFS_LI_XMI;
+ xmi_lip->xmi_format.xmi_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMI_FORMAT,
+ &xmi_lip->xmi_format,
+ sizeof(struct xfs_xmi_log_format));
+}
+
+/*
+ * The unpin operation is the last place an XMI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the XMI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the XMI to either construct
+ * and commit the XMD or drop the XMD's reference in the event of error. Simply
+ * drop the log's XMI reference now that the log is done with it.
+ */
+STATIC void
+xfs_xmi_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip);
+
+ xfs_xmi_release(xmi_lip);
+}
+
+/*
+ * The XMI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an XMD isn't going to be
+ * constructed and thus we free the XMI here directly.
+ */
+STATIC void
+xfs_xmi_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_xmi_release(XMI_ITEM(lip));
+}
+
+/* Allocate and initialize an xmi item. */
+STATIC struct xfs_xmi_log_item *
+xfs_xmi_init(
+ struct xfs_mount *mp)
+{
+ struct xfs_xmi_log_item *xmi_lip;
+
+ xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops);
+ xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip;
+ atomic_set(&xmi_lip->xmi_refcount, 2);
+
+ return xmi_lip;
+}
+
+static inline struct xfs_xmd_log_item *XMD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_xmd_log_item, xmd_item);
+}
+
+STATIC void
+xfs_xmd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_xmd_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the given xmd log
+ * item. We use only 1 iovec, and we point that at the xmd_log_format structure
+ * embedded in the xmd item.
+ */
+STATIC void
+xfs_xmd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_xmd_log_item *xmd_lip = XMD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ xmd_lip->xmd_format.xmd_type = XFS_LI_XMD;
+ xmd_lip->xmd_format.xmd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMD_FORMAT, &xmd_lip->xmd_format,
+ sizeof(struct xfs_xmd_log_format));
+}
+
+/*
+ * The XMD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the XMI and free the
+ * XMD.
+ */
+STATIC void
+xfs_xmd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_xmd_log_item *xmd_lip = XMD_ITEM(lip);
+
+ xfs_xmi_release(xmd_lip->xmd_intent_log_item);
+ kvfree(xmd_lip->xmd_item.li_lv_shadow);
+ kmem_cache_free(xfs_xmd_cache, xmd_lip);
+}
+
+static struct xfs_log_item *
+xfs_xmd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &XMD_ITEM(lip)->xmd_intent_log_item->xmi_item;
+}
+
+static const struct xfs_item_ops xfs_xmd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_xmd_item_size,
+ .iop_format = xfs_xmd_item_format,
+ .iop_release = xfs_xmd_item_release,
+ .iop_intent = xfs_xmd_item_intent,
+};
+
+/* Log file mapping exchange information in the intent item. */
+STATIC struct xfs_log_item *
+xfs_exchmaps_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_xmi_log_item *xmi_lip;
+ struct xfs_exchmaps_intent *xmi;
+ struct xfs_xmi_log_format *xlf;
+
+ ASSERT(count == 1);
+
+ xmi = list_first_entry_or_null(items, struct xfs_exchmaps_intent,
+ xmi_list);
+
+ xmi_lip = xfs_xmi_init(tp->t_mountp);
+ xlf = &xmi_lip->xmi_format;
+
+ xlf->xmi_inode1 = xmi->xmi_ip1->i_ino;
+ xlf->xmi_igen1 = VFS_I(xmi->xmi_ip1)->i_generation;
+ xlf->xmi_inode2 = xmi->xmi_ip2->i_ino;
+ xlf->xmi_igen2 = VFS_I(xmi->xmi_ip2)->i_generation;
+ xlf->xmi_startoff1 = xmi->xmi_startoff1;
+ xlf->xmi_startoff2 = xmi->xmi_startoff2;
+ xlf->xmi_blockcount = xmi->xmi_blockcount;
+ xlf->xmi_isize1 = xmi->xmi_isize1;
+ xlf->xmi_isize2 = xmi->xmi_isize2;
+ xlf->xmi_flags = xmi->xmi_flags & XFS_EXCHMAPS_LOGGED_FLAGS;
+
+ return &xmi_lip->xmi_item;
+}
+
+STATIC struct xfs_log_item *
+xfs_exchmaps_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(intent);
+ struct xfs_xmd_log_item *xmd_lip;
+
+ xmd_lip = kmem_cache_zalloc(xfs_xmd_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &xmd_lip->xmd_item, XFS_LI_XMD,
+ &xfs_xmd_item_ops);
+ xmd_lip->xmd_intent_log_item = xmi_lip;
+ xmd_lip->xmd_format.xmd_xmi_id = xmi_lip->xmi_format.xmi_id;
+
+ return &xmd_lip->xmd_item;
+}
+
+/* Add this deferred XMI to the transaction. */
+void
+xfs_exchmaps_defer_add(
+ struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi)
+{
+ trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
+
+ xfs_defer_add(tp, &xmi->xmi_list, &xfs_exchmaps_defer_type);
+}
+
+static inline struct xfs_exchmaps_intent *xmi_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_exchmaps_intent, xmi_list);
+}
+
+/* Cancel a deferred file mapping exchange. */
+STATIC void
+xfs_exchmaps_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_exchmaps_intent *xmi = xmi_entry(item);
+
+ kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
+}
+
+/* Process a deferred file mapping exchange. */
+STATIC int
+xfs_exchmaps_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_exchmaps_intent *xmi = xmi_entry(item);
+ int error;
+
+ /*
+	 * Exchange one more mapping between the two files. If there's still more
+ * work to do, we want to requeue ourselves after all other pending
+ * deferred operations have finished. This includes all of the dfops
+ * that we queued directly as well as any new ones created in the
+ * process of finishing the others. Doing so prevents us from queuing
+ * a large number of XMI log items in kernel memory, which in turn
+ * prevents us from pinning the tail of the log (while logging those
+ * new XMI items) until the first XMI items can be processed.
+ */
+ error = xfs_exchmaps_finish_one(tp, xmi);
+ if (error != -EAGAIN)
+ xfs_exchmaps_cancel_item(item);
+ return error;
+}
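+
+The -EAGAIN return is the requeue signal described in the comment
+above. A hedged caller-side sketch of the contract (the real loop is
+in the defer machinery, not shown here):
+
+	/* Sketch only, not the actual xfs_defer code. */
+	error = ops->finish_item(tp, done, item, &state);
+	if (error == -EAGAIN) {
+		/*
+		 * Partial progress: keep the intent item and move it
+		 * behind every other pending dfop so they all run
+		 * before we retry, bounding incore XMI buildup.
+		 */
+		defer_requeue_last(dfp);	/* hypothetical helper */
+	} else if (error) {
+		/* hard failure: the whole chain is aborted */
+	}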
+
+/* Abort all pending XMIs. */
+STATIC void
+xfs_exchmaps_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_xmi_release(XMI_ITEM(intent));
+}
+
+/* Is this recovered XMI ok? */
+static inline bool
+xfs_xmi_validate(
+ struct xfs_mount *mp,
+ struct xfs_xmi_log_item *xmi_lip)
+{
+ struct xfs_xmi_log_format *xlf = &xmi_lip->xmi_format;
+
+ if (!xfs_has_exchange_range(mp))
+ return false;
+
+ if (xmi_lip->xmi_format.__pad != 0)
+ return false;
+
+ if (xlf->xmi_flags & ~XFS_EXCHMAPS_LOGGED_FLAGS)
+ return false;
+
+ if (!xfs_verify_ino(mp, xlf->xmi_inode1) ||
+ !xfs_verify_ino(mp, xlf->xmi_inode2))
+ return false;
+
+ if (!xfs_verify_fileext(mp, xlf->xmi_startoff1, xlf->xmi_blockcount))
+ return false;
+
+ return xfs_verify_fileext(mp, xlf->xmi_startoff2, xlf->xmi_blockcount);
+}
+
+/*
+ * Use the recovered log state to create a new request, estimate resource
+ * requirements, and create a new incore intent state.
+ */
+STATIC struct xfs_exchmaps_intent *
+xfs_xmi_item_recover_intent(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ const struct xfs_xmi_log_format *xlf,
+ struct xfs_exchmaps_req *req,
+ struct xfs_inode **ipp1,
+ struct xfs_inode **ipp2)
+{
+ struct xfs_inode *ip1, *ip2;
+ struct xfs_exchmaps_intent *xmi;
+ int error;
+
+ /*
+ * Grab both inodes and set IRECOVERY to prevent trimming of post-eof
+ * mappings and freeing of unlinked inodes until we're totally done
+ * processing files. The ondisk format of this new log item contains
+ * file handle information, which is why recovery for other items does
+ * not check the inode generation number.
+ */
+ error = xlog_recover_iget_handle(mp, xlf->xmi_inode1, xlf->xmi_igen1,
+ &ip1);
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf,
+ sizeof(*xlf));
+ return ERR_PTR(error);
+ }
+
+ error = xlog_recover_iget_handle(mp, xlf->xmi_inode2, xlf->xmi_igen2,
+ &ip2);
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf,
+ sizeof(*xlf));
+ goto err_rele1;
+ }
+
+ req->ip1 = ip1;
+ req->ip2 = ip2;
+ req->startoff1 = xlf->xmi_startoff1;
+ req->startoff2 = xlf->xmi_startoff2;
+ req->blockcount = xlf->xmi_blockcount;
+ req->flags = xlf->xmi_flags & XFS_EXCHMAPS_PARAMS;
+
+ xfs_exchrange_ilock(NULL, ip1, ip2);
+ error = xfs_exchmaps_estimate(req);
+ xfs_exchrange_iunlock(ip1, ip2);
+ if (error)
+ goto err_rele2;
+
+ *ipp1 = ip1;
+ *ipp2 = ip2;
+ xmi = xfs_exchmaps_init_intent(req);
+ xfs_defer_add_item(dfp, &xmi->xmi_list);
+ return xmi;
+
+err_rele2:
+ xfs_irele(ip2);
+err_rele1:
+ xfs_irele(ip1);
+ req->ip2 = req->ip1 = NULL;
+ return ERR_PTR(error);
+}
+
+/* Process a file mapping exchange item that was recovered from the log. */
+STATIC int
+xfs_exchmaps_recover_work(
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ struct xfs_exchmaps_req req = { .flags = 0 };
+ struct xfs_trans_res resv;
+ struct xfs_exchmaps_intent *xmi;
+ struct xfs_log_item *lip = dfp->dfp_intent;
+ struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip);
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_trans *tp;
+ struct xfs_inode *ip1, *ip2;
+ int error = 0;
+
+ if (!xfs_xmi_validate(mp, xmi_lip)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &xmi_lip->xmi_format,
+ sizeof(xmi_lip->xmi_format));
+ return -EFSCORRUPTED;
+ }
+
+ xmi = xfs_xmi_item_recover_intent(mp, dfp, &xmi_lip->xmi_format, &req,
+ &ip1, &ip2);
+ if (IS_ERR(xmi))
+ return PTR_ERR(xmi);
+
+ trace_xfs_exchmaps_recover(mp, xmi);
+
+ resv = xlog_recover_resv(&M_RES(mp)->tr_write);
+ error = xfs_trans_alloc(mp, &resv, req.resblks, 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ xfs_exchrange_ilock(tp, ip1, ip2);
+
+ xfs_exchmaps_ensure_reflink(tp, xmi);
+ xfs_exchmaps_upgrade_extent_counts(tp, xmi);
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &xmi_lip->xmi_format,
+ sizeof(xmi_lip->xmi_format));
+ if (error)
+ goto err_cancel;
+
+ /*
+ * Commit transaction, which frees the transaction and saves the inodes
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ goto err_unlock;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+err_unlock:
+ xfs_exchrange_iunlock(ip1, ip2);
+err_rele:
+ xfs_irele(ip2);
+ xfs_irele(ip1);
+ return error;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_exchmaps_relog_intent(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ struct xfs_log_item *done_item)
+{
+ struct xfs_xmi_log_item *xmi_lip;
+ struct xfs_xmi_log_format *old_xlf, *new_xlf;
+
+ old_xlf = &XMI_ITEM(intent)->xmi_format;
+
+ xmi_lip = xfs_xmi_init(tp->t_mountp);
+ new_xlf = &xmi_lip->xmi_format;
+
+ new_xlf->xmi_inode1 = old_xlf->xmi_inode1;
+ new_xlf->xmi_inode2 = old_xlf->xmi_inode2;
+ new_xlf->xmi_igen1 = old_xlf->xmi_igen1;
+ new_xlf->xmi_igen2 = old_xlf->xmi_igen2;
+ new_xlf->xmi_startoff1 = old_xlf->xmi_startoff1;
+ new_xlf->xmi_startoff2 = old_xlf->xmi_startoff2;
+ new_xlf->xmi_blockcount = old_xlf->xmi_blockcount;
+ new_xlf->xmi_flags = old_xlf->xmi_flags;
+ new_xlf->xmi_isize1 = old_xlf->xmi_isize1;
+ new_xlf->xmi_isize2 = old_xlf->xmi_isize2;
+
+ return &xmi_lip->xmi_item;
+}
+
+const struct xfs_defer_op_type xfs_exchmaps_defer_type = {
+ .name = "exchmaps",
+ .max_items = 1,
+ .create_intent = xfs_exchmaps_create_intent,
+ .abort_intent = xfs_exchmaps_abort_intent,
+ .create_done = xfs_exchmaps_create_done,
+ .finish_item = xfs_exchmaps_finish_item,
+ .cancel_item = xfs_exchmaps_cancel_item,
+ .recover_work = xfs_exchmaps_recover_work,
+ .relog_intent = xfs_exchmaps_relog_intent,
+};
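+
+Putting the pieces together, a caller queues a mapping exchange as a
+deferred op roughly like this (sketch; the req setup and
+xfs_exchmaps_init_intent() appear elsewhere in the series):
+
+	struct xfs_exchmaps_intent	*xmi;
+
+	xmi = xfs_exchmaps_init_intent(req);
+	xfs_exchmaps_defer_add(tp, xmi);
+	/*
+	 * Finishing the transaction then logs an XMI, calls
+	 * xfs_exchmaps_finish_item() until it stops returning -EAGAIN,
+	 * and logs the matching XMD in the final transaction of the
+	 * chain.
+	 */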
+
+STATIC bool
+xfs_xmi_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return XMI_ITEM(lip)->xmi_format.xmi_id == intent_id;
+}
+
+static const struct xfs_item_ops xfs_xmi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_xmi_item_size,
+ .iop_format = xfs_xmi_item_format,
+ .iop_unpin = xfs_xmi_item_unpin,
+ .iop_release = xfs_xmi_item_release,
+ .iop_match = xfs_xmi_item_match,
+};
+
+/*
+ * This routine is called to create an in-core file mapping exchange item from
+ * the xmi format structure which was logged on disk. It allocates an in-core
+ * xmi, copies the exchange information from the format structure into it, and
+ * adds the xmi to the AIL with the given LSN.
+ */
+STATIC int
+xlog_recover_xmi_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_xmi_log_item *xmi_lip;
+ struct xfs_xmi_log_format *xmi_formatp;
+ size_t len;
+
+ len = sizeof(struct xfs_xmi_log_format);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ xmi_formatp = item->ri_buf[0].i_addr;
+ if (xmi_formatp->__pad != 0) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ xmi_lip = xfs_xmi_init(mp);
+ memcpy(&xmi_lip->xmi_format, xmi_formatp, len);
+
+ xlog_recover_intent_item(log, &xmi_lip->xmi_item, lsn,
+ &xfs_exchmaps_defer_type);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_xmi_item_ops = {
+ .item_type = XFS_LI_XMI,
+ .commit_pass2 = xlog_recover_xmi_commit_pass2,
+};
+
+/*
+ * This routine is called when an XMD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding XMI if it
+ * was still in the log. To do this it searches the AIL for the XMI with an id
+ * equal to that in the XMD format structure. If we find it we drop the XMD
+ * reference, which removes the XMI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_xmd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_xmd_log_format *xmd_formatp;
+
+ xmd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_XMI, xmd_formatp->xmd_xmi_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_xmd_item_ops = {
+ .item_type = XFS_LI_XMD,
+ .commit_pass2 = xlog_recover_xmd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_exchmaps_item.h b/fs/xfs/xfs_exchmaps_item.h
new file mode 100644
index 000000000000..efa368d25d09
--- /dev/null
+++ b/fs/xfs/xfs_exchmaps_item.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHMAPS_ITEM_H__
+#define __XFS_EXCHMAPS_ITEM_H__
+
+/*
+ * The file mapping exchange intent item helps us exchange multiple file
+ * mappings between two inode forks. It does this by tracking the range of
+ * file block offsets that still need to be exchanged, and relogs as progress
+ * happens.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * rest of the mapping exchanges.
+ */
+
+/* kernel only XMI/XMD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * This is the incore file mapping exchange intent log item. It is used to log
+ * the fact that we are exchanging mappings between two files. It is used in
+ * conjunction with the incore file mapping exchange done log item described
+ * below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_xmi_log_item {
+ struct xfs_log_item xmi_item;
+ atomic_t xmi_refcount;
+ struct xfs_xmi_log_format xmi_format;
+};
+
+/*
+ * This is the incore file mapping exchange done log item. It is used to log
+ * the fact that an exchange mentioned in an earlier xmi item has been
+ * performed.
+ */
+struct xfs_xmd_log_item {
+ struct xfs_log_item xmd_item;
+ struct xfs_xmi_log_item *xmd_intent_log_item;
+ struct xfs_xmd_log_format xmd_format;
+};
+
+extern struct kmem_cache *xfs_xmi_cache;
+extern struct kmem_cache *xfs_xmd_cache;
+
+struct xfs_exchmaps_intent;
+
+void xfs_exchmaps_defer_add(struct xfs_trans *tp,
+ struct xfs_exchmaps_intent *xmi);
+
+#endif /* __XFS_EXCHMAPS_ITEM_H__ */
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
new file mode 100644
index 000000000000..c8a655c92c92
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_util.h"
+#include "xfs_reflink.h"
+#include "xfs_trace.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_sb.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_rtbitmap.h"
+#include <linux/fsnotify.h>
+
+/* Lock (and optionally join) two inodes for a file range exchange. */
+void
+xfs_exchrange_ilock(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ if (ip1 != ip2)
+ xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
+ ip2, XFS_ILOCK_EXCL);
+ else
+ xfs_ilock(ip1, XFS_ILOCK_EXCL);
+ if (tp) {
+ xfs_trans_ijoin(tp, ip1, 0);
+ if (ip2 != ip1)
+ xfs_trans_ijoin(tp, ip2, 0);
+ }
+}
+
+/* Unlock two inodes after a file range exchange operation. */
+void
+xfs_exchrange_iunlock(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ if (ip2 != ip1)
+ xfs_iunlock(ip2, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip1, XFS_ILOCK_EXCL);
+}
+
+/*
+ * Estimate the resource requirements to exchange file contents between the two
+ * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
+ * have flushed both inodes' pagecache and active direct-ios.
+ */
+int
+xfs_exchrange_estimate(
+ struct xfs_exchmaps_req *req)
+{
+ int error;
+
+ xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
+ error = xfs_exchmaps_estimate(req);
+ xfs_exchrange_iunlock(req->ip1, req->ip2);
+ return error;
+}
+
+#define QRETRY_IP1 (0x1)
+#define QRETRY_IP2 (0x2)
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same. The qretry structure must be initialized to zeroes before the first
+ * call to this function.
+ */
+STATIC int
+xfs_exchrange_reserve_quota(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_req *req,
+ unsigned int *qretry)
+{
+ int64_t ddelta, rdelta;
+ int ip1_error = 0;
+ int error;
+
+ /*
+ * Don't bother with a quota reservation if we're not enforcing them
+ * or the two inodes have the same dquots.
+ */
+ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+ (req->ip1->i_udquot == req->ip2->i_udquot &&
+ req->ip1->i_gdquot == req->ip2->i_gdquot &&
+ req->ip1->i_pdquot == req->ip2->i_pdquot))
+ return 0;
+
+ *qretry = 0;
+
+ /*
+ * For each file, compute the net gain in the number of regular blocks
+ * that will be mapped into that file and reserve that much quota. The
+ * quota counts must be able to absorb at least that much space.
+ */
+ ddelta = req->ip2_bcount - req->ip1_bcount;
+ rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
+ if (ddelta > 0 || rdelta > 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+ ddelta > 0 ? ddelta : 0,
+ rdelta > 0 ? rdelta : 0,
+ false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ /*
+ * Save this error and see what happens if we try to
+ * reserve quota for ip2. Then report both.
+ */
+ *qretry |= QRETRY_IP1;
+ ip1_error = error;
+ error = 0;
+ }
+ if (error)
+ return error;
+ }
+ if (ddelta < 0 || rdelta < 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
+ ddelta < 0 ? -ddelta : 0,
+ rdelta < 0 ? -rdelta : 0,
+ false);
+ if (error == -EDQUOT || error == -ENOSPC)
+ *qretry |= QRETRY_IP2;
+ if (error)
+ return error;
+ }
+ if (ip1_error)
+ return ip1_error;
+
+ /*
+ * For each file, forcibly reserve the gross gain in mapped blocks so
+ * that we don't trip over any quota block reservation assertions.
+ * We must reserve the gross gain because the quota code subtracts from
+ * bcount the number of blocks that we unmap; it does not add that
+ * quantity back to the quota block reservation.
+ */
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
+ req->ip1_rtbcount, true);
+ if (error)
+ return error;
+
+ return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
+ req->ip2_rtbcount, true);
+}
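+
+A concrete example of the delta logic above, with made-up block
+counts: suppose the affected range maps 100 data blocks in file1 and
+140 in file2. Then ddelta = 140 - 100 = 40 > 0, so 40 blocks are
+reserved against ip1's dquots (file1 grows) and nothing against ip2's
+(file2 shrinks). Afterwards both inodes take the forced gross
+reservations of 100 and 140 blocks respectively, so the unmap-side
+accounting during the exchange cannot drive the transaction's quota
+reservation negative.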
+
+/* Exchange the mappings (and hence the contents) of two files' forks. */
+STATIC int
+xfs_exchrange_mappings(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ struct xfs_mount *mp = ip1->i_mount;
+ struct xfs_exchmaps_req req = {
+ .ip1 = ip1,
+ .ip2 = ip2,
+ .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
+ .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
+ .blockcount = XFS_B_TO_FSB(mp, fxr->length),
+ };
+ struct xfs_trans *tp;
+ unsigned int qretry;
+ bool retried = false;
+ int error;
+
+ trace_xfs_exchrange_mappings(fxr, ip1, ip2);
+
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ req.flags |= XFS_EXCHMAPS_SET_SIZES;
+ if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
+ req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
+
+ /*
+ * Round the request length up to the nearest file allocation unit.
+ * The prep function already checked that the request offsets and
+ * length in @fxr are safe to round up.
+ */
+ if (xfs_inode_has_bigrtalloc(ip2))
+ req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
+
+ error = xfs_exchrange_estimate(&req);
+ if (error)
+ return error;
+
+retry:
+ /* Allocate the transaction, lock the inodes, and join them. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
+ XFS_TRANS_RES_FDBLKS, &tp);
+ if (error)
+ return error;
+
+ xfs_exchrange_ilock(tp, ip1, ip2);
+
+ trace_xfs_exchrange_before(ip2, 2);
+ trace_xfs_exchrange_before(ip1, 1);
+
+ error = xfs_exchmaps_check_forks(mp, &req);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Reserve ourselves some quota if any of them are in enforcing mode.
+ * In theory we only need enough to satisfy the change in the number
+ * of blocks between the two ranges being remapped.
+ */
+ error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_exchrange_iunlock(ip1, ip2);
+ if (qretry & QRETRY_IP1)
+ xfs_blockgc_free_quota(ip1, 0);
+ if (qretry & QRETRY_IP2)
+ xfs_blockgc_free_quota(ip2, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_trans_cancel;
+
+ /* If we got this far on a dry run, all parameters are ok. */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
+ goto out_trans_cancel;
+
+ /* Update the mtime and ctime of both files. */
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
+ xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+ xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ xfs_exchange_mappings(tp, &req);
+
+ /*
+ * Force the log to persist metadata updates if the caller or the
+ * administrator requires this. The generic prep function already
+ * flushed the relevant parts of the page cache.
+ */
+ if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+
+ trace_xfs_exchrange_after(ip2, 2);
+ trace_xfs_exchrange_after(ip1, 1);
+
+ if (error)
+ goto out_unlock;
+
+ /*
+ * If the caller wanted us to exchange the contents of two complete
+ * files of unequal length, exchange the incore sizes now. This should
+ * be safe because we flushed both files' page caches, exchanged all
+ * the mappings, and updated the ondisk sizes.
+ */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ loff_t temp;
+
+ temp = i_size_read(VFS_I(ip2));
+ i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
+ i_size_write(VFS_I(ip1), temp);
+ }
+
+out_unlock:
+ xfs_exchrange_iunlock(ip1, ip2);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+/*
+ * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
+ * This part deals with struct file objects and byte ranges and does not deal
+ * with XFS-specific data structures such as xfs_inodes and block ranges. This
+ * separation may some day facilitate porting to another filesystem.
+ *
+ * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
+ * file1 with the same number of bytes starting at fxr.file2_offset in file2.
+ * Implementations must call xfs_exchange_range_prep to prepare the two
+ * files prior to taking locks; and they must update the inode change and mod
+ * times of both files as part of the metadata update. The timestamp update
+ * and freshness checks must be done atomically as part of the data exchange
+ * operation to ensure correctness of the freshness check.
+ * xfs_exchange_range_finish must be called after the operation completes
+ * successfully but before locks are dropped.
+ */
+
+/* Verify that we have security clearance to perform this operation. */
+static int
+xfs_exchange_range_verify_area(
+ struct xfs_exchrange *fxr)
+{
+ int ret;
+
+ ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
+ true);
+ if (ret)
+ return ret;
+
+ return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
+ true);
+}
+
+/*
+ * Performs necessary checks before doing a range exchange, having stabilized
+ * mutable inode attributes via i_rwsem.
+ */
+static inline int
+xfs_exchange_range_checks(
+ struct xfs_exchrange *fxr,
+ unsigned int alloc_unit)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ uint64_t allocmask = alloc_unit - 1;
+ int64_t test_len;
+ uint64_t blen;
+ loff_t size1, size2, tmp;
+ int error;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
+ return -EPERM;
+ if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
+ return -ETXTBSY;
+
+ size1 = i_size_read(inode1);
+ size2 = i_size_read(inode2);
+
+ /* Ranges cannot start after EOF. */
+ if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
+ return -EINVAL;
+
+ /*
+ * If the caller said to exchange to EOF, we set the length of the
+ * request large enough to cover everything to the end of both files.
+ */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
+ size2 - fxr->file2_offset);
+
+ error = xfs_exchange_range_verify_area(fxr);
+ if (error)
+ return error;
+ }
+
+ /*
+ * The start of both ranges must be aligned to the file allocation
+ * unit.
+ */
+ if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
+ !IS_ALIGNED(fxr->file2_offset, alloc_unit))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
+ check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
+ return -EINVAL;
+
+ /*
+ * We require both ranges to end within EOF, unless we're exchanging
+ * to EOF.
+ */
+ if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
+ (fxr->file1_offset + fxr->length > size1 ||
+ fxr->file2_offset + fxr->length > size2))
+ return -EINVAL;
+
+ /*
+ * Make sure we don't hit any file size limits. If we hit any size
+	 * limits such that test_len was adjusted, we abort the whole
+ * operation.
+ */
+ test_len = fxr->length;
+ error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
+ &test_len);
+ if (error)
+ return error;
+ error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
+ &test_len);
+ if (error)
+ return error;
+ if (test_len != fxr->length)
+ return -EINVAL;
+
+ /*
+ * If the user wanted us to exchange up to the infile's EOF, round up
+ * to the next allocation unit boundary for this check. Do the same
+ * for the outfile.
+ *
+ * Otherwise, reject the range length if it's not aligned to an
+ * allocation unit.
+ */
+ if (fxr->file1_offset + fxr->length == size1)
+ blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
+ else if (fxr->file2_offset + fxr->length == size2)
+ blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
+ else if (!IS_ALIGNED(fxr->length, alloc_unit))
+ return -EINVAL;
+ else
+ blen = fxr->length;
+
+ /* Don't allow overlapped exchanges within the same file. */
+ if (inode1 == inode2 &&
+ fxr->file2_offset + blen > fxr->file1_offset &&
+ fxr->file1_offset + blen > fxr->file2_offset)
+ return -EINVAL;
+
+ /*
+ * Ensure that we don't exchange a partial EOF block into the middle of
+ * another file.
+ */
+ if ((fxr->length & allocmask) == 0)
+ return 0;
+
+ blen = fxr->length;
+ if (fxr->file2_offset + blen < size2)
+ blen &= ~allocmask;
+
+ if (fxr->file1_offset + blen < size1)
+ blen &= ~allocmask;
+
+ return blen == fxr->length ? 0 : -EINVAL;
+}
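+
+A worked instance of the partial-EOF rule, assuming alloc_unit = 4096:
+let size1 = 20480, size2 = 10000, and a request to exchange 1808
+bytes at offset 8192 in both files. The length is unaligned, so the
+tail checks run: file2_offset + 1808 == size2, so file2's tail is its
+own EOF block and blen stays 1808; but file1_offset + 1808 < size1,
+so the partial block would land mid-file in file1, blen is rounded
+down to 0, and the request fails with -EINVAL.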
+
+/*
+ * Check that the two inodes are eligible for range exchanges, the ranges make
+ * sense, and then flush all dirty data. Caller must ensure that the inodes
+ * have been locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_prep(
+ struct xfs_exchrange *fxr,
+ unsigned int alloc_unit)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ bool same_inode = (inode1 == inode2);
+ int error;
+
+ /* Check that we don't violate system file offset limits. */
+ error = xfs_exchange_range_checks(fxr, alloc_unit);
+ if (error || fxr->length == 0)
+ return error;
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode1);
+ if (!same_inode)
+ inode_dio_wait(inode2);
+
+ error = filemap_write_and_wait_range(inode1->i_mapping,
+ fxr->file1_offset,
+ fxr->file1_offset + fxr->length - 1);
+ if (error)
+ return error;
+
+ error = filemap_write_and_wait_range(inode2->i_mapping,
+ fxr->file2_offset,
+ fxr->file2_offset + fxr->length - 1);
+ if (error)
+ return error;
+
+ /*
+ * If the files or inodes involved require synchronous writes, amend
+ * the request to force the filesystem to flush all data and metadata
+ * to disk after the operation completes.
+ */
+ if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
+ IS_SYNC(inode1) || IS_SYNC(inode2))
+ fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
+
+ return 0;
+}
+
+/*
+ * Finish a range exchange operation, if it was successful. Caller must ensure
+ * that the inodes are still locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_finish(
+ struct xfs_exchrange *fxr)
+{
+ int error;
+
+ error = file_remove_privs(fxr->file1);
+ if (error)
+ return error;
+ if (file_inode(fxr->file1) == file_inode(fxr->file2))
+ return 0;
+
+ return file_remove_privs(fxr->file2);
+}
+
+/*
+ * Check the alignment of an exchange request when the allocation unit size
+ * isn't a power of two. The generic file-level helpers use (fast)
+ * bitmask-based alignment checks, but here we have to use slow long division.
+ */
+static int
+xfs_exchrange_check_rtalign(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2,
+ unsigned int alloc_unit)
+{
+ uint64_t length = fxr->length;
+ uint64_t blen;
+ loff_t size1, size2;
+
+ size1 = i_size_read(VFS_I(ip1));
+ size2 = i_size_read(VFS_I(ip2));
+
+ /* The start of both ranges must be aligned to a rt extent. */
+ if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
+ !isaligned_64(fxr->file2_offset, alloc_unit))
+ return -EINVAL;
+
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ length = max_t(int64_t, size1 - fxr->file1_offset,
+ size2 - fxr->file2_offset);
+
+ /*
+ * If the user wanted us to exchange up to the infile's EOF, round up
+ * to the next rt extent boundary for this check. Do the same for the
+ * outfile.
+ *
+ * Otherwise, reject the range length if it's not rt extent aligned.
+ * We already confirmed the starting offsets' rt extent block
+ * alignment.
+ */
+ if (fxr->file1_offset + length == size1)
+ blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
+ else if (fxr->file2_offset + length == size2)
+ blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
+ else if (!isaligned_64(length, alloc_unit))
+ return -EINVAL;
+ else
+ blen = length;
+
+ /* Don't allow overlapped exchanges within the same file. */
+ if (ip1 == ip2 &&
+ fxr->file2_offset + blen > fxr->file1_offset &&
+ fxr->file1_offset + blen > fxr->file2_offset)
+ return -EINVAL;
+
+ /*
+ * Ensure that we don't exchange a partial EOF rt extent into the
+ * middle of another file.
+ */
+ if (isaligned_64(length, alloc_unit))
+ return 0;
+
+ blen = length;
+ if (fxr->file2_offset + length < size2)
+ blen = rounddown_64(blen, alloc_unit);
+
+ if (fxr->file1_offset + blen < size1)
+ blen = rounddown_64(blen, alloc_unit);
+
+ return blen == length ? 0 : -EINVAL;
+}
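+
+The isaligned_64()/roundup_64()/rounddown_64() helpers used above are
+assumed to be division-based, since a realtime allocation unit need
+not be a power of two; a sketch of the presumed shape:
+
+	/* Assumed shape; the real helpers live in XFS's linux headers. */
+	static inline bool
+	isaligned_64(uint64_t x, uint32_t y)
+	{
+		return do_div(x, y) == 0;	/* do_div yields the remainder */
+	}
+
+	static inline uint64_t
+	roundup_64(uint64_t x, uint32_t y)
+	{
+		x += y - 1;
+		do_div(x, y);		/* x now holds the quotient */
+		return x * y;
+	}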
+
+/* Prepare two files to have their data exchanged. */
+STATIC int
+xfs_exchrange_prep(
+ struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ struct xfs_mount *mp = ip2->i_mount;
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
+ int error;
+
+ trace_xfs_exchrange_prep(fxr, ip1, ip2);
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
+ return -EINVAL;
+
+ /* Check non-power of two alignment issues, if necessary. */
+ if (!is_power_of_2(alloc_unit)) {
+ error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
+ if (error)
+ return error;
+
+ /*
+ * Do the generic file-level checks with the regular block
+ * alignment.
+ */
+ alloc_unit = mp->m_sb.sb_blocksize;
+ }
+
+ error = xfs_exchange_range_prep(fxr, alloc_unit);
+ if (error || fxr->length == 0)
+ return error;
+
+ /* Attach dquots to both inodes before changing block maps. */
+ error = xfs_qm_dqattach(ip2);
+ if (error)
+ return error;
+ error = xfs_qm_dqattach(ip1);
+ if (error)
+ return error;
+
+ trace_xfs_exchrange_flush(fxr, ip1, ip2);
+
+ /* Flush the relevant ranges of both files. */
+ error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
+ if (error)
+ return error;
+ error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
+ if (error)
+ return error;
+
+ /*
+ * Cancel CoW fork preallocations for the ranges of both files. The
+ * prep function should have flushed all the dirty data, so the only
+ * CoW mappings remaining should be speculative.
+ */
+ if (xfs_inode_has_cow_data(ip1)) {
+ error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
+ fxr->length, true);
+ if (error)
+ return error;
+ }
+
+ if (xfs_inode_has_cow_data(ip2)) {
+ error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
+ fxr->length, true);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Exchange contents of files. This is the binding between the generic
+ * file-level concepts and the XFS inode-specific implementation.
+ */
+STATIC int
+xfs_exchrange_contents(
+ struct xfs_exchrange *fxr)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ struct xfs_inode *ip1 = XFS_I(inode1);
+ struct xfs_inode *ip2 = XFS_I(inode2);
+ struct xfs_mount *mp = ip1->i_mount;
+ int error;
+
+ if (!xfs_has_exchange_range(mp))
+ return -EOPNOTSUPP;
+
+ if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+ XFS_EXCHANGE_RANGE_PRIV_FLAGS))
+ return -EINVAL;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* Lock both files against IO */
+ error = xfs_ilock2_io_mmap(ip1, ip2);
+ if (error)
+ goto out_err;
+
+ /* Prepare and then exchange file contents. */
+ error = xfs_exchrange_prep(fxr, ip1, ip2);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_exchrange_mappings(fxr, ip1, ip2);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Finish the exchange by removing special file privileges like any
+ * other file write would do. This may involve turning on support for
+ * logged xattrs if either file has security capabilities.
+ */
+ error = xfs_exchange_range_finish(fxr);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock2_io_mmap(ip1, ip2);
+out_err:
+ if (error)
+ trace_xfs_exchrange_error(ip2, error, _RET_IP_);
+ return error;
+}
+
+/* Exchange parts of two files. */
+static int
+xfs_exchange_range(
+ struct xfs_exchrange *fxr)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ int ret;
+
+ BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
+ XFS_EXCHANGE_RANGE_PRIV_FLAGS);
+
+ /* Both files must be on the same mount/filesystem. */
+ if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
+ return -EXDEV;
+
+ if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+
+ /* Userspace requests only honored for regular files. */
+ if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+ return -EINVAL;
+
+ /* Both files must be opened for read and write. */
+ if (!(fxr->file1->f_mode & FMODE_READ) ||
+ !(fxr->file1->f_mode & FMODE_WRITE) ||
+ !(fxr->file2->f_mode & FMODE_READ) ||
+ !(fxr->file2->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ /* Neither file can be opened append-only. */
+ if ((fxr->file1->f_flags & O_APPEND) ||
+ (fxr->file2->f_flags & O_APPEND))
+ return -EBADF;
+
+ /*
+ * If we're not exchanging to EOF, we can check the areas before
+ * stabilizing both files' i_size.
+ */
+ if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
+ ret = xfs_exchange_range_verify_area(fxr);
+ if (ret)
+ return ret;
+ }
+
+ /* Update cmtime if the fd/inode don't forbid it. */
+ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
+ fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
+ if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
+ fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
+
+ file_start_write(fxr->file2);
+ ret = xfs_exchrange_contents(fxr);
+ file_end_write(fxr->file2);
+ if (ret)
+ return ret;
+
+ fsnotify_modify(fxr->file1);
+ if (fxr->file2 != fxr->file1)
+ fsnotify_modify(fxr->file2);
+ return 0;
+}
+
+/* Collect exchange-range arguments from userspace. */
+long
+xfs_ioc_exchange_range(
+ struct file *file,
+ struct xfs_exchange_range __user *argp)
+{
+ struct xfs_exchrange fxr = {
+ .file2 = file,
+ };
+ struct xfs_exchange_range args;
+ struct fd file1;
+ int error;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
+ return -EINVAL;
+ if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+
+ fxr.file1_offset = args.file1_offset;
+ fxr.file2_offset = args.file2_offset;
+ fxr.length = args.length;
+ fxr.flags = args.flags;
+
+ file1 = fdget(args.file1_fd);
+ if (!file1.file)
+ return -EBADF;
+ fxr.file1 = file1.file;
+
+ error = xfs_exchange_range(&fxr);
+ fdput(file1);
+ return error;
+}
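+
+For completeness, a hedged userspace sketch of driving the new ioctl;
+the XFS_IOC_EXCHANGE_RANGE number and the struct layout are assumed
+to come from the UAPI header added elsewhere in this series:
+
+	#include <fcntl.h>
+	#include <unistd.h>
+	#include <sys/ioctl.h>
+	#include <xfs/xfs_fs.h>	/* assumed UAPI home of the definitions */
+
+	/* Swap the full contents of two files; error handling trimmed. */
+	int
+	exchange_whole_files(int fd1, int fd2)
+	{
+		struct xfs_exchange_range	fxr = { 0 };
+
+		fxr.file1_fd = fd1;
+		fxr.flags = XFS_EXCHANGE_RANGE_TO_EOF;
+
+		/* file2 is the descriptor the ioctl is issued against */
+		return ioctl(fd2, XFS_IOC_EXCHANGE_RANGE, &fxr);
+	}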
diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h
new file mode 100644
index 000000000000..039abcca546e
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHRANGE_H__
+#define __XFS_EXCHRANGE_H__
+
+/* Update the mtime/cmtime of file1 and file2 */
+#define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63)
+#define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62)
+
+#define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
+ __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+
+struct xfs_exchrange {
+ struct file *file1;
+ struct file *file2;
+
+ loff_t file1_offset;
+ loff_t file2_offset;
+ u64 length;
+
+ u64 flags; /* XFS_EXCHANGE_RANGE flags */
+};
+
+long xfs_ioc_exchange_range(struct file *file,
+ struct xfs_exchange_range __user *argp);
+
+struct xfs_exchmaps_req;
+
+void xfs_exchrange_ilock(struct xfs_trans *tp, struct xfs_inode *ip1,
+ struct xfs_inode *ip2);
+void xfs_exchrange_iunlock(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
+int xfs_exchrange_estimate(struct xfs_exchmaps_req *req);
+
+#endif /* __XFS_EXCHRANGE_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 7cd09c3a82cb..201489d3de08 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -102,7 +102,7 @@ xfs_fs_encode_fh(
return fileid_type;
}
-STATIC struct inode *
+struct inode *
xfs_nfs_get_inode(
struct super_block *sb,
u64 ino,
@@ -160,7 +160,7 @@ xfs_nfs_get_inode(
}
}
- if (VFS_I(ip)->i_generation != generation) {
+ if (VFS_I(ip)->i_generation != generation || IS_PRIVATE(VFS_I(ip))) {
xfs_irele(ip);
return ERR_PTR(-ESTALE);
}
diff --git a/fs/xfs/xfs_export.h b/fs/xfs/xfs_export.h
index 64471a3ddb04..3cd85e8901a5 100644
--- a/fs/xfs/xfs_export.h
+++ b/fs/xfs/xfs_export.h
@@ -57,4 +57,6 @@ struct xfs_fid64 {
/* This flag goes on the wire. Don't play with it. */
#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
+struct inode *xfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gen);
+
#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 56cfa1498571..a73e7c73b664 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -518,35 +518,26 @@ fail:
goto out;
}
-STATIC void
+static bool
xfs_extent_busy_clear_one(
- struct xfs_mount *mp,
struct xfs_perag *pag,
- struct xfs_extent_busy *busyp)
+ struct xfs_extent_busy *busyp,
+ bool do_discard)
{
if (busyp->length) {
- trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
- busyp->length);
+ if (do_discard &&
+ !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
+ busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
+ return false;
+ }
+ trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno,
+ busyp->bno, busyp->length);
rb_erase(&busyp->rb_node, &pag->pagb_tree);
}
list_del_init(&busyp->list);
kfree(busyp);
-}
-
-static void
-xfs_extent_busy_put_pag(
- struct xfs_perag *pag,
- bool wakeup)
- __releases(pag->pagb_lock)
-{
- if (wakeup) {
- pag->pagb_gen++;
- wake_up_all(&pag->pagb_wait);
- }
-
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
+ return true;
}
/*
@@ -560,32 +551,33 @@ xfs_extent_busy_clear(
struct list_head *list,
bool do_discard)
{
- struct xfs_extent_busy *busyp, *n;
- struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno = NULLAGNUMBER;
- bool wakeup = false;
-
- list_for_each_entry_safe(busyp, n, list, list) {
- if (busyp->agno != agno) {
- if (pag)
- xfs_extent_busy_put_pag(pag, wakeup);
- agno = busyp->agno;
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
- wakeup = false;
- }
+ struct xfs_extent_busy *busyp, *next;
- if (do_discard && busyp->length &&
- !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
- busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
- } else {
- xfs_extent_busy_clear_one(mp, pag, busyp);
- wakeup = true;
- }
- }
+ busyp = list_first_entry_or_null(list, typeof(*busyp), list);
+ if (!busyp)
+ return;
- if (pag)
- xfs_extent_busy_put_pag(pag, wakeup);
+ do {
+ bool wakeup = false;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ do {
+ next = list_next_entry(busyp, list);
+ if (xfs_extent_busy_clear_one(pag, busyp, do_discard))
+ wakeup = true;
+ busyp = next;
+ } while (!list_entry_is_head(busyp, list, list) &&
+ busyp->agno == pag->pag_agno);
+
+ if (wakeup) {
+ pag->pagb_gen++;
+ wake_up_all(&pag->pagb_wait);
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ } while (!list_entry_is_head(busyp, list, list));
}
/*
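
The rewritten xfs_extent_busy_clear() takes each perag lock once per run of same-AG entries instead of tracking the current AG inside the walk. The shape of that loop, as a standalone sketch over a list grouped by key:

```c
struct item {
	struct item	*next;
	int		group;
};

/* Sketch: process a list whose entries are grouped by key, taking the
 * per-group lock once per run of equal keys -- the same structure as
 * the reworked xfs_extent_busy_clear() above. */
static void process_grouped(struct item *head,
			    void (*lock)(int), void (*unlock)(int),
			    void (*handle)(struct item *))
{
	struct item *it = head;

	while (it) {
		int group = it->group;

		lock(group);
		do {
			struct item *next = it->next;

			handle(it);		/* may free 'it' */
			it = next;
		} while (it && it->group == group);
		unlock(group);
	}
}
```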
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2ce302b4885f..b240ea5241dc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -24,6 +24,7 @@
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_file.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -38,33 +39,19 @@ static const struct vm_operations_struct xfs_file_vm_ops;
* Decide if the given file range is aligned to the size of the fundamental
* allocation unit for the file.
*/
-static bool
+bool
xfs_is_falloc_aligned(
struct xfs_inode *ip,
loff_t pos,
long long int len)
{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t mask;
-
- if (XFS_IS_REALTIME_INODE(ip)) {
- if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
- u64 rextbytes;
- u32 mod;
-
- rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
- div_u64_rem(pos, rextbytes, &mod);
- if (mod)
- return false;
- div_u64_rem(len, rextbytes, &mod);
- return mod == 0;
- }
- mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
- } else {
- mask = mp->m_sb.sb_blocksize - 1;
- }
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
+
+ if (!is_power_of_2(alloc_unit))
+ return isaligned_64(pos, alloc_unit) &&
+ isaligned_64(len, alloc_unit);
- return !((pos | len) & mask);
+ return !((pos | len) & (alloc_unit - 1));
}
/*
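
The helper now covers both the realtime and plain-blocksize cases through xfs_inode_alloc_unitsize(), keeping the cheap mask test for power-of-two units and falling back to division otherwise, since the mask trick is only valid for powers of two. In plain C:

```c
#include <stdbool.h>
#include <stdint.h>

/* Sketch of the check in xfs_is_falloc_aligned(): only a power-of-two
 * unit admits the single-mask test. */
static bool range_aligned(uint64_t pos, uint64_t len, uint64_t unit)
{
	if (unit & (unit - 1))		/* not a power of two */
		return pos % unit == 0 && len % unit == 0;
	return !((pos | len) & (unit - 1));
}
```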
@@ -861,67 +848,6 @@ xfs_file_write_iter(
return xfs_file_buffered_write(iocb, from);
}
-static void
-xfs_wait_dax_page(
- struct inode *inode)
-{
- struct xfs_inode *ip = XFS_I(inode);
-
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
- schedule();
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-}
-
-int
-xfs_break_dax_layouts(
- struct inode *inode,
- bool *retry)
-{
- struct page *page;
-
- xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
-
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
- return 0;
-
- *retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, xfs_wait_dax_page(inode));
-}
-
-int
-xfs_break_layouts(
- struct inode *inode,
- uint *iolock,
- enum layout_break_reason reason)
-{
- bool retry;
- int error;
-
- xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
-
- do {
- retry = false;
- switch (reason) {
- case BREAK_UNMAP:
- error = xfs_break_dax_layouts(inode, &retry);
- if (error || retry)
- break;
- fallthrough;
- case BREAK_WRITE:
- error = xfs_break_leased_layouts(inode, iolock, &retry);
- break;
- default:
- WARN_ON_ONCE(1);
- error = -EINVAL;
- }
- } while (error == 0 && retry);
-
- return error;
-}
-
/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h
new file mode 100644
index 000000000000..2ad91f755caf
--- /dev/null
+++ b/fs/xfs/xfs_file.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FILE_H__
+#define __XFS_FILE_H__
+
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+
+bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos,
+ long long int len);
+
+#endif /* __XFS_FILE_H__ */
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index de59eec74765..85dbb46452ca 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -533,7 +533,7 @@ xfs_getfsmap_rtdev_rtbitmap(
trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
/*
* Set up query parameters to return free rtextents covering the range
@@ -557,7 +557,7 @@ xfs_getfsmap_rtdev_rtbitmap(
if (error)
goto err;
err:
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
return error;
}
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 83f708f62ed9..c211ea2b63c4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -213,10 +213,8 @@ xfs_growfs_data_private(
struct xfs_perag *pag;
pag = xfs_perag_get(mp, id.agno);
- error = xfs_ag_resv_free(pag);
+ xfs_ag_resv_free(pag);
xfs_perag_put(pag);
- if (error)
- return error;
}
/*
* Reserve AG metadata blocks. ENOSPC here does not mean there
@@ -385,14 +383,14 @@ xfs_reserve_blocks(
*/
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
- if (lcounter > 0) { /* release unused blocks */
+ if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
}
mp->m_resblks = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
- error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ xfs_add_fdblocks(mp, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
@@ -428,9 +426,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
- error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
if (!error)
- xfs_mod_fdblocks(mp, fdblks_delta, 0);
+ xfs_add_fdblocks(mp, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
@@ -556,24 +554,13 @@ xfs_fs_reserve_ag_blocks(
/*
* Free space reserved for per-AG metadata.
*/
-int
+void
xfs_fs_unreserve_ag_blocks(
struct xfs_mount *mp)
{
xfs_agnumber_t agno;
struct xfs_perag *pag;
- int error = 0;
- int err2;
- for_each_perag(mp, agno, pag) {
- err2 = xfs_ag_resv_free(pag);
- if (err2 && !error)
- error = err2;
- }
-
- if (error)
- xfs_warn(mp,
- "Error %d freeing per-AG metadata reserve pool.", error);
-
- return error;
+ for_each_perag(mp, agno, pag)
+ xfs_ag_resv_free(pag);
}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 44457b0a0593..3e2f73bcf831 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -12,6 +12,6 @@ int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
-int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+void xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
new file mode 100644
index 000000000000..c8785ed59543
--- /dev/null
+++ b/fs/xfs/xfs_handle.c
@@ -0,0 +1,952 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2022-2024 Oracle.
+ * All rights reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
+#include "xfs_parent.h"
+#include "xfs_da_btree.h"
+#include "xfs_handle.h"
+#include "xfs_health.h"
+#include "xfs_icache.h"
+#include "xfs_export.h"
+#include "xfs_xattr.h"
+#include "xfs_acl.h"
+
+#include <linux/namei.h>
+
+static inline size_t
+xfs_filehandle_fid_len(void)
+{
+ struct xfs_handle *handle = NULL;
+
+ return sizeof(struct xfs_fid) - sizeof(handle->ha_fid.fid_len);
+}
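
The NULL pointer above is never dereferenced: sizeof is an unevaluated context, so it only inspects the member's type. A standalone demonstration of the idiom:

```c
#include <stdio.h>

struct fid {
	unsigned short	fid_len;
	unsigned short	fid_pad;
};

int main(void)
{
	struct fid *f = NULL;

	/* safe: sizeof never evaluates its operand */
	printf("%zu\n", sizeof(f->fid_len));
	return 0;
}
```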
+
+static inline size_t
+xfs_filehandle_init(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ uint32_t gen,
+ struct xfs_handle *handle)
+{
+ memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid));
+
+ handle->ha_fid.fid_len = xfs_filehandle_fid_len();
+ handle->ha_fid.fid_pad = 0;
+ handle->ha_fid.fid_gen = gen;
+ handle->ha_fid.fid_ino = ino;
+
+ return sizeof(struct xfs_handle);
+}
+
+static inline size_t
+xfs_fshandle_init(
+ struct xfs_mount *mp,
+ struct xfs_handle *handle)
+{
+ memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid));
+ memset(&handle->ha_fid, 0, sizeof(handle->ha_fid));
+
+ return sizeof(struct xfs_fsid);
+}
+
+/*
+ * xfs_find_handle maps a userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ * returns an fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ * returns a full handle for an fd opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ * returns a full handle for a path
+ */
+int
+xfs_find_handle(
+ unsigned int cmd,
+ xfs_fsop_handlereq_t *hreq)
+{
+ int hsize;
+ xfs_handle_t handle;
+ struct inode *inode;
+ struct fd f = {NULL};
+ struct path path;
+ int error;
+ struct xfs_inode *ip;
+
+ if (cmd == XFS_IOC_FD_TO_HANDLE) {
+ f = fdget(hreq->fd);
+ if (!f.file)
+ return -EBADF;
+ inode = file_inode(f.file);
+ } else {
+ error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
+ if (error)
+ return error;
+ inode = d_inode(path.dentry);
+ }
+ ip = XFS_I(inode);
+
+ /*
+ * We can only generate handles for inodes residing on an XFS filesystem,
+ * and only for regular files, directories or symbolic links.
+ */
+ error = -EINVAL;
+ if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+ goto out_put;
+
+ error = -EBADF;
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ goto out_put;
+
+ memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ if (cmd == XFS_IOC_PATH_TO_FSHANDLE)
+ hsize = xfs_fshandle_init(ip->i_mount, &handle);
+ else
+ hsize = xfs_filehandle_init(ip->i_mount, ip->i_ino,
+ inode->i_generation, &handle);
+
+ error = -EFAULT;
+ if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+ copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+ goto out_put;
+
+ error = 0;
+
+ out_put:
+ if (cmd == XFS_IOC_FD_TO_HANDLE)
+ fdput(f);
+ else
+ path_put(&path);
+ return error;
+}
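
A hypothetical userspace sketch of the FD-to-handle path, assuming the uapi xfs_fsop_handlereq_t layout matches the fields consumed above:

```c
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>

/* Sketch: fetch a full handle for an already-open fd. Any fd on the
 * same XFS filesystem can carry the ioctl; here we reuse the target. */
static int fd_to_handle(int fd, xfs_handle_t *out)
{
	xfs_fsop_handlereq_t hreq;
	__u32 olen = sizeof(*out);

	memset(&hreq, 0, sizeof(hreq));
	hreq.fd = fd;
	hreq.ohandle = out;
	hreq.ohandlen = &olen;

	return ioctl(fd, XFS_IOC_FD_TO_HANDLE, &hreq);
}
```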
+
+/*
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
+ */
+STATIC int
+xfs_handle_acceptable(
+ void *context,
+ struct dentry *dentry)
+{
+ return 1;
+}
+
+/* Convert handle already copied to kernel space into a dentry. */
+static struct dentry *
+xfs_khandle_to_dentry(
+ struct file *file,
+ struct xfs_handle *handle)
+{
+ struct xfs_fid64 fid = {
+ .ino = handle->ha_fid.fid_ino,
+ .gen = handle->ha_fid.fid_gen,
+ };
+
+ /*
+ * Only allow handle opens under a directory.
+ */
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return ERR_PTR(-ENOTDIR);
+
+ if (handle->ha_fid.fid_len != xfs_filehandle_fid_len())
+ return ERR_PTR(-EINVAL);
+
+ return exportfs_decode_fh(file->f_path.mnt, (struct fid *)&fid, 3,
+ FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+ xfs_handle_acceptable, NULL);
+}
+
+/* Convert handle already copied to kernel space into an xfs_inode. */
+static struct xfs_inode *
+xfs_khandle_to_inode(
+ struct file *file,
+ struct xfs_handle *handle)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode))
+ return ERR_PTR(-ENOTDIR);
+
+ if (handle->ha_fid.fid_len != xfs_filehandle_fid_len())
+ return ERR_PTR(-EINVAL);
+
+ inode = xfs_nfs_get_inode(mp->m_super, handle->ha_fid.fid_ino,
+ handle->ha_fid.fid_gen);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return XFS_I(inode);
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+ struct file *parfilp,
+ void __user *uhandle,
+ u32 hlen)
+{
+ xfs_handle_t handle;
+
+ if (hlen != sizeof(xfs_handle_t))
+ return ERR_PTR(-EINVAL);
+ if (copy_from_user(&handle, uhandle, hlen))
+ return ERR_PTR(-EFAULT);
+
+ return xfs_khandle_to_dentry(parfilp, &handle);
+}
+
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
+}
+
+int
+xfs_open_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ const struct cred *cred = current_cred();
+ int error;
+ int fd;
+ int permflag;
+ struct file *filp;
+ struct inode *inode;
+ struct dentry *dentry;
+ fmode_t fmode;
+ struct path path;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ inode = d_inode(dentry);
+
+ /* Restrict xfs_open_by_handle to directories & regular files. */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+#if BITS_PER_LONG != 32
+ hreq->oflags |= O_LARGEFILE;
+#endif
+
+ permflag = hreq->oflags;
+ fmode = OPEN_FMODE(permflag);
+ if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
+ (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+ error = -EPERM;
+ goto out_dput;
+ }
+
+ /* Can't write directories. */
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
+ error = -EISDIR;
+ goto out_dput;
+ }
+
+ fd = get_unused_fd_flags(0);
+ if (fd < 0) {
+ error = fd;
+ goto out_dput;
+ }
+
+ path.mnt = parfilp->f_path.mnt;
+ path.dentry = dentry;
+ filp = dentry_open(&path, hreq->oflags, cred);
+ dput(dentry);
+ if (IS_ERR(filp)) {
+ put_unused_fd(fd);
+ return PTR_ERR(filp);
+ }
+
+ if (S_ISREG(inode->i_mode)) {
+ filp->f_flags |= O_NOATIME;
+ filp->f_mode |= FMODE_NOCMTIME;
+ }
+
+ fd_install(fd, filp);
+ return fd;
+
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+int
+xfs_readlink_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ struct dentry *dentry;
+ __u32 olen;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ /* Restrict this handle operation to symlinks only. */
+ if (!d_is_symlink(dentry)) {
+ error = -EINVAL;
+ goto out_dput;
+ }
+
+ if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
+ error = -EFAULT;
+ goto out_dput;
+ }
+
+ error = vfs_readlink(dentry, hreq->ohandle, olen);
+
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later;
+ * we may be reading them directly out of a user buffer.
+ */
+static void
+xfs_ioc_attr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ void *value,
+ int valuelen)
+{
+ struct xfs_attrlist *alist = context->buffer;
+ struct xfs_attrlist_ent *aep;
+ int arraytop;
+
+ ASSERT(!context->seen_enough);
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+
+ /* decrement by the actual bytes used by the attr */
+ context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
+ namelen + 1, sizeof(uint32_t));
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return;
+ }
+
+ aep = context->buffer + context->firstu;
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+}
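
The callback packs the output buffer from both ends: the fixed-size offset array grows up from the header while the variable-size entries grow down from the top, and the list is full when the two regions would meet. A self-contained sketch of that pattern:

```c
#include <stdint.h>
#include <string.h>

struct packed_list {
	uint32_t	count;
	uint32_t	firstu;		/* lowest byte used by entries */
	uint32_t	offset[];	/* grows upward */
};

/* Sketch: returns 0 on success, -1 when the two regions would meet.
 * The caller initializes count = 0 and firstu = the buffer size. */
static int pack_name(struct packed_list *p, const char *name,
		     uint32_t namelen)
{
	uint32_t arraytop = sizeof(*p) +
			(p->count + 1) * sizeof(p->offset[0]);
	uint32_t used = (namelen + 1 + 3) & ~(uint32_t)3; /* pad to 4 */
	char *dst;

	if (p->firstu < arraytop + used)
		return -1;		/* buffer full */

	p->firstu -= used;
	dst = (char *)p + p->firstu;
	memcpy(dst, name, namelen);
	dst[namelen] = 0;
	p->offset[p->count++] = p->firstu;
	return 0;
}
```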
+
+static unsigned int
+xfs_attr_filter(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_ROOT)
+ return XFS_ATTR_ROOT;
+ if (ioc_flags & XFS_IOC_ATTR_SECURE)
+ return XFS_ATTR_SECURE;
+ return 0;
+}
+
+static inline enum xfs_attr_update
+xfs_xattr_flags(
+ u32 ioc_flags,
+ void *value)
+{
+ if (!value)
+ return XFS_ATTRUPDATE_REMOVE;
+ if (ioc_flags & XFS_IOC_ATTR_CREATE)
+ return XFS_ATTRUPDATE_CREATE;
+ if (ioc_flags & XFS_IOC_ATTR_REPLACE)
+ return XFS_ATTRUPDATE_REPLACE;
+ return XFS_ATTRUPDATE_UPSERT;
+}
+
+int
+xfs_ioc_attr_list(
+ struct xfs_inode *dp,
+ void __user *ubuf,
+ size_t bufsize,
+ int flags,
+ struct xfs_attrlist_cursor __user *ucursor)
+{
+ struct xfs_attr_list_context context = { };
+ struct xfs_attrlist *alist;
+ void *buffer;
+ int error;
+
+ if (bufsize < sizeof(struct xfs_attrlist) ||
+ bufsize > XFS_XATTR_LIST_MAX)
+ return -EINVAL;
+
+ /*
+ * Reject all flags except the namespace selectors.
+ */
+ if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+ if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ /*
+ * Validate the cursor.
+ */
+ if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
+ return -EFAULT;
+ if (context.cursor.pad1 || context.cursor.pad2)
+ return -EINVAL;
+ if (!context.cursor.initted &&
+ (context.cursor.hashval || context.cursor.blkno ||
+ context.cursor.offset))
+ return -EINVAL;
+
+ buffer = kvzalloc(bufsize, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ /*
+ * Initialize the output buffer.
+ */
+ context.dp = dp;
+ context.resynch = 1;
+ context.attr_filter = xfs_attr_filter(flags);
+ context.buffer = buffer;
+ context.bufsize = round_down(bufsize, sizeof(uint32_t));
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_ioc_attr_put_listent;
+
+ alist = context.buffer;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list(&context);
+ if (error)
+ goto out_free;
+
+ if (copy_to_user(ubuf, buffer, bufsize) ||
+ copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
+ error = -EFAULT;
+out_free:
+ kvfree(buffer);
+ return error;
+}
+
+int
+xfs_attrlist_by_handle(
+ struct file *parfilp,
+ struct xfs_fsop_attrlist_handlereq __user *p)
+{
+ struct xfs_fsop_attrlist_handlereq al_hreq;
+ struct dentry *dentry;
+ int error = -ENOMEM;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
+ return -EFAULT;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
+ al_hreq.buflen, al_hreq.flags, &p->pos);
+ dput(dentry);
+ return error;
+}
+
+static int
+xfs_attrmulti_attr_get(
+ struct inode *inode,
+ unsigned char *name,
+ unsigned char __user *ubuf,
+ uint32_t *len,
+ uint32_t flags)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .name = name,
+ .namelen = strlen(name),
+ .valuelen = *len,
+ };
+ int error;
+
+ if (*len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+
+ error = xfs_attr_get(&args);
+ if (error)
+ goto out_kfree;
+
+ *len = args.valuelen;
+ if (copy_to_user(ubuf, args.value, args.valuelen))
+ error = -EFAULT;
+
+out_kfree:
+ kvfree(args.value);
+ return error;
+}
+
+static int
+xfs_attrmulti_attr_set(
+ struct inode *inode,
+ unsigned char *name,
+ const unsigned char __user *ubuf,
+ uint32_t len,
+ uint32_t flags)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .name = name,
+ .namelen = strlen(name),
+ };
+ int error;
+
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+
+ if (ubuf) {
+ if (len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+ args.value = memdup_user(ubuf, len);
+ if (IS_ERR(args.value))
+ return PTR_ERR(args.value);
+ args.valuelen = len;
+ }
+
+ error = xfs_attr_change(&args, xfs_xattr_flags(flags, args.value));
+ if (!error && (flags & XFS_IOC_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
+ kfree(args.value);
+ return error;
+}
+
+int
+xfs_ioc_attrmulti_one(
+ struct file *parfilp,
+ struct inode *inode,
+ uint32_t opcode,
+ void __user *uname,
+ void __user *value,
+ uint32_t *len,
+ uint32_t flags)
+{
+ unsigned char *name;
+ int error;
+
+ if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ name = strndup_user(uname, MAXNAMELEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ switch (opcode) {
+ case ATTR_OP_GET:
+ error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
+ break;
+ case ATTR_OP_REMOVE:
+ value = NULL;
+ *len = 0;
+ fallthrough;
+ case ATTR_OP_SET:
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ break;
+ error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
+ mnt_drop_write_file(parfilp);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ kfree(name);
+ return error;
+}
+
+int
+xfs_attrmulti_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ xfs_attr_multiop_t *ops;
+ xfs_fsop_attrmulti_handlereq_t am_hreq;
+ struct dentry *dentry;
+ unsigned int i, size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
+ return -EFAULT;
+
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+ return -E2BIG;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = -E2BIG;
+ size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
+ if (!size || size > 16 * PAGE_SIZE)
+ goto out_dput;
+
+ ops = memdup_user(am_hreq.ops, size);
+ if (IS_ERR(ops)) {
+ error = PTR_ERR(ops);
+ goto out_dput;
+ }
+
+ error = 0;
+ for (i = 0; i < am_hreq.opcount; i++) {
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
+ ops[i].am_attrname, ops[i].am_attrvalue,
+ &ops[i].am_length, ops[i].am_flags);
+ }
+
+ if (copy_to_user(am_hreq.ops, ops, size))
+ error = -EFAULT;
+
+ kfree(ops);
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+struct xfs_getparents_ctx {
+ struct xfs_attr_list_context context;
+ struct xfs_getparents_by_handle gph;
+
+ /* File to target */
+ struct xfs_inode *ip;
+
+ /* Internal buffer where we format records */
+ void *krecords;
+
+ /* Last record filled out */
+ struct xfs_getparents_rec *lastrec;
+
+ unsigned int count;
+};
+
+static inline unsigned int
+xfs_getparents_rec_sizeof(
+ unsigned int namelen)
+{
+ return round_up(sizeof(struct xfs_getparents_rec) + namelen + 1,
+ sizeof(uint64_t));
+}
+
+static void
+xfs_getparents_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ void *value,
+ int valuelen)
+{
+ struct xfs_getparents_ctx *gpx =
+ container_of(context, struct xfs_getparents_ctx, context);
+ struct xfs_inode *ip = context->dp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_getparents *gp = &gpx->gph.gph_request;
+ struct xfs_getparents_rec *gpr = gpx->krecords + context->firstu;
+ unsigned short reclen =
+ xfs_getparents_rec_sizeof(namelen);
+ xfs_ino_t ino;
+ uint32_t gen;
+ int error;
+
+ if (!(flags & XFS_ATTR_PARENT))
+ return;
+
+ error = xfs_parent_from_attr(mp, flags, name, namelen, value, valuelen,
+ &ino, &gen);
+ if (error) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_PARENT);
+ context->seen_enough = -EFSCORRUPTED;
+ return;
+ }
+
+ /*
+ * We found a parent pointer, but we've filled up the buffer. Signal
+ * to the caller that we did /not/ reach the end of the parent pointer
+ * recordset.
+ */
+ if (context->firstu > context->bufsize - reclen) {
+ context->seen_enough = 1;
+ return;
+ }
+
+ /* Format the parent pointer directly into the caller buffer. */
+ gpr->gpr_reclen = reclen;
+ xfs_filehandle_init(mp, ino, gen, &gpr->gpr_parent);
+ memcpy(gpr->gpr_name, name, namelen);
+ gpr->gpr_name[namelen] = 0;
+
+ trace_xfs_getparents_put_listent(ip, gp, context, gpr);
+
+ context->firstu += reclen;
+ gpx->count++;
+ gpx->lastrec = gpr;
+}
+
+/* Expand the last record to fill the rest of the caller's buffer. */
+static inline void
+xfs_getparents_expand_lastrec(
+ struct xfs_getparents_ctx *gpx)
+{
+ struct xfs_getparents *gp = &gpx->gph.gph_request;
+ struct xfs_getparents_rec *gpr = gpx->lastrec;
+
+ if (!gpx->lastrec)
+ gpr = gpx->krecords;
+
+ gpr->gpr_reclen = gp->gp_bufsize - ((void *)gpr - gpx->krecords);
+
+ trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr);
+}
+
+static inline void __user *u64_to_uptr(u64 val)
+{
+ return (void __user *)(uintptr_t)val;
+}
+
+/* Retrieve the parent pointers for a given inode. */
+STATIC int
+xfs_getparents(
+ struct xfs_getparents_ctx *gpx)
+{
+ struct xfs_getparents *gp = &gpx->gph.gph_request;
+ struct xfs_inode *ip = gpx->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ size_t bufsize;
+ int error;
+
+ /* Check size of buffer requested by user */
+ if (gp->gp_bufsize > XFS_XATTR_LIST_MAX)
+ return -ENOMEM;
+ if (gp->gp_bufsize < xfs_getparents_rec_sizeof(1))
+ return -EINVAL;
+
+ if (gp->gp_iflags & ~XFS_GETPARENTS_IFLAGS_ALL)
+ return -EINVAL;
+ if (gp->gp_reserved)
+ return -EINVAL;
+
+ bufsize = round_down(gp->gp_bufsize, sizeof(uint64_t));
+ gpx->krecords = kvzalloc(bufsize, GFP_KERNEL);
+ if (!gpx->krecords) {
+ bufsize = min(bufsize, PAGE_SIZE);
+ gpx->krecords = kvzalloc(bufsize, GFP_KERNEL);
+ if (!gpx->krecords)
+ return -ENOMEM;
+ }
+
+ gpx->context.dp = ip;
+ gpx->context.resynch = 1;
+ gpx->context.put_listent = xfs_getparents_put_listent;
+ gpx->context.bufsize = bufsize;
+ /* firstu is used to track the bytes filled in the buffer */
+ gpx->context.firstu = 0;
+
+ /* Copy the cursor provided by caller */
+ memcpy(&gpx->context.cursor, &gp->gp_cursor,
+ sizeof(struct xfs_attrlist_cursor));
+ gpx->count = 0;
+ gp->gp_oflags = 0;
+
+ trace_xfs_getparents_begin(ip, gp, &gpx->context.cursor);
+
+ error = xfs_attr_list(&gpx->context);
+ if (error)
+ goto out_free_buf;
+ if (gpx->context.seen_enough < 0) {
+ error = gpx->context.seen_enough;
+ goto out_free_buf;
+ }
+ xfs_getparents_expand_lastrec(gpx);
+
+ /* Update the caller with the current cursor position */
+ memcpy(&gp->gp_cursor, &gpx->context.cursor,
+ sizeof(struct xfs_attrlist_cursor));
+
+ /* Is this the root directory? */
+ if (ip->i_ino == mp->m_sb.sb_rootino)
+ gp->gp_oflags |= XFS_GETPARENTS_OFLAG_ROOT;
+
+ if (gpx->context.seen_enough == 0) {
+ /*
+ * If we did not run out of buffer space, then we reached the
+ * end of the pptr recordset, so set the DONE flag.
+ */
+ gp->gp_oflags |= XFS_GETPARENTS_OFLAG_DONE;
+ } else if (gpx->count == 0) {
+ /*
+ * If we ran out of buffer space before copying any parent
+ * pointers at all, the caller's buffer was too short. Tell
+ * userspace that, erm, the message is too long.
+ */
+ error = -EMSGSIZE;
+ goto out_free_buf;
+ }
+
+ trace_xfs_getparents_end(ip, gp, &gpx->context.cursor);
+
+ ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize);
+
+ /* Copy the records to userspace. */
+ if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer),
+ gpx->krecords, gpx->context.firstu))
+ error = -EFAULT;
+
+out_free_buf:
+ kvfree(gpx->krecords);
+ gpx->krecords = NULL;
+ return error;
+}
+
+/* Retrieve the parents of this file and pass them back to userspace. */
+int
+xfs_ioc_getparents(
+ struct file *file,
+ struct xfs_getparents __user *ureq)
+{
+ struct xfs_getparents_ctx gpx = {
+ .ip = XFS_I(file_inode(file)),
+ };
+ struct xfs_getparents *kreq = &gpx.gph.gph_request;
+ struct xfs_mount *mp = gpx.ip->i_mount;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!xfs_has_parent(mp))
+ return -EOPNOTSUPP;
+ if (copy_from_user(kreq, ureq, sizeof(*kreq)))
+ return -EFAULT;
+
+ error = xfs_getparents(&gpx);
+ if (error)
+ return error;
+
+ if (copy_to_user(ureq, kreq, sizeof(*kreq)))
+ return -EFAULT;
+
+ return 0;
+}
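
A hypothetical userspace consumer would loop until the kernel sets XFS_GETPARENTS_OFLAG_DONE, letting the kernel advance gp_cursor between calls and walking the variable-length records by gpr_reclen. A sketch, assuming the uapi structures match the kernel side shown here:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>

static int print_parents(int fd)
{
	struct xfs_getparents gp;
	char buf[65536];

	memset(&gp, 0, sizeof(gp));	/* cursor starts zeroed */
	gp.gp_buffer = (uintptr_t)buf;
	gp.gp_bufsize = sizeof(buf);

	do {
		unsigned int off;

		if (ioctl(fd, XFS_IOC_GETPARENTS, &gp) < 0)
			return -1;

		/* walk the variable-length records by gpr_reclen */
		for (off = 0; off < gp.gp_bufsize; ) {
			struct xfs_getparents_rec *gpr =
				(struct xfs_getparents_rec *)(buf + off);

			if (!gpr->gpr_reclen)
				break;
			if (gpr->gpr_name[0])
				printf("ino %llu gen %u name '%s'\n",
				       (unsigned long long)
				       gpr->gpr_parent.ha_fid.fid_ino,
				       gpr->gpr_parent.ha_fid.fid_gen,
				       gpr->gpr_name);
			off += gpr->gpr_reclen;
		}
	} while (!(gp.gp_oflags & XFS_GETPARENTS_OFLAG_DONE));

	return 0;
}
```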
+
+/* Retrieve the parents of this file handle and pass them back to userspace. */
+int
+xfs_ioc_getparents_by_handle(
+ struct file *file,
+ struct xfs_getparents_by_handle __user *ureq)
+{
+ struct xfs_getparents_ctx gpx = { };
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_getparents_by_handle *kreq = &gpx.gph;
+ struct xfs_handle *handle = &kreq->gph_handle;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!xfs_has_parent(mp))
+ return -EOPNOTSUPP;
+ if (copy_from_user(kreq, ureq, sizeof(*kreq)))
+ return -EFAULT;
+
+ /*
+ * We don't use exportfs_decode_fh because it does too much work here.
+ * If the handle refers to a directory, the exportfs code will walk
+ * upwards through the directory tree to connect the dentries to the
+ * root directory dentry. For GETPARENTS we don't care about that
+ * because we're not actually going to open a file descriptor; we only
+ * want to open an inode and read its parent pointers.
+ *
+ * Note that xfs_scrub uses GETPARENTS to log that it will try to fix a
+ * corrupted file's metadata. For this use case we would really rather
+ * userspace single-step the path reconstruction to avoid loops or
+ * other strange things if the directory tree is corrupt.
+ */
+ gpx.ip = xfs_khandle_to_inode(file, handle);
+ if (IS_ERR(gpx.ip))
+ return PTR_ERR(gpx.ip);
+
+ error = xfs_getparents(&gpx);
+ if (error)
+ goto out_rele;
+
+ if (copy_to_user(ureq, kreq, sizeof(*kreq)))
+ error = -EFAULT;
+
+out_rele:
+ xfs_irele(gpx.ip);
+ return error;
+}
diff --git a/fs/xfs/xfs_handle.h b/fs/xfs/xfs_handle.h
new file mode 100644
index 000000000000..6799a86d8565
--- /dev/null
+++ b/fs/xfs/xfs_handle.h
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2022-2024 Oracle.
+ * All rights reserved.
+ */
+#ifndef __XFS_HANDLE_H__
+#define __XFS_HANDLE_H__
+
+int xfs_attrlist_by_handle(struct file *parfilp,
+ struct xfs_fsop_attrlist_handlereq __user *p);
+int xfs_attrmulti_by_handle(struct file *parfilp, void __user *arg);
+
+int xfs_find_handle(unsigned int cmd, struct xfs_fsop_handlereq *hreq);
+int xfs_open_by_handle(struct file *parfilp, struct xfs_fsop_handlereq *hreq);
+int xfs_readlink_by_handle(struct file *parfilp,
+ struct xfs_fsop_handlereq *hreq);
+
+int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
+ uint32_t opcode, void __user *uname, void __user *value,
+ uint32_t *len, uint32_t flags);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
+ size_t bufsize, int flags,
+ struct xfs_attrlist_cursor __user *ucursor);
+
+struct dentry *xfs_handle_to_dentry(struct file *parfilp, void __user *uhandle,
+ u32 hlen);
+
+int xfs_ioc_getparents(struct file *file, struct xfs_getparents __user *arg);
+int xfs_ioc_getparents_by_handle(struct file *file,
+ struct xfs_getparents_by_handle __user *arg);
+
+#endif /* __XFS_HANDLE_H__ */
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index b39f959146bc..10f116d093a2 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -470,6 +470,7 @@ static const struct ioctl_sick_map ino_map[] = {
{ XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA },
{ XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR },
{ XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK },
+ { XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE },
{ 0, 0 },
};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 74f1812b03cb..0953163a2d84 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -613,7 +613,6 @@ xfs_iget_cache_miss(
struct xfs_inode *ip;
int error;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
- int iflags;
ip = xfs_inode_alloc(mp, ino);
if (!ip)
@@ -693,13 +692,12 @@ xfs_iget_cache_miss(
* memory barrier that ensures this detection works correctly at lookup
* time.
*/
- iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
d_mark_dontcache(VFS_I(ip));
ip->i_udquot = NULL;
ip->i_gdquot = NULL;
ip->i_pdquot = NULL;
- xfs_iflags_set(ip, iflags);
+ xfs_iflags_set(ip, XFS_INEW);
/* insert the new inode */
spin_lock(&pag->pag_ici_lock);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d55b42b2480d..58fb7a5062e1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -16,6 +16,7 @@
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
+#include "xfs_bit.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -38,13 +39,12 @@
#include "xfs_ag.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_pnfs.h"
+#include "xfs_parent.h"
+#include "xfs_xattr.h"
struct kmem_cache *xfs_inode_cache;
-STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
-STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
- struct xfs_inode *);
-
/*
* helper function to extract extent size hint from inode
*/
@@ -60,7 +60,8 @@ xfs_get_extsz_hint(
return 0;
if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
return ip->i_extsize;
- if (XFS_IS_REALTIME_INODE(ip))
+ if (XFS_IS_REALTIME_INODE(ip) &&
+ ip->i_mount->m_sb.sb_rextsize > 1)
return ip->i_mount->m_sb.sb_rextsize;
return 0;
}
@@ -420,7 +421,7 @@ xfs_lock_inumorder(
* lock more than one at a time, lockdep will report false positives saying we
* have violated locking orders.
*/
-static void
+void
xfs_lock_inodes(
struct xfs_inode **ips,
int inodes,
@@ -749,6 +750,8 @@ xfs_inode_inherit_flags2(
/*
* Initialise a newly allocated inode and return the in-core inode to the
* caller locked exclusively.
+ *
+ * The caller is responsible for unlocking the inode manually upon return.
*/
int
xfs_init_new_inode(
@@ -875,7 +878,7 @@ xfs_init_new_inode(
/*
* Log the new values stuffed into the inode.
*/
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, flags);
/* now that we have an i_mode we can setup the inode structure */
@@ -890,24 +893,27 @@ xfs_init_new_inode(
* link count to go to zero, move the inode to AGI unlinked list so that it can
* be freed when the last active reference goes away via xfs_inactive().
*/
-static int /* error */
+int
xfs_droplink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- if (VFS_I(ip)->i_nlink == 0) {
- xfs_alert(ip->i_mount,
- "%s: Attempt to drop inode (%llu) with nlink zero.",
- __func__, ip->i_ino);
- return -EFSCORRUPTED;
- }
+ struct inode *inode = VFS_I(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- drop_nlink(VFS_I(ip));
+ if (inode->i_nlink == 0) {
+ xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count dropped below zero. Pinning link count.",
+ ip->i_ino);
+ set_nlink(inode, XFS_NLINK_PINNED);
+ }
+ if (inode->i_nlink != XFS_NLINK_PINNED)
+ drop_nlink(inode);
+
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (VFS_I(ip)->i_nlink)
+ if (inode->i_nlink)
return 0;
return xfs_iunlink(tp, ip);
@@ -916,14 +922,22 @@ xfs_droplink(
/*
* Increment the link count on an inode & log the change.
*/
-static void
+void
xfs_bumplink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
+ struct inode *inode = VFS_I(ip);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- inc_nlink(VFS_I(ip));
+ if (inode->i_nlink == XFS_NLINK_PINNED - 1)
+ xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count exceeded maximum. Pinning link count.",
+ ip->i_ino);
+ if (inode->i_nlink != XFS_NLINK_PINNED)
+ inc_nlink(inode);
+
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
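
Both helpers now saturate instead of failing: xfs_droplink() pins the count rather than returning -EFSCORRUPTED on a zero nlink, and xfs_bumplink() pins at the top rather than wrapping. The rule reduced to a sketch, assuming XFS_NLINK_PINNED is the all-ones value:

```c
#include <stdint.h>

#define NLINK_PINNED	(~0U)

/* Sketch of the saturation rule in xfs_bumplink()/xfs_droplink():
 * once the count reaches NLINK_PINNED it never moves again. */
static void nlink_bump(uint32_t *nlink)
{
	if (*nlink != NLINK_PINNED)
		(*nlink)++;	/* PINNED - 1 increments into PINNED */
}

static void nlink_drop(uint32_t *nlink)
{
	if (*nlink == 0)
		*nlink = NLINK_PINNED;	/* underflow pins the count */
	else if (*nlink != NLINK_PINNED)
		(*nlink)--;
}
```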
@@ -1005,7 +1019,7 @@ xfs_dir_hook_setup(
int
xfs_create(
struct mnt_idmap *idmap,
- xfs_inode_t *dp,
+ struct xfs_inode *dp,
struct xfs_name *name,
umode_t mode,
dev_t rdev,
@@ -1017,7 +1031,7 @@ xfs_create(
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
int error;
- bool unlock_dp_on_error = false;
+ bool unlock_dp_on_error = false;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1025,6 +1039,7 @@ xfs_create(
struct xfs_trans_res *tres;
uint resblks;
xfs_ino_t ino;
+ struct xfs_parent_args *ppargs;
trace_xfs_create(dp, name);
@@ -1046,13 +1061,17 @@ xfs_create(
return error;
if (is_dir) {
- resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+ resblks = xfs_mkdir_space_res(mp, name->len);
tres = &M_RES(mp)->tr_mkdir;
} else {
- resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+ resblks = xfs_create_space_res(mp, name->len);
tres = &M_RES(mp)->tr_create;
}
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto out_release_dquots;
+
/*
* Initially assume that the file does not exist and
* reserve the resources for that case. If that is not
@@ -1068,7 +1087,7 @@ xfs_create(
resblks, &tp);
}
if (error)
- goto out_release_dquots;
+ goto out_parent;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -1092,8 +1111,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = false;
+ xfs_trans_ijoin(tp, dp, 0);
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
resblks - XFS_IALLOC_SPACE_RES(mp));
@@ -1113,6 +1131,16 @@ xfs_create(
}
/*
+ * If we have parent pointers, we need to add the attribute containing
+ * the parent information now.
+ */
+ if (ppargs) {
+ error = xfs_parent_addname(tp, ppargs, dp, name, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* Create ip with a reference from dp, and add '.' and '..' references
* if it's a directory.
*/
@@ -1142,6 +1170,9 @@ xfs_create(
xfs_qm_dqrele(pdqp);
*ipp = ip;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
return 0;
out_trans_cancel:
@@ -1153,9 +1184,12 @@ xfs_create(
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
+ out_parent:
+ xfs_parent_finish(mp, ppargs);
out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1171,6 +1205,7 @@ xfs_create_tmpfile(
struct mnt_idmap *idmap,
struct xfs_inode *dp,
umode_t mode,
+ bool init_xattrs,
struct xfs_inode **ipp)
{
struct xfs_mount *mp = dp->i_mount;
@@ -1211,7 +1246,7 @@ xfs_create_tmpfile(
error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
if (!error)
error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
- 0, 0, prid, false, &ip);
+ 0, 0, prid, init_xattrs, &ip);
if (error)
goto out_trans_cancel;
@@ -1238,6 +1273,7 @@ xfs_create_tmpfile(
xfs_qm_dqrele(pdqp);
*ipp = ip;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
out_trans_cancel:
@@ -1249,6 +1285,7 @@ xfs_create_tmpfile(
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
@@ -1262,14 +1299,15 @@ xfs_create_tmpfile(
int
xfs_link(
- xfs_inode_t *tdp,
- xfs_inode_t *sip,
+ struct xfs_inode *tdp,
+ struct xfs_inode *sip,
struct xfs_name *target_name)
{
- xfs_mount_t *mp = tdp->i_mount;
- xfs_trans_t *tp;
+ struct xfs_mount *mp = tdp->i_mount;
+ struct xfs_trans *tp;
int error, nospace_error = 0;
int resblks;
+ struct xfs_parent_args *ppargs;
trace_xfs_link(tdp, target_name);
@@ -1288,11 +1326,25 @@ xfs_link(
if (error)
goto std_return;
- resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto std_return;
+
+ resblks = xfs_link_space_res(mp, target_name->len);
error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
&tp, &nospace_error);
if (error)
- goto std_return;
+ goto out_parent;
+
+ /*
+ * We don't allow reservationless or quotaless hardlinking when parent
+ * pointers are enabled because we can't back out if the xattrs must
+ * grow.
+ */
+ if (ppargs && nospace_error) {
+ error = nospace_error;
+ goto error_return;
+ }
/*
* If we are using project inheritance, we only allow hard link
@@ -1343,6 +1395,19 @@ xfs_link(
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
xfs_bumplink(tp, sip);
+
+ /*
+ * If we have parent pointers, we now need to add the parent record to
+ * the attribute fork of the inode. If this is the initial parent
+ * attribute, we need to create it correctly; otherwise we can just add
+ * the parent to the inode.
+ */
+ if (ppargs) {
+ error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip);
+ if (error)
+ goto error_return;
+ }
+
xfs_dir_update_hook(tdp, sip, 1, target_name);
/*
@@ -1353,10 +1418,18 @@ xfs_link(
if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(tdp, XFS_ILOCK_EXCL);
+ xfs_iunlock(sip, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
+ return error;
error_return:
xfs_trans_cancel(tp);
+ xfs_iunlock(tdp, XFS_ILOCK_EXCL);
+ xfs_iunlock(sip, XFS_ILOCK_EXCL);
+ out_parent:
+ xfs_parent_finish(mp, ppargs);
std_return:
if (error == -ENOSPC && nospace_error)
error = nospace_error;
@@ -1555,6 +1628,51 @@ out_unlock:
}
/*
+ * Mark all the buffers attached to this directory stale. In theory we should
+ * never be freeing a directory with any blocks at all, but this covers the
+ * case where we've recovered a directory swap with a "temporary" directory
+ * created by online repair and now need to dump it.
+ */
+STATIC void
+xfs_inactive_dir(
+ struct xfs_inode *dp)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+ xfs_fileoff_t off;
+
+ /*
+ * Invalidate each directory block. All directory blocks are of
+ * fsbcount length and alignment, so we only need to walk those same
+ * offsets. We hold the only reference to this inode, so we must wait
+ * for the buffer locks.
+ */
+ for_each_xfs_iext(ifp, &icur, &got) {
+ for (off = round_up(got.br_startoff, geo->fsbcount);
+ off < got.br_startoff + got.br_blockcount;
+ off += geo->fsbcount) {
+ struct xfs_buf *bp = NULL;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ fsbno = (off - got.br_startoff) + got.br_startblock;
+ error = xfs_buf_incore(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsbno),
+ XFS_FSB_TO_BB(mp, geo->fsbcount),
+ XBF_LIVESCAN, &bp);
+ if (error)
+ continue;
+
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+ }
+}
+
+/*
* xfs_inactive_truncate
*
* Called to perform a truncate when an inode becomes unlinked.
@@ -1864,6 +1982,11 @@ xfs_inactive(
goto out;
}
+ if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) {
+ xfs_inactive_dir(ip);
+ truncate = 1;
+ }
+
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)
@@ -1937,7 +2060,7 @@ out:
* only unlinked, referenced inodes can be on the unlinked inode list. If we
* don't find the inode in cache, then let the caller handle the situation.
*/
-static struct xfs_inode *
+struct xfs_inode *
xfs_iunlink_lookup(
struct xfs_perag *pag,
xfs_agino_t agino)
@@ -2150,7 +2273,7 @@ xfs_iunlink_insert_inode(
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
-STATIC int
+int
xfs_iunlink(
struct xfs_trans *tp,
struct xfs_inode *ip)
@@ -2167,7 +2290,7 @@ xfs_iunlink(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp, 0, &agibp);
if (error)
goto out;
@@ -2252,7 +2375,7 @@ xfs_iunlink_remove_inode(
/*
* Pull the on-disk inode from the AGI unlinked list.
*/
-STATIC int
+int
xfs_iunlink_remove(
struct xfs_trans *tp,
struct xfs_perag *pag,
@@ -2264,7 +2387,7 @@ xfs_iunlink_remove(
trace_xfs_iunlink_remove(ip);
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp, 0, &agibp);
if (error)
return error;
@@ -2598,16 +2721,17 @@ xfs_iunpin_wait(
*/
int
xfs_remove(
- xfs_inode_t *dp,
+ struct xfs_inode *dp,
struct xfs_name *name,
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = dp->i_mount;
- xfs_trans_t *tp = NULL;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
int dontcare;
int error = 0;
uint resblks;
+ struct xfs_parent_args *ppargs;
trace_xfs_remove(dp, name);
@@ -2624,6 +2748,10 @@ xfs_remove(
if (error)
goto std_return;
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto std_return;
+
/*
* We try to get the real space reservation first, allowing for
* directory btree deletion(s) implying possible bmap insert(s). If we
@@ -2635,12 +2763,12 @@ xfs_remove(
* the directory code can handle a reservationless update and we don't
* want to prevent a user from trying to free space by deleting things.
*/
- resblks = XFS_REMOVE_SPACE_RES(mp);
+ resblks = xfs_remove_space_res(mp, name->len);
error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
&tp, &dontcare);
if (error) {
ASSERT(error != -ENOSPC);
- goto std_return;
+ goto out_parent;
}
/*
@@ -2700,6 +2828,13 @@ xfs_remove(
goto out_trans_cancel;
}
+ /* Remove parent pointer. */
+ if (ppargs) {
+ error = xfs_parent_removename(tp, ppargs, dp, name, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
/*
* Drop the link from dp to ip, and if ip was a directory, remove the
* '.' and '..' references since we freed the directory.
@@ -2716,19 +2851,42 @@ xfs_remove(
error = xfs_trans_commit(tp);
if (error)
- goto std_return;
+ goto out_unlock;
if (is_dir && xfs_inode_is_filestream(ip))
xfs_filestream_deassociate(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
return 0;
out_trans_cancel:
xfs_trans_cancel(tp);
+ out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ out_parent:
+ xfs_parent_finish(mp, ppargs);
std_return:
return error;
}
+static inline void
+xfs_iunlock_rename(
+ struct xfs_inode **i_tab,
+ int num_inodes)
+{
+ int i;
+
+ for (i = num_inodes - 1; i >= 0; i--) {
+ /* Skip duplicate inodes if src and target dps are the same */
+ if (!i_tab[i] || (i > 0 && i_tab[i] == i_tab[i - 1]))
+ continue;
+ xfs_iunlock(i_tab[i], XFS_ILOCK_EXCL);
+ }
+}
+
/*
* Enter all inodes for a rename transaction into a sorted array.
*/
@@ -2743,7 +2901,7 @@ xfs_sort_for_rename(
struct xfs_inode **i_tab,/* out: sorted array of inodes */
int *num_inodes) /* in/out: inodes in array */
{
- int i, j;
+ int i;
ASSERT(*num_inodes == __XFS_SORT_INODES);
memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
@@ -2765,17 +2923,26 @@ xfs_sort_for_rename(
i_tab[i++] = wip;
*num_inodes = i;
+ xfs_sort_inodes(i_tab, *num_inodes);
+}
+
+void
+xfs_sort_inodes(
+ struct xfs_inode **i_tab,
+ unsigned int num_inodes)
+{
+ int i, j;
+
+ ASSERT(num_inodes <= __XFS_SORT_INODES);
+
/*
* Sort the elements via bubble sort. (Remember, there are at
* most 5 elements to sort, so this is adequate.)
*/
- for (i = 0; i < *num_inodes; i++) {
- for (j = 1; j < *num_inodes; j++) {
- if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
- struct xfs_inode *temp = i_tab[j];
- i_tab[j] = i_tab[j-1];
- i_tab[j-1] = temp;
- }
+ for (i = 0; i < num_inodes; i++) {
+ for (j = 1; j < num_inodes; j++) {
+ if (i_tab[j]->i_ino < i_tab[j-1]->i_ino)
+ swap(i_tab[j], i_tab[j - 1]);
}
}
}
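
xfs_sort_inodes() exists so that every path locking several inodes acquires them in ascending inode-number order; a single global acquisition order is what rules out ABBA deadlocks between concurrent operations. The discipline, sketched over a plain array:

```c
#include <stddef.h>
#include <stdint.h>

/* Sketch: sort ids ascending before acquiring their locks, so any two
 * tasks locking overlapping sets always agree on the order. The bubble
 * sort mirrors xfs_sort_inodes(); n is at most 5. */
static void sort_lock_ids(uint64_t *ids, size_t n)
{
	for (size_t i = 0; i < n; i++)
		for (size_t j = 1; j < n; j++)
			if (ids[j] < ids[j - 1]) {
				uint64_t t = ids[j];

				ids[j] = ids[j - 1];
				ids[j - 1] = t;
			}
}
```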
@@ -2805,15 +2972,17 @@ xfs_cross_rename(
struct xfs_inode *dp1,
struct xfs_name *name1,
struct xfs_inode *ip1,
+ struct xfs_parent_args *ip1_ppargs,
struct xfs_inode *dp2,
struct xfs_name *name2,
struct xfs_inode *ip2,
+ struct xfs_parent_args *ip2_ppargs,
int spaceres)
{
- int error = 0;
- int ip1_flags = 0;
- int ip2_flags = 0;
- int dp2_flags = 0;
+ int error = 0;
+ int ip1_flags = 0;
+ int ip2_flags = 0;
+ int dp2_flags = 0;
/* Swap inode number for dirent in first parent */
error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
@@ -2882,6 +3051,21 @@ xfs_cross_rename(
}
}
+ /* Schedule parent pointer replacements */
+ if (ip1_ppargs) {
+ error = xfs_parent_replacename(tp, ip1_ppargs, dp1, name1, dp2,
+ name2, ip1);
+ if (error)
+ goto out_trans_abort;
+ }
+
+ if (ip2_ppargs) {
+ error = xfs_parent_replacename(tp, ip2_ppargs, dp2, name2, dp1,
+ name1, ip2);
+ if (error)
+ goto out_trans_abort;
+ }
+
if (ip1_flags) {
xfs_trans_ichgtime(tp, ip1, ip1_flags);
xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
@@ -2937,7 +3121,7 @@ xfs_rename_alloc_whiteout(
int error;
error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
- &tmpfile);
+ xfs_has_parent(dp->i_mount), &tmpfile);
if (error)
return error;
@@ -2981,6 +3165,9 @@ xfs_rename(
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
+ struct xfs_parent_args *src_ppargs = NULL;
+ struct xfs_parent_args *tgt_ppargs = NULL;
+ struct xfs_parent_args *wip_ppargs = NULL;
int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
@@ -3012,9 +3199,26 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
+ error = xfs_parent_start(mp, &src_ppargs);
+ if (error)
+ goto out_release_wip;
+
+ if (wip) {
+ error = xfs_parent_start(mp, &wip_ppargs);
+ if (error)
+ goto out_src_ppargs;
+ }
+
+ if (target_ip) {
+ error = xfs_parent_start(mp, &tgt_ppargs);
+ if (error)
+ goto out_wip_ppargs;
+ }
+
retry:
nospace_error = 0;
- spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+ spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL,
+ target_name->len, wip != NULL);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
nospace_error = error;
@@ -3023,14 +3227,26 @@ retry:
&tp);
}
if (error)
- goto out_release_wip;
+ goto out_tgt_ppargs;
+
+ /*
+ * We don't allow reservationless renaming when parent pointers are
+ * enabled because we can't back out if the xattrs must grow.
+ */
+ if (src_ppargs && nospace_error) {
+ error = nospace_error;
+ xfs_trans_cancel(tp);
+ goto out_tgt_ppargs;
+ }
/*
* Attach the dquots to the inodes
*/
error = xfs_qm_vop_rename_dqattach(inodes);
- if (error)
- goto out_trans_cancel;
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_tgt_ppargs;
+ }
/*
* Lock all the participating inodes. Depending upon whether
@@ -3041,18 +3257,16 @@ retry:
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
- * Join all the inodes to the transaction. From this point on,
- * we can rely on either trans_commit or trans_cancel to unlock
- * them.
+ * Join all the inodes to the transaction.
*/
- xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, 0);
if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, 0);
+ xfs_trans_ijoin(tp, src_ip, 0);
if (target_ip)
- xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_ip, 0);
if (wip)
- xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, wip, 0);
/*
* If we are using project inheritance, we only allow renames
@@ -3066,10 +3280,13 @@ retry:
}
/* RENAME_EXCHANGE is unique from here on. */
- if (flags & RENAME_EXCHANGE)
- return xfs_cross_rename(tp, src_dp, src_name, src_ip,
- target_dp, target_name, target_ip,
- spaceres);
+ if (flags & RENAME_EXCHANGE) {
+ error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
+ src_ppargs, target_dp, target_name, target_ip,
+ tgt_ppargs, spaceres);
+ nospace_error = 0;
+ goto out_unlock;
+ }
/*
* Try to reserve quota to handle an expansion of the target directory.
@@ -3083,6 +3300,7 @@ retry:
if (error == -EDQUOT || error == -ENOSPC) {
if (!retried) {
xfs_trans_cancel(tp);
+ xfs_iunlock_rename(inodes, num_inodes);
xfs_blockgc_free_quota(target_dp, 0);
retried = true;
goto retry;
@@ -3097,6 +3315,15 @@ retry:
}
/*
+ * We don't allow quotaless renaming when parent pointers are enabled
+ * because we can't back out if the xattrs must grow.
+ */
+ if (src_ppargs && nospace_error) {
+ error = nospace_error;
+ goto out_trans_cancel;
+ }
+
+ /*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
*/
@@ -3142,7 +3369,7 @@ retry:
pag = xfs_perag_get(mp,
XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
- error = xfs_read_agi(pag, tp, &bp);
+ error = xfs_read_agi(pag, tp, 0, &bp);
xfs_perag_put(pag);
if (error)
goto out_trans_cancel;
@@ -3288,6 +3515,28 @@ retry:
if (error)
goto out_trans_cancel;
+ /* Schedule parent pointer updates. */
+ if (wip_ppargs) {
+ error = xfs_parent_addname(tp, wip_ppargs, src_dp, src_name,
+ wip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ if (src_ppargs) {
+ error = xfs_parent_replacename(tp, src_ppargs, src_dp,
+ src_name, target_dp, target_name, src_ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ if (tgt_ppargs) {
+ error = xfs_parent_removename(tp, tgt_ppargs, target_dp,
+ target_name, target_ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
if (new_parent)
@@ -3309,12 +3558,19 @@ retry:
xfs_dir_update_hook(src_dp, wip, 1, src_name);
error = xfs_finish_rename(tp);
- if (wip)
- xfs_irele(wip);
- return error;
+ nospace_error = 0;
+ goto out_unlock;
out_trans_cancel:
xfs_trans_cancel(tp);
+out_unlock:
+ xfs_iunlock_rename(inodes, num_inodes);
+out_tgt_ppargs:
+ xfs_parent_finish(mp, tgt_ppargs);
+out_wip_ppargs:
+ xfs_parent_finish(mp, wip_ppargs);
+out_src_ppargs:
+ xfs_parent_finish(mp, src_ppargs);
out_release_wip:
if (wip)
xfs_irele(wip);
@@ -3814,7 +4070,7 @@ xfs_inode_reload_unlinked_bucket(
/* Grab the first inode in the list */
pag = xfs_perag_get(mp, agno);
- error = xfs_ialloc_read_agi(pag, tp, &agibp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, &agibp);
xfs_perag_put(pag);
if (error)
return error;
@@ -3946,3 +4202,77 @@ xfs_inode_count_blocks(
xfs_bmap_count_leaves(ifp, rblocks);
*dblocks = ip->i_nblocks - *rblocks;
}
+
+static void
+xfs_wait_dax_page(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ schedule();
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+}
+
+int
+xfs_break_dax_layouts(
+ struct inode *inode,
+ bool *retry)
+{
+ struct page *page;
+
+ xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
+
+ page = dax_layout_busy_page(inode->i_mapping);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, xfs_wait_dax_page(inode));
+}
+
+int
+xfs_break_layouts(
+ struct inode *inode,
+ uint *iolock,
+ enum layout_break_reason reason)
+{
+ bool retry;
+ int error;
+
+ xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
+
+ do {
+ retry = false;
+ switch (reason) {
+ case BREAK_UNMAP:
+ error = xfs_break_dax_layouts(inode, &retry);
+ if (error || retry)
+ break;
+ fallthrough;
+ case BREAK_WRITE:
+ error = xfs_break_leased_layouts(inode, iolock, &retry);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EINVAL;
+ }
+ } while (error == 0 && retry);
+
+ return error;
+}
+
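The do/while shape of xfs_break_layouts() above is worth calling out: each pass may make partial progress and request a retry, and the loop only ends on error or on a clean pass that breaks nothing. A reduced userspace model of that pattern, with a "busy" count standing in for pages still pinned by DMA or an outstanding lease (all names here are illustrative, not kernel API):

#include <stdbool.h>

static int model_break_one(int *busy, bool *retry)
{
        if (*busy > 0) {
                (*busy)--;      /* pretend one busy reference drained */
                *retry = true;
        }
        return 0;
}

static int model_break_layouts(int busy)
{
        bool retry;
        int error;

        do {
                retry = false;
                error = model_break_one(&busy, &retry);
        } while (error == 0 && retry);

        return error;
}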
+/* Returns the size of fundamental allocation unit for a file, in bytes. */
+unsigned int
+xfs_inode_alloc_unitsize(
+ struct xfs_inode *ip)
+{
+ unsigned int blocks = 1;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ blocks = ip->i_mount->m_sb.sb_rextsize;
+
+ return XFS_FSB_TO_B(ip->i_mount, blocks);
+}
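The new helper above centralizes the "fundamental allocation unit" calculation: one fs block for regular files, the rt extent size for realtime files. As a rough userspace sketch of how such a unit gets used, rounding byte positions to allocation-unit boundaries (the helper names and the 4k-block/4-block-rtextent geometry are assumptions for illustration, not kernel API):

#include <stdint.h>
#include <stdio.h>

static uint64_t round_down_alloc_unit(uint64_t pos, unsigned int unit)
{
        return pos - (pos % unit);
}

static uint64_t round_up_alloc_unit(uint64_t pos, unsigned int unit)
{
        return round_down_alloc_unit(pos + unit - 1, unit);
}

int main(void)
{
        unsigned int blksize = 4096, rextsize = 4;
        unsigned int unit = blksize * rextsize; /* 16k for a realtime file */

        /* 20000 rounds down to 16384 and up to 32768 */
        printf("%llu\n", (unsigned long long)round_down_alloc_unit(20000, unit));
        printf("%llu\n", (unsigned long long)round_up_alloc_unit(20000, unit));
        return 0;
}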
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ab46ffb3ac19..292b90b5f2ac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -207,13 +207,13 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
* i_flags helper functions
*/
static inline void
-__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+__xfs_iflags_set(xfs_inode_t *ip, unsigned long flags)
{
ip->i_flags |= flags;
}
static inline void
-xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_set(xfs_inode_t *ip, unsigned long flags)
{
spin_lock(&ip->i_flags_lock);
__xfs_iflags_set(ip, flags);
@@ -221,7 +221,7 @@ xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
}
static inline void
-xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags)
{
spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~flags;
@@ -229,13 +229,13 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
}
static inline int
-__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags)
{
return (ip->i_flags & flags);
}
static inline int
-xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_test(xfs_inode_t *ip, unsigned long flags)
{
int ret;
spin_lock(&ip->i_flags_lock);
@@ -245,7 +245,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
}
static inline int
-xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned long flags)
{
int ret;
@@ -258,7 +258,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
}
static inline int
-xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags)
{
int ret;
@@ -312,6 +312,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
}
/*
+ * Decide if this file is a realtime file whose data allocation unit is larger
+ * than a single filesystem block.
+ */
+static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
+{
+ return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
+}
+
+/*
* Return the buftarg used for data allocations on a given inode.
*/
#define xfs_inode_buftarg(ip) \
@@ -513,7 +522,7 @@ int xfs_create(struct mnt_idmap *idmap,
umode_t mode, dev_t rdev, bool need_xattr,
struct xfs_inode **ipp);
int xfs_create_tmpfile(struct mnt_idmap *idmap,
- struct xfs_inode *dp, umode_t mode,
+ struct xfs_inode *dp, umode_t mode, bool init_xattrs,
struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
@@ -565,16 +574,10 @@ xfs_itruncate_extents(
return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0);
}
-/* from xfs_file.c */
int xfs_break_dax_layouts(struct inode *inode, bool *retry);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
-/* from xfs_iops.c */
-extern void xfs_setup_inode(struct xfs_inode *ip);
-extern void xfs_setup_iops(struct xfs_inode *ip);
-extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
-
static inline void xfs_update_stable_writes(struct xfs_inode *ip)
{
if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
@@ -613,11 +616,20 @@ extern struct kmem_cache *xfs_inode_cache;
bool xfs_inode_needs_inactive(struct xfs_inode *ip);
+int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_inode *ip);
+struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino);
+
void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
+int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode);
+void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes);
static inline bool
xfs_inode_unlinked_incomplete(
@@ -631,6 +643,7 @@ int xfs_inode_reload_unlinked(struct xfs_inode *ip);
bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
+unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip);
struct xfs_dir_update_params {
const struct xfs_inode *dp;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d0e2cec6210d..f0117188f302 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -23,11 +23,9 @@
#include "xfs_fsops.h"
#include "xfs_discard.h"
#include "xfs_quota.h"
-#include "xfs_export.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
-#include "xfs_acl.h"
#include "xfs_btree.h"
#include <linux/fsmap.h>
#include "xfs_fsmap.h"
@@ -39,596 +37,13 @@
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
#include "xfs_rtbitmap.h"
+#include "xfs_file.h"
+#include "xfs_exchrange.h"
+#include "xfs_handle.h"
#include <linux/mount.h>
-#include <linux/namei.h>
#include <linux/fileattr.h>
-/*
- * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
- * a file or fs handle.
- *
- * XFS_IOC_PATH_TO_FSHANDLE
- * returns fs handle for a mount point or path within that mount point
- * XFS_IOC_FD_TO_HANDLE
- * returns full handle for a FD opened in user space
- * XFS_IOC_PATH_TO_HANDLE
- * returns full handle for a path
- */
-int
-xfs_find_handle(
- unsigned int cmd,
- xfs_fsop_handlereq_t *hreq)
-{
- int hsize;
- xfs_handle_t handle;
- struct inode *inode;
- struct fd f = {NULL};
- struct path path;
- int error;
- struct xfs_inode *ip;
-
- if (cmd == XFS_IOC_FD_TO_HANDLE) {
- f = fdget(hreq->fd);
- if (!f.file)
- return -EBADF;
- inode = file_inode(f.file);
- } else {
- error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
- if (error)
- return error;
- inode = d_inode(path.dentry);
- }
- ip = XFS_I(inode);
-
- /*
- * We can only generate handles for inodes residing on a XFS filesystem,
- * and only for regular files, directories or symbolic links.
- */
- error = -EINVAL;
- if (inode->i_sb->s_magic != XFS_SB_MAGIC)
- goto out_put;
-
- error = -EBADF;
- if (!S_ISREG(inode->i_mode) &&
- !S_ISDIR(inode->i_mode) &&
- !S_ISLNK(inode->i_mode))
- goto out_put;
-
-
- memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
-
- if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
- /*
- * This handle only contains an fsid, zero the rest.
- */
- memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
- hsize = sizeof(xfs_fsid_t);
- } else {
- handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
- sizeof(handle.ha_fid.fid_len);
- handle.ha_fid.fid_pad = 0;
- handle.ha_fid.fid_gen = inode->i_generation;
- handle.ha_fid.fid_ino = ip->i_ino;
- hsize = sizeof(xfs_handle_t);
- }
-
- error = -EFAULT;
- if (copy_to_user(hreq->ohandle, &handle, hsize) ||
- copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
- goto out_put;
-
- error = 0;
-
- out_put:
- if (cmd == XFS_IOC_FD_TO_HANDLE)
- fdput(f);
- else
- path_put(&path);
- return error;
-}
-
-/*
- * No need to do permission checks on the various pathname components
- * as the handle operations are privileged.
- */
-STATIC int
-xfs_handle_acceptable(
- void *context,
- struct dentry *dentry)
-{
- return 1;
-}
-
-/*
- * Convert userspace handle data into a dentry.
- */
-struct dentry *
-xfs_handle_to_dentry(
- struct file *parfilp,
- void __user *uhandle,
- u32 hlen)
-{
- xfs_handle_t handle;
- struct xfs_fid64 fid;
-
- /*
- * Only allow handle opens under a directory.
- */
- if (!S_ISDIR(file_inode(parfilp)->i_mode))
- return ERR_PTR(-ENOTDIR);
-
- if (hlen != sizeof(xfs_handle_t))
- return ERR_PTR(-EINVAL);
- if (copy_from_user(&handle, uhandle, hlen))
- return ERR_PTR(-EFAULT);
- if (handle.ha_fid.fid_len !=
- sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
- return ERR_PTR(-EINVAL);
-
- memset(&fid, 0, sizeof(struct fid));
- fid.ino = handle.ha_fid.fid_ino;
- fid.gen = handle.ha_fid.fid_gen;
-
- return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
- FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
- xfs_handle_acceptable, NULL);
-}
-
-STATIC struct dentry *
-xfs_handlereq_to_dentry(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq)
-{
- return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
-}
-
-int
-xfs_open_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq)
-{
- const struct cred *cred = current_cred();
- int error;
- int fd;
- int permflag;
- struct file *filp;
- struct inode *inode;
- struct dentry *dentry;
- fmode_t fmode;
- struct path path;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- dentry = xfs_handlereq_to_dentry(parfilp, hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- inode = d_inode(dentry);
-
- /* Restrict xfs_open_by_handle to directories & regular files. */
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- error = -EPERM;
- goto out_dput;
- }
-
-#if BITS_PER_LONG != 32
- hreq->oflags |= O_LARGEFILE;
-#endif
-
- permflag = hreq->oflags;
- fmode = OPEN_FMODE(permflag);
- if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
- error = -EPERM;
- goto out_dput;
- }
-
- if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- error = -EPERM;
- goto out_dput;
- }
-
- /* Can't write directories. */
- if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
- error = -EISDIR;
- goto out_dput;
- }
-
- fd = get_unused_fd_flags(0);
- if (fd < 0) {
- error = fd;
- goto out_dput;
- }
-
- path.mnt = parfilp->f_path.mnt;
- path.dentry = dentry;
- filp = dentry_open(&path, hreq->oflags, cred);
- dput(dentry);
- if (IS_ERR(filp)) {
- put_unused_fd(fd);
- return PTR_ERR(filp);
- }
-
- if (S_ISREG(inode->i_mode)) {
- filp->f_flags |= O_NOATIME;
- filp->f_mode |= FMODE_NOCMTIME;
- }
-
- fd_install(fd, filp);
- return fd;
-
- out_dput:
- dput(dentry);
- return error;
-}
-
-int
-xfs_readlink_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq)
-{
- struct dentry *dentry;
- __u32 olen;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- dentry = xfs_handlereq_to_dentry(parfilp, hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- /* Restrict this handle operation to symlinks only. */
- if (!d_is_symlink(dentry)) {
- error = -EINVAL;
- goto out_dput;
- }
-
- if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
- error = -EFAULT;
- goto out_dput;
- }
-
- error = vfs_readlink(dentry, hreq->ohandle, olen);
-
- out_dput:
- dput(dentry);
- return error;
-}
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-static void
-xfs_ioc_attr_put_listent(
- struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
-{
- struct xfs_attrlist *alist = context->buffer;
- struct xfs_attrlist_ent *aep;
- int arraytop;
-
- ASSERT(!context->seen_enough);
- ASSERT(context->count >= 0);
- ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*alist));
- ASSERT(context->firstu <= context->bufsize);
-
- /*
- * Only list entries in the right namespace.
- */
- if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
- return;
-
- arraytop = sizeof(*alist) +
- context->count * sizeof(alist->al_offset[0]);
-
- /* decrement by the actual bytes used by the attr */
- context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
- namelen + 1, sizeof(uint32_t));
- if (context->firstu < arraytop) {
- trace_xfs_attr_list_full(context);
- alist->al_more = 1;
- context->seen_enough = 1;
- return;
- }
-
- aep = context->buffer + context->firstu;
- aep->a_valuelen = valuelen;
- memcpy(aep->a_name, name, namelen);
- aep->a_name[namelen] = 0;
- alist->al_offset[context->count++] = context->firstu;
- alist->al_count = context->count;
- trace_xfs_attr_list_add(context);
-}
-
-static unsigned int
-xfs_attr_filter(
- u32 ioc_flags)
-{
- if (ioc_flags & XFS_IOC_ATTR_ROOT)
- return XFS_ATTR_ROOT;
- if (ioc_flags & XFS_IOC_ATTR_SECURE)
- return XFS_ATTR_SECURE;
- return 0;
-}
-
-static unsigned int
-xfs_attr_flags(
- u32 ioc_flags)
-{
- if (ioc_flags & XFS_IOC_ATTR_CREATE)
- return XATTR_CREATE;
- if (ioc_flags & XFS_IOC_ATTR_REPLACE)
- return XATTR_REPLACE;
- return 0;
-}
-
-int
-xfs_ioc_attr_list(
- struct xfs_inode *dp,
- void __user *ubuf,
- size_t bufsize,
- int flags,
- struct xfs_attrlist_cursor __user *ucursor)
-{
- struct xfs_attr_list_context context = { };
- struct xfs_attrlist *alist;
- void *buffer;
- int error;
-
- if (bufsize < sizeof(struct xfs_attrlist) ||
- bufsize > XFS_XATTR_LIST_MAX)
- return -EINVAL;
-
- /*
- * Reject flags, only allow namespaces.
- */
- if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
- return -EINVAL;
- if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
- return -EINVAL;
-
- /*
- * Validate the cursor.
- */
- if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
- return -EFAULT;
- if (context.cursor.pad1 || context.cursor.pad2)
- return -EINVAL;
- if (!context.cursor.initted &&
- (context.cursor.hashval || context.cursor.blkno ||
- context.cursor.offset))
- return -EINVAL;
-
- buffer = kvzalloc(bufsize, GFP_KERNEL);
- if (!buffer)
- return -ENOMEM;
-
- /*
- * Initialize the output buffer.
- */
- context.dp = dp;
- context.resynch = 1;
- context.attr_filter = xfs_attr_filter(flags);
- context.buffer = buffer;
- context.bufsize = round_down(bufsize, sizeof(uint32_t));
- context.firstu = context.bufsize;
- context.put_listent = xfs_ioc_attr_put_listent;
-
- alist = context.buffer;
- alist->al_count = 0;
- alist->al_more = 0;
- alist->al_offset[0] = context.bufsize;
-
- error = xfs_attr_list(&context);
- if (error)
- goto out_free;
-
- if (copy_to_user(ubuf, buffer, bufsize) ||
- copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
- error = -EFAULT;
-out_free:
- kvfree(buffer);
- return error;
-}
-
-STATIC int
-xfs_attrlist_by_handle(
- struct file *parfilp,
- struct xfs_fsop_attrlist_handlereq __user *p)
-{
- struct xfs_fsop_attrlist_handlereq al_hreq;
- struct dentry *dentry;
- int error = -ENOMEM;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
- return -EFAULT;
-
- dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
- al_hreq.buflen, al_hreq.flags, &p->pos);
- dput(dentry);
- return error;
-}
-
-static int
-xfs_attrmulti_attr_get(
- struct inode *inode,
- unsigned char *name,
- unsigned char __user *ubuf,
- uint32_t *len,
- uint32_t flags)
-{
- struct xfs_da_args args = {
- .dp = XFS_I(inode),
- .attr_filter = xfs_attr_filter(flags),
- .attr_flags = xfs_attr_flags(flags),
- .name = name,
- .namelen = strlen(name),
- .valuelen = *len,
- };
- int error;
-
- if (*len > XFS_XATTR_SIZE_MAX)
- return -EINVAL;
-
- error = xfs_attr_get(&args);
- if (error)
- goto out_kfree;
-
- *len = args.valuelen;
- if (copy_to_user(ubuf, args.value, args.valuelen))
- error = -EFAULT;
-
-out_kfree:
- kvfree(args.value);
- return error;
-}
-
-static int
-xfs_attrmulti_attr_set(
- struct inode *inode,
- unsigned char *name,
- const unsigned char __user *ubuf,
- uint32_t len,
- uint32_t flags)
-{
- struct xfs_da_args args = {
- .dp = XFS_I(inode),
- .attr_filter = xfs_attr_filter(flags),
- .attr_flags = xfs_attr_flags(flags),
- .name = name,
- .namelen = strlen(name),
- };
- int error;
-
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
-
- if (ubuf) {
- if (len > XFS_XATTR_SIZE_MAX)
- return -EINVAL;
- args.value = memdup_user(ubuf, len);
- if (IS_ERR(args.value))
- return PTR_ERR(args.value);
- args.valuelen = len;
- }
-
- error = xfs_attr_change(&args);
- if (!error && (flags & XFS_IOC_ATTR_ROOT))
- xfs_forget_acl(inode, name);
- kfree(args.value);
- return error;
-}
-
-int
-xfs_ioc_attrmulti_one(
- struct file *parfilp,
- struct inode *inode,
- uint32_t opcode,
- void __user *uname,
- void __user *value,
- uint32_t *len,
- uint32_t flags)
-{
- unsigned char *name;
- int error;
-
- if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
- return -EINVAL;
-
- name = strndup_user(uname, MAXNAMELEN);
- if (IS_ERR(name))
- return PTR_ERR(name);
-
- switch (opcode) {
- case ATTR_OP_GET:
- error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
- break;
- case ATTR_OP_REMOVE:
- value = NULL;
- *len = 0;
- fallthrough;
- case ATTR_OP_SET:
- error = mnt_want_write_file(parfilp);
- if (error)
- break;
- error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- error = -EINVAL;
- break;
- }
-
- kfree(name);
- return error;
-}
-
-STATIC int
-xfs_attrmulti_by_handle(
- struct file *parfilp,
- void __user *arg)
-{
- int error;
- xfs_attr_multiop_t *ops;
- xfs_fsop_attrmulti_handlereq_t am_hreq;
- struct dentry *dentry;
- unsigned int i, size;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
- return -EFAULT;
-
- /* overflow check */
- if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
- return -E2BIG;
-
- dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- error = -E2BIG;
- size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
- if (!size || size > 16 * PAGE_SIZE)
- goto out_dput;
-
- ops = memdup_user(am_hreq.ops, size);
- if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
- goto out_dput;
- }
-
- error = 0;
- for (i = 0; i < am_hreq.opcount; i++) {
- ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
- d_inode(dentry), ops[i].am_opcode,
- ops[i].am_attrname, ops[i].am_attrvalue,
- &ops[i].am_length, ops[i].am_flags);
- }
-
- if (copy_to_user(am_hreq.ops, ops, size))
- error = -EFAULT;
-
- kfree(ops);
- out_dput:
- dput(dentry);
- return error;
-}
-
/* Return 0 on success or positive error */
int
xfs_fsbulkstat_one_fmt(
@@ -1640,30 +1055,6 @@ out_free:
return error;
}
-STATIC int
-xfs_ioc_scrub_metadata(
- struct file *file,
- void __user *arg)
-{
- struct xfs_scrub_metadata scrub;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&scrub, arg, sizeof(scrub)))
- return -EFAULT;
-
- error = xfs_scrub_metadata(file, &scrub);
- if (error)
- return error;
-
- if (copy_to_user(arg, &scrub, sizeof(scrub)))
- return -EFAULT;
-
- return 0;
-}
-
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -2010,7 +1401,10 @@ xfs_file_ioctl(
case XFS_IOC_FSGETXATTRA:
return xfs_ioc_fsgetxattra(ip, arg);
-
+ case XFS_IOC_GETPARENTS:
+ return xfs_ioc_getparents(filp, arg);
+ case XFS_IOC_GETPARENTS_BY_HANDLE:
+ return xfs_ioc_getparents_by_handle(filp, arg);
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
case XFS_IOC_GETBMAPX:
@@ -2019,6 +1413,8 @@ xfs_file_ioctl(
case FS_IOC_GETFSMAP:
return xfs_ioc_getfsmap(ip, arg);
+ case XFS_IOC_SCRUBV_METADATA:
+ return xfs_ioc_scrubv_metadata(filp, arg);
case XFS_IOC_SCRUB_METADATA:
return xfs_ioc_scrub_metadata(filp, arg);
@@ -2169,6 +1565,9 @@ xfs_file_ioctl(
return error;
}
+ case XFS_IOC_EXCHANGE_RANGE:
+ return xfs_ioc_exchange_range(filp, arg);
+
default:
return -ENOTTY;
}
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 38be600b5e1e..12124946f347 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -15,34 +15,6 @@ xfs_ioc_swapext(
xfs_swapext_t *sxp);
extern int
-xfs_find_handle(
- unsigned int cmd,
- xfs_fsop_handlereq_t *hreq);
-
-extern int
-xfs_open_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq);
-
-extern int
-xfs_readlink_by_handle(
- struct file *parfilp,
- xfs_fsop_handlereq_t *hreq);
-
-int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
- uint32_t opcode, void __user *uname, void __user *value,
- uint32_t *len, uint32_t flags);
-int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
- size_t bufsize, int flags,
- struct xfs_attrlist_cursor __user *ucursor);
-
-extern struct dentry *
-xfs_handle_to_dentry(
- struct file *parfilp,
- void __user *uhandle,
- u32 hlen);
-
-extern int
xfs_fileattr_get(
struct dentry *dentry,
struct fileattr *fa);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ee35eea1ecce..b64785dc4354 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -24,6 +24,7 @@
#include "xfs_ioctl32.h"
#include "xfs_trace.h"
#include "xfs_sb.h"
+#include "xfs_handle.h"
#define _NATIVE_IOC(cmd, type) \
_IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 4087af7f3c9f..378342673925 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -28,6 +28,7 @@
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_health.h"
+#include "xfs_rtbitmap.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -298,9 +299,7 @@ xfs_iomap_write_direct(
if (error)
return error;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, nr_exts);
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, nr_exts);
if (error)
goto out_trans_cancel;
@@ -321,14 +320,6 @@ xfs_iomap_write_direct(
if (error)
goto out_unlock;
- /*
- * Copy any maps to caller's array and return any error.
- */
- if (nimaps == 0) {
- error = -ENOSPC;
- goto out_unlock;
- }
-
if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = xfs_alert_fsblock_zero(ip, imap);
@@ -404,6 +395,29 @@ xfs_quota_calc_throttle(
}
}
+static int64_t
+xfs_iomap_freesp(
+ struct percpu_counter *counter,
+ uint64_t low_space[XFS_LOWSP_MAX],
+ int *shift)
+{
+ int64_t freesp;
+
+ freesp = percpu_counter_read_positive(counter);
+ if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
+ *shift = 2;
+ if (freesp < low_space[XFS_LOWSP_4_PCNT])
+ (*shift)++;
+ if (freesp < low_space[XFS_LOWSP_3_PCNT])
+ (*shift)++;
+ if (freesp < low_space[XFS_LOWSP_2_PCNT])
+ (*shift)++;
+ if (freesp < low_space[XFS_LOWSP_1_PCNT])
+ (*shift)++;
+ }
+ return freesp;
+}
+
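The factored-out xfs_iomap_freesp() drives the speculative-preallocation throttle: the shift it hands back is later used to divide the prealloc size by 2^shift. A minimal userspace model of the threshold walk, assuming made-up thresholds for a hypothetical 1000-block filesystem (the enum and function names are invented for the sketch):

#include <stdint.h>
#include <stdio.h>

enum { LOWSP_1_PCNT, LOWSP_2_PCNT, LOWSP_3_PCNT, LOWSP_4_PCNT,
       LOWSP_5_PCNT, LOWSP_MAX };

static int64_t model_freesp(int64_t freesp,
                            const uint64_t low_space[LOWSP_MAX], int *shift)
{
        if (freesp < (int64_t)low_space[LOWSP_5_PCNT]) {
                *shift = 2;
                if (freesp < (int64_t)low_space[LOWSP_4_PCNT])
                        (*shift)++;
                if (freesp < (int64_t)low_space[LOWSP_3_PCNT])
                        (*shift)++;
                if (freesp < (int64_t)low_space[LOWSP_2_PCNT])
                        (*shift)++;
                if (freesp < (int64_t)low_space[LOWSP_1_PCNT])
                        (*shift)++;
        }
        return freesp;
}

int main(void)
{
        /* 1..5 percent thresholds of a hypothetical 1000-block fs */
        uint64_t low[LOWSP_MAX] = { 10, 20, 30, 40, 50 };
        int shift = 0;

        model_freesp(25, low, &shift);  /* below 3% -> shift == 4 */
        printf("shift=%d\n", shift);
        return 0;
}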
/*
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
@@ -486,18 +500,13 @@ xfs_iomap_prealloc_size(
alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
- freesp = percpu_counter_read_positive(&mp->m_fdblocks);
- if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
- shift = 2;
- if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
- shift++;
- }
+ if (unlikely(XFS_IS_REALTIME_INODE(ip)))
+ freesp = xfs_rtx_to_rtb(mp,
+ xfs_iomap_freesp(&mp->m_frextents,
+ mp->m_low_rtexts, &shift));
+ else
+ freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
+ &shift);
/*
* Check each quota to cap the prealloc size, provide a shift value to
@@ -606,11 +615,8 @@ xfs_iomap_write_unwritten(
if (error)
return error;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
@@ -982,8 +988,6 @@ xfs_buffered_write_iomap_begin(
return xfs_direct_write_iomap_begin(inode, offset, count,
flags, iomap, srcmap);
- ASSERT(!XFS_IS_REALTIME_INODE(ip));
-
error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -1023,6 +1027,24 @@ xfs_buffered_write_iomap_begin(
}
/*
+ * For zeroing, trim a delalloc extent that extends beyond the EOF
+ * block. If it starts beyond the EOF block, convert it to an
+ * unwritten extent.
+ */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
+ isnullstartblock(imap.br_startblock)) {
+ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+ if (offset_fsb >= eof_fsb)
+ goto convert_delay;
+ if (end_fsb > eof_fsb) {
+ end_fsb = eof_fsb;
+ xfs_trim_extent(&imap, offset_fsb,
+ end_fsb - offset_fsb);
+ }
+ }
+
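A compact model of the new IOMAP_ZERO policy above, in fs-block units: a delalloc extent starting at or beyond the EOF block is converted to unwritten, while one crossing EOF is trimmed to end at the EOF block. Plain integers and invented names stand in for the kernel types here; this is a sketch, not the kernel interface:

#include <stdbool.h>
#include <stdint.h>

enum zero_action { ZERO_PROCEED, ZERO_CONVERT_DELAY, ZERO_TRIM };

static enum zero_action classify_zero_range(uint64_t offset_fsb,
                                            uint64_t *end_fsb,
                                            uint64_t eof_fsb,
                                            bool is_delalloc)
{
        if (!is_delalloc)
                return ZERO_PROCEED;
        if (offset_fsb >= eof_fsb)
                return ZERO_CONVERT_DELAY;      /* starts beyond EOF block */
        if (*end_fsb > eof_fsb) {
                *end_fsb = eof_fsb;             /* trim to the EOF block */
                return ZERO_TRIM;
        }
        return ZERO_PROCEED;
}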
+ /*
* Search the COW fork extent list even if we did not find a data fork
* extent. This serves two purposes: first this implements the
* speculative preallocation using cowextsize, so that we also unshare
@@ -1158,15 +1180,26 @@ retry:
* them out if the write happens to fail.
*/
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
found_imap:
seq = xfs_iomap_inode_sequence(ip, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+convert_delay:
+ xfs_iunlock(ip, lockmode);
+ truncate_pagecache(inode, offset);
+ error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
+ iomap, NULL);
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
+ return 0;
+
found_cow:
seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
@@ -1174,17 +1207,17 @@ found_cow:
if (error)
goto out_unlock;
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
IOMAP_F_SHARED, seq);
}
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
}
@@ -1194,8 +1227,8 @@ xfs_buffered_write_delalloc_punch(
loff_t offset,
loff_t length)
{
- return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset,
- offset + length);
+ xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length);
+ return 0;
}
static int
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 66f8c47642e8..ff222827e550 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -25,6 +25,7 @@
#include "xfs_error.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_file.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -62,7 +63,7 @@ xfs_initxattrs(
.value = xattr->value,
.valuelen = xattr->value_len,
};
- error = xfs_attr_change(&args);
+ error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT);
if (error < 0)
break;
}
@@ -156,6 +157,8 @@ xfs_create_need_xattr(
if (dir->i_sb->s_security)
return true;
#endif
+ if (xfs_has_parent(XFS_I(dir)->i_mount))
+ return true;
return false;
}
@@ -200,7 +203,18 @@ xfs_generic_create(
xfs_create_need_xattr(dir, default_acl, acl),
&ip);
} else {
- error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip);
+ bool init_xattrs = false;
+
+ /*
+ * If this temporary file will be linkable, set up the file
+ * with an attr fork to receive a parent pointer.
+ */
+ if (!(tmpfile->f_flags & O_EXCL) &&
+ xfs_has_parent(XFS_I(dir)->i_mount))
+ init_xattrs = true;
+
+ error = xfs_create_tmpfile(idmap, XFS_I(dir), mode,
+ init_xattrs, &ip);
}
if (unlikely(error))
goto out_free_acl;
@@ -364,6 +378,9 @@ xfs_vn_link(
if (unlikely(error))
return error;
+ if (IS_PRIVATE(inode))
+ return -EPERM;
+
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error))
return error;
@@ -521,7 +538,7 @@ xfs_stat_blksize(
* always return the realtime extent size.
*/
if (XFS_IS_REALTIME_INODE(ip))
- return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip));
+ return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip) ? : 1);
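The "? :" here is the GNU conditional-with-omitted-middle-operand extension: "a ? : b" yields a when a is nonzero, else b, evaluating a only once, so a zero extent size hint falls back to one block. A tiny illustration (GCC/Clang only):

#include <assert.h>

int main(void)
{
        unsigned int hint = 0;
        unsigned int extsz = hint ? : 1;        /* same as hint ? hint : 1 */

        assert(extsz == 1);
        return 0;
}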
/*
* Allow large block sizes to be reported to userspace programs if the
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 7f84a0843b24..3c1a2605ffd2 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -8,9 +8,6 @@
struct xfs_inode;
-extern const struct file_operations xfs_file_operations;
-extern const struct file_operations xfs_dir_file_operations;
-
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
int xfs_vn_setattr_size(struct mnt_idmap *idmap,
@@ -19,4 +16,8 @@ int xfs_vn_setattr_size(struct mnt_idmap *idmap,
int xfs_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr);
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 95fc31b9f87d..c0757ab99495 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -97,6 +97,14 @@ xfs_bulkstat_one_int(
vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid = i_gid_into_vfsgid(idmap, inode);
+ /* If this is a private inode, don't leak its details to userspace. */
+ if (IS_PRIVATE(inode)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_irele(ip);
+ error = -EINVAL;
+ goto out_advance;
+ }
+
/* xfs_iget returns the following without needing
* further change.
*/
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 01b55f03a102..730c8d48da28 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -268,7 +268,7 @@ xfs_iwalk_ag_start(
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
if (error)
return error;
*curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp);
@@ -386,7 +386,7 @@ xfs_iwalk_run_callbacks(
}
/* ...and recreate the cursor just past where we left off. */
- error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp);
+ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp);
if (error)
return error;
*curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 8f07c9f6157f..ac355328121a 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -198,6 +198,11 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
return x;
}
+static inline bool isaligned_64(uint64_t x, uint32_t y)
+{
+ return do_div(x, y) == 0;
+}
+
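do_div() is the kernel's divide-in-place helper: it divides the 64-bit dividend and returns the 32-bit remainder, so isaligned_64() is just a divisibility test that avoids a 64-bit modulo on 32-bit platforms. A userspace equivalent, for illustration only:

#include <stdbool.h>
#include <stdint.h>

static bool model_isaligned_64(uint64_t x, uint32_t y)
{
        return (x % y) == 0;    /* do_div(x, y) == 0 in kernel terms */
}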
/* If @b is a power of 2, return log2(b). Else return -1. */
static inline int8_t log2_if_power2(unsigned long b)
{
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5004f23d344e..416c15494983 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1448,7 +1448,7 @@ xfs_log_work_queue(
* Clear the log incompat flags if we have the opportunity.
*
* This only happens if we're about to log the second dummy transaction as part
- * of covering the log and we can get the log incompat feature usage lock.
+ * of covering the log.
*/
static inline void
xlog_clear_incompat(
@@ -1463,11 +1463,7 @@ xlog_clear_incompat(
if (log->l_covered_state != XLOG_STATE_COVER_DONE2)
return;
- if (!down_write_trylock(&log->l_incompat_users))
- return;
-
xfs_clear_incompat_log_features(mp);
- up_write(&log->l_incompat_users);
}
/*
@@ -1585,8 +1581,6 @@ xlog_alloc_log(
}
log->l_sectBBsize = 1 << log2_size;
- init_rwsem(&log->l_incompat_users);
-
xlog_get_iclog_buffer_size(mp, log);
spin_lock_init(&log->l_icloglock);
@@ -3871,23 +3865,3 @@ xfs_log_check_lsn(
return valid;
}
-
-/*
- * Notify the log that we're about to start using a feature that is protected
- * by a log incompat feature flag. This will prevent log covering from
- * clearing those flags.
- */
-void
-xlog_use_incompat_feat(
- struct xlog *log)
-{
- down_read(&log->l_incompat_users);
-}
-
-/* Notify the log that we've finished using log incompat features. */
-void
-xlog_drop_incompat_feat(
- struct xlog *log)
-{
- up_read(&log->l_incompat_users);
-}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2728886c2963..d69acf881153 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -159,8 +159,6 @@ bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
-void xlog_use_incompat_feat(struct xlog *log);
-void xlog_drop_incompat_feat(struct xlog *log);
int xfs_attr_use_log_assist(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 73f5b7f628f4..f51cbc6405c1 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -1378,7 +1378,7 @@ out_abort_free_ticket:
*/
static void
xlog_cil_push_background(
- struct xlog *log) __releases(cil->xc_ctx_lock)
+ struct xlog *log)
{
struct xfs_cil *cil = log->l_cilp;
int space_used = atomic_read(&cil->xc_ctx->space_used);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e30c06ec20e3..40e22ec0fbe6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -450,9 +450,6 @@ struct xlog {
xfs_lsn_t l_recovery_lsn;
uint32_t l_iclog_roundoff;/* padding roundoff */
-
- /* Users of log incompat features should take a read lock. */
- struct rw_semaphore l_incompat_users;
};
/*
@@ -623,7 +620,8 @@ xlog_wait(
remove_wait_queue(wq, &wait);
}
-int xlog_wait_on_iclog(struct xlog_in_core *iclog);
+int xlog_wait_on_iclog(struct xlog_in_core *iclog)
+ __releases(iclog->ic_log->l_icloglock);
/*
* The LSN is valid so long as it is behind the current LSN. If it isn't, this
@@ -683,7 +681,7 @@ xlog_valid_lsn(
* flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
* will do direct reclaim and compaction in the slow path, both of which are
* horrendously expensive. We just want kmalloc to fail fast and fall back to
- * vmalloc if it can't get somethign straight away from the free lists or
+ * vmalloc if it can't get something straight away from the free lists or
 * buddy allocator. Hence we have to open code kvmalloc ourselves here.
*
* This assumes that the caller uses memalloc_nofs_save task context here, so
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13f1d2e91540..4fe627991e86 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1767,6 +1767,37 @@ xlog_recover_iget(
return 0;
}
+/*
+ * Get an inode so that we can recover a log operation.
+ *
+ * Log intent items that target inodes effectively contain a file handle.
+ * Check that the generation number matches the intent item like we do for
+ * other file handles. Log intent items defined after this validation weakness
+ * was identified must use this function.
+ */
+int
+xlog_recover_iget_handle(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ uint32_t gen,
+ struct xfs_inode **ipp)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ error = xlog_recover_iget(mp, ino, &ip);
+ if (error)
+ return error;
+
+ if (VFS_I(ip)->i_generation != gen) {
+ xfs_irele(ip);
+ return -EFSCORRUPTED;
+ }
+
+ *ipp = ip;
+ return 0;
+}
+
/******************************************************************************
*
* Log recover routines
@@ -1789,6 +1820,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_bud_item_ops,
&xlog_attri_item_ops,
&xlog_attrd_item_ops,
+ &xlog_xmi_item_ops,
+ &xlog_xmd_item_ops,
};
static const struct xlog_recover_item_ops *
@@ -2656,7 +2689,7 @@ xlog_recover_clear_agi_bucket(
if (error)
goto out_error;
- error = xfs_read_agi(pag, tp, &agibp);
+ error = xfs_read_agi(pag, tp, 0, &agibp);
if (error)
goto out_abort;
@@ -2772,7 +2805,7 @@ xlog_recover_iunlink_ag(
int bucket;
int error;
- error = xfs_read_agi(pag, NULL, &agibp);
+ error = xfs_read_agi(pag, NULL, 0, &agibp);
if (error) {
/*
* AGI is b0rked. Don't process it.
@@ -2966,7 +2999,7 @@ xlog_do_recovery_pass(
int error = 0, h_size, h_len;
int error2 = 0;
int bblks, split_bblks;
- int hblks, split_hblks, wrapped_hblks;
+ int hblks = 1, split_hblks, wrapped_hblks;
int i;
struct hlist_head rhash[XLOG_RHASH_SIZE];
LIST_HEAD (buffer_list);
@@ -2977,6 +3010,10 @@ xlog_do_recovery_pass(
for (i = 0; i < XLOG_RHASH_SIZE; i++)
INIT_HLIST_HEAD(&rhash[i]);
+ hbp = xlog_alloc_buffer(log, hblks);
+ if (!hbp)
+ return -ENOMEM;
+
/*
* Read the header of the tail block and get the iclog buffer size from
* h_size. Use this to tell how many sectors make up the log header.
@@ -2987,10 +3024,6 @@ xlog_do_recovery_pass(
* iclog header and extract the header size from it. Get a
* new hbp that is the correct size.
*/
- hbp = xlog_alloc_buffer(log, 1);
- if (!hbp)
- return -ENOMEM;
-
error = xlog_bread(log, tail_blk, 1, hbp, &offset);
if (error)
goto bread_err1;
@@ -3022,20 +3055,27 @@ xlog_do_recovery_pass(
if (error)
goto bread_err1;
- hblks = xlog_logrec_hblks(log, rhead);
- if (hblks != 1) {
- kvfree(hbp);
- hbp = xlog_alloc_buffer(log, hblks);
+ /*
+ * This open codes xlog_logrec_hblks so that we can reuse the
+ * fixed up h_size value calculated above. Without that we'd
+ * still allocate the buffer based on the incorrect on-disk
+ * size.
+ */
+ if (h_size > XLOG_HEADER_CYCLE_SIZE &&
+ (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) {
+ hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
+ if (hblks > 1) {
+ kvfree(hbp);
+ hbp = xlog_alloc_buffer(log, hblks);
+ if (!hbp)
+ return -ENOMEM;
+ }
}
} else {
ASSERT(log->l_sectBBsize == 1);
- hblks = 1;
- hbp = xlog_alloc_buffer(log, 1);
h_size = XLOG_BIG_RECORD_BSIZE;
}
- if (!hbp)
- return -ENOMEM;
dbp = xlog_alloc_buffer(log, BTOBB(h_size));
if (!dbp) {
kvfree(hbp);
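For reference, the open-coded header sizing in the hunk above reduces to the following arithmetic: v2 logs with an iclog header larger than one cycle's worth need ceil(h_size / XLOG_HEADER_CYCLE_SIZE) header blocks, otherwise a single block suffices. XLOG_HEADER_CYCLE_SIZE is the real on-disk 32k constant; the function name is made up for the sketch:

#define XLOG_HEADER_CYCLE_SIZE  (32 * 1024)
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

static int model_logrec_hblks(int h_size, int version2)
{
        if (h_size > XLOG_HEADER_CYCLE_SIZE && version2)
                return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
        return 1;
}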
@@ -3496,21 +3536,6 @@ xlog_recover_finish(
*/
xfs_log_force(log->l_mp, XFS_LOG_SYNC);
- /*
- * Now that we've recovered the log and all the intents, we can clear
- * the log incompat feature bits in the superblock because there's no
- * longer anything to protect. We rely on the AIL push to write out the
- * updated superblock after everything else.
- */
- if (xfs_clear_incompat_log_features(log->l_mp)) {
- error = xfs_sync_sb(log->l_mp, false);
- if (error < 0) {
- xfs_alert(log->l_mp,
- "Failed to clear log incompat features on recovery");
- goto out_error;
- }
- }
-
xlog_recover_process_iunlinks(log);
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index df370eb5dc15..09eef1721ef4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -34,6 +34,7 @@
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -230,6 +231,13 @@ reread:
mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
+ /*
+ * If logged xattrs are enabled after log recovery finishes, then set
+ * the opstate so that log recovery will work properly.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ xfs_set_using_logged_xattrs(mp);
+
/* no need to be quiet anymore, so reset the buf ops */
bp->b_ops = &xfs_sb_buf_ops;
@@ -828,6 +836,15 @@ xfs_mountfs(
goto out_inodegc_shrinker;
}
+ /*
+ * If logged xattrs are still enabled after log recovery finishes, then
+ * they'll be available until unmount. Otherwise, turn them off.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ xfs_set_using_logged_xattrs(mp);
+ else
+ xfs_clear_using_logged_xattrs(mp);
+
/* Enable background inode inactivation workers. */
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
@@ -1095,6 +1112,11 @@ xfs_unmountfs(
"Freespace may not be correct on next mount.");
xfs_unmount_check(mp);
+ /*
+ * Indicate that it's ok to clear log incompat bits before cleaning
+ * the log and writing the unmount record.
+ */
+ xfs_set_done_with_log_incompat(mp);
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1131,16 +1153,44 @@ xfs_fs_writable(
return true;
}
-/* Adjust m_fdblocks or m_frextents. */
+void
+xfs_add_freecounter(
+ struct xfs_mount *mp,
+ struct percpu_counter *counter,
+ uint64_t delta)
+{
+ bool has_resv_pool = (counter == &mp->m_fdblocks);
+ uint64_t res_used;
+
+ /*
+ * If the reserve pool is depleted, put blocks back into it first.
+ * Most of the time the pool is full.
+ */
+ if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
+ percpu_counter_add(counter, delta);
+ return;
+ }
+
+ spin_lock(&mp->m_sb_lock);
+ res_used = mp->m_resblks - mp->m_resblks_avail;
+ if (res_used > delta) {
+ mp->m_resblks_avail += delta;
+ } else {
+ delta -= res_used;
+ mp->m_resblks_avail = mp->m_resblks;
+ percpu_counter_add(counter, delta);
+ }
+ spin_unlock(&mp->m_sb_lock);
+}
+
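A userspace model of the xfs_add_freecounter() refill policy above: returned blocks top up the reserve pool first, and only the excess reaches the free-space counter. Plain integers stand in for the percpu counter and the m_sb_lock; the struct and function names are invented:

#include <stdint.h>
#include <stdio.h>

struct model_mount {
        uint64_t resblks;       /* reserve pool size */
        uint64_t resblks_avail; /* reserve pool fill level */
        int64_t  fdblocks;      /* free-space counter */
};

static void model_add_freecounter(struct model_mount *mp, uint64_t delta)
{
        uint64_t res_used = mp->resblks - mp->resblks_avail;

        if (res_used == 0) {            /* pool full: the common case */
                mp->fdblocks += delta;
        } else if (res_used > delta) {  /* all of delta refills the pool */
                mp->resblks_avail += delta;
        } else {                        /* refill pool, rest goes free */
                mp->resblks_avail = mp->resblks;
                mp->fdblocks += delta - res_used;
        }
}

int main(void)
{
        struct model_mount mp = { .resblks = 8192, .resblks_avail = 8000 };

        model_add_freecounter(&mp, 500);        /* 192 to pool, 308 free */
        printf("avail=%llu free=%lld\n",
               (unsigned long long)mp.resblks_avail, (long long)mp.fdblocks);
        return 0;
}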
int
-xfs_mod_freecounter(
+xfs_dec_freecounter(
struct xfs_mount *mp,
struct percpu_counter *counter,
- int64_t delta,
+ uint64_t delta,
bool rsvd)
{
int64_t lcounter;
- long long res_used;
uint64_t set_aside = 0;
s32 batch;
bool has_resv_pool;
@@ -1150,31 +1200,6 @@ xfs_mod_freecounter(
if (rsvd)
ASSERT(has_resv_pool);
- if (delta > 0) {
- /*
- * If the reserve pool is depleted, put blocks back into it
- * first. Most of the time the pool is full.
- */
- if (likely(!has_resv_pool ||
- mp->m_resblks == mp->m_resblks_avail)) {
- percpu_counter_add(counter, delta);
- return 0;
- }
-
- spin_lock(&mp->m_sb_lock);
- res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
-
- if (res_used > delta) {
- mp->m_resblks_avail += delta;
- } else {
- delta -= res_used;
- mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(counter, delta);
- }
- spin_unlock(&mp->m_sb_lock);
- return 0;
- }
-
/*
* Taking blocks away, need to be more accurate the closer we
* are to zero.
@@ -1202,7 +1227,7 @@ xfs_mod_freecounter(
*/
if (has_resv_pool)
set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(counter, delta, batch);
+ percpu_counter_add_batch(counter, -((int64_t)delta), batch);
if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
@@ -1214,11 +1239,11 @@ xfs_mod_freecounter(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- percpu_counter_add(counter, -delta);
+ percpu_counter_add(counter, delta);
if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
- lcounter = (long long)mp->m_resblks_avail + delta;
+ lcounter = (long long)mp->m_resblks_avail - delta;
if (lcounter >= 0) {
mp->m_resblks_avail = lcounter;
spin_unlock(&mp->m_sb_lock);
@@ -1364,7 +1389,8 @@ xfs_clear_incompat_log_features(
if (!xfs_has_crc(mp) ||
!xfs_sb_has_incompat_log_feature(&mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
- xfs_is_shutdown(mp))
+ xfs_is_shutdown(mp) ||
+ !xfs_is_done_with_log_incompat(mp))
return false;
/*
@@ -1399,9 +1425,20 @@ xfs_clear_incompat_log_features(
#define XFS_DELALLOC_BATCH (4096)
void
xfs_mod_delalloc(
- struct xfs_mount *mp,
- int64_t delta)
+ struct xfs_inode *ip,
+ int64_t data_delta,
+ int64_t ind_delta)
{
- percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ percpu_counter_add_batch(&mp->m_delalloc_rtextents,
+ xfs_rtb_to_rtx(mp, data_delta),
+ XFS_DELALLOC_BATCH);
+ if (!ind_delta)
+ return;
+ data_delta = 0;
+ }
+ percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta,
XFS_DELALLOC_BATCH);
}
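The reworked xfs_mod_delalloc() splits the accounting by device: for realtime inodes the data blocks are tracked as rt extents, and only the indirect (bmbt) reservation stays in the block counter. A sketch of that split, with an invented rtb_to_rtx() standing in for xfs_rtb_to_rtx(), which converts rt blocks to rt extents (note a real conversion would need care with negative deltas):

#include <stdint.h>

struct model_counters {
        int64_t delalloc_rtextents;
        int64_t delalloc_blks;
};

static int64_t rtb_to_rtx(int64_t rtblocks, uint32_t rextsize)
{
        return rtblocks / (int64_t)rextsize;
}

static void model_mod_delalloc(struct model_counters *c, int is_rt,
                               uint32_t rextsize, int64_t data_delta,
                               int64_t ind_delta)
{
        if (is_rt) {
                c->delalloc_rtextents += rtb_to_rtx(data_delta, rextsize);
                if (!ind_delta)
                        return;
                data_delta = 0;
        }
        c->delalloc_blks += data_delta + ind_delta;
}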
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e880aa48de68..d0567dfbc036 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -195,6 +195,12 @@ typedef struct xfs_mount {
* extents or anything related to the rt device.
*/
struct percpu_counter m_delalloc_blks;
+
+ /*
+ * RT version of the above.
+ */
+ struct percpu_counter m_delalloc_rtextents;
+
/*
* Global count of allocation btree blocks in use across all AGs. Only
* used when perag reservation is enabled. Helps prevent block
@@ -292,6 +298,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
+#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -331,19 +338,10 @@ static inline void xfs_add_ ## name (struct xfs_mount *mp) \
__XFS_ADD_FEAT(attr, ATTR)
__XFS_HAS_FEAT(nlink, NLINK)
__XFS_ADD_FEAT(quota, QUOTA)
-__XFS_HAS_FEAT(align, ALIGN)
__XFS_HAS_FEAT(dalign, DALIGN)
-__XFS_HAS_FEAT(logv2, LOGV2)
__XFS_HAS_FEAT(sector, SECTOR)
-__XFS_HAS_FEAT(extflg, EXTFLG)
__XFS_HAS_FEAT(asciici, ASCIICI)
-__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT)
-__XFS_ADD_FEAT(attr2, ATTR2)
__XFS_HAS_FEAT(parent, PARENT)
-__XFS_ADD_FEAT(projid32, PROJID32)
-__XFS_HAS_FEAT(crc, CRC)
-__XFS_HAS_FEAT(v3inodes, V3INODES)
-__XFS_HAS_FEAT(pquotino, PQUOTINO)
__XFS_HAS_FEAT(ftype, FTYPE)
__XFS_HAS_FEAT(finobt, FINOBT)
__XFS_HAS_FEAT(rmapbt, RMAPBT)
@@ -355,6 +353,38 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
+__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
+
+/*
+ * Some features are always on for v5 file systems, allow the compiler to
+ * eliminate dead code when building without v4 support.
+ */
+#define __XFS_HAS_V4_FEAT(name, NAME) \
+static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
+{ \
+ return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) || \
+ (mp->m_features & XFS_FEAT_ ## NAME); \
+}
+
+#define __XFS_ADD_V4_FEAT(name, NAME) \
+ __XFS_HAS_V4_FEAT(name, NAME); \
+static inline void xfs_add_ ## name (struct xfs_mount *mp) \
+{ \
+ if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { \
+ mp->m_features |= XFS_FEAT_ ## NAME; \
+ xfs_sb_version_add ## name(&mp->m_sb); \
+ } \
+}
+
+__XFS_HAS_V4_FEAT(align, ALIGN)
+__XFS_HAS_V4_FEAT(logv2, LOGV2)
+__XFS_HAS_V4_FEAT(extflg, EXTFLG)
+__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT)
+__XFS_ADD_V4_FEAT(attr2, ATTR2)
+__XFS_ADD_V4_FEAT(projid32, PROJID32)
+__XFS_HAS_V4_FEAT(v3inodes, V3INODES)
+__XFS_HAS_V4_FEAT(crc, CRC)
+__XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
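For clarity, this is roughly what __XFS_HAS_V4_FEAT(crc, CRC) expands to: with CONFIG_XFS_SUPPORT_V4 disabled, IS_ENABLED() is constant-false, the helper folds to true, and the compiler can drop the V4-only branches it guards:

static inline bool xfs_has_crc(struct xfs_mount *mp)
{
        return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) ||
               (mp->m_features & XFS_FEAT_CRC);
}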
/*
* Mount features
@@ -412,6 +442,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_LARP 9
/* Mount time quotacheck is running */
#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
+/* Do we want to clear log incompat flags? */
+#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11
+/* Filesystem can use logged extended attributes */
+#define XFS_OPSTATE_USE_LARP 12
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -439,6 +473,8 @@ __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
#else
# define xfs_is_quotacheck_running(mp) (false)
#endif
+__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
+__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -457,7 +493,9 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
- { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }
+ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \
+ { (1UL << XFS_OPSTATE_UNSET_LOG_INCOMPAT), "unset_log_incompat" }, \
+ { (1UL << XFS_OPSTATE_USE_LARP), "logged_xattrs" }
/*
* Max and min values for mount-option defined I/O
@@ -534,19 +572,30 @@ xfs_fdblocks_unavailable(
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
-int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
- int64_t delta, bool rsvd);
+int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ uint64_t delta, bool rsvd);
+void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ uint64_t delta);
+
+static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
+ bool reserved)
+{
+ return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
+{
+ xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
+}
-static inline int
-xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
- return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+ return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
}
-static inline int
-xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
- return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+ xfs_add_freecounter(mp, &mp->m_frextents, delta);
}
extern int xfs_readsb(xfs_mount_t *, int);
@@ -566,6 +615,7 @@ struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
void xfs_force_summary_recalc(struct xfs_mount *mp);
int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
-void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
+void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
+ int64_t ind_delta);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 0f4cf4170c35..47120b745c47 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -836,8 +836,10 @@ xfs_qm_qino_alloc(
ASSERT(xfs_is_shutdown(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
}
- if (need_alloc)
+ if (need_alloc) {
+ xfs_iunlock(*ipp, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(*ipp);
+ }
return error;
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index f5993012bf98..6e09dfcd13e2 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -136,7 +136,7 @@ enum {
XFS_QM_TRANS_PRJ,
XFS_QM_TRANS_DQTYPES
};
-#define XFS_QM_TRANS_MAXDQS 2
+#define XFS_QM_TRANS_MAXDQS 5
struct xfs_dquot_acct {
struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
};
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 85a4ae1a17f6..23d71a55bbc0 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -123,12 +123,6 @@ extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
-
-static inline int
-xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
-{
- return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
-}
bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
# ifdef CONFIG_XFS_LIVE_HOOKS
@@ -188,12 +182,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
}
static inline int
-xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
-{
- return 0;
-}
-
-static inline int
xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks)
{
@@ -222,9 +210,16 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#endif /* CONFIG_XFS_QUOTA */
static inline int
-xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks)
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}
+
+static inline void
+xfs_quota_unreserve_blkres(struct xfs_inode *ip, uint64_t blocks)
{
- return xfs_quota_reserve_blkres(ip, -blocks);
+ /* don't return an error as unreserving quotas can't fail */
+ xfs_quota_reserve_blkres(ip, -(int64_t)blocks);
}
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 7da0e8f961d3..063a2e00d169 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -430,13 +430,6 @@ xfs_reflink_fill_cow_hole(
if (error)
return error;
- /*
- * Allocation succeeded but the requested range was not even partially
- * satisfied? Bail out!
- */
- if (nimaps == 0)
- return -ENOSPC;
-
convert:
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -499,13 +492,6 @@ xfs_reflink_fill_delalloc(
error = xfs_trans_commit(tp);
if (error)
return error;
-
- /*
- * Allocation succeeded but the requested range was not even
- * partially satisfied? Bail out!
- */
- if (nimaps == 0)
- return -ENOSPC;
} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -606,10 +592,8 @@ xfs_reflink_cancel_cow_blocks(
trace_xfs_reflink_cancel_cow(ip, &del);
if (isnullstartblock(del.br_startblock)) {
- error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
- &icur, &got, &del);
- if (error)
- break;
+ xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
+ &del);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -632,10 +616,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */
- error = xfs_quota_unreserve_blkres(ip,
- del.br_blockcount);
- if (error)
- break;
+ xfs_quota_unreserve_blkres(ip, del.br_blockcount);
} else {
/* Didn't do anything, push cursor back. */
xfs_iext_prev(ifp, &icur);
@@ -731,12 +712,6 @@ xfs_reflink_end_cow_extent(
int nmaps;
int error;
- /* No COW extents? That's easy! */
- if (ifp->if_bytes == 0) {
- *offset_fsb = end_fsb;
- return 0;
- }
-
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
XFS_TRANS_RESERVE, &tp);
@@ -751,14 +726,6 @@ xfs_reflink_end_cow_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
- XFS_IEXT_REFLINK_END_COW_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_REFLINK_END_COW_CNT);
- if (error)
- goto out_cancel;
-
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -787,6 +754,11 @@ xfs_reflink_end_cow_extent(
del = got;
xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ goto out_cancel;
+
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
@@ -1283,9 +1255,7 @@ xfs_reflink_remap_extent(
if (dmap_written)
++iext_delta;
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, iext_delta);
if (error)
goto out_cancel;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e66f9bd5de5c..5a7ddfed1bb8 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -695,11 +695,8 @@ xfs_growfs_rt_alloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error == -EFBIG)
- error = xfs_iext_count_upgrade(tp, ip,
- XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -709,8 +706,6 @@ xfs_growfs_rt_alloc(
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_METADATA, 0, &map, &nmap);
- if (!error && nmap < 1)
- error = -ENOSPC;
if (error)
goto out_trans_cancel;
/*
@@ -957,10 +952,10 @@ xfs_growfs_rt(
nargs.tp = tp;
/*
- * Lock out other callers by grabbing the bitmap inode lock.
+ * Lock out other callers by grabbing the bitmap and summary
+ * inode locks and joining them to the transaction.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_rtbitmap_lock(tp, mp);
/*
* Update the bitmap inode's size ondisk and incore. We need
* to update the incore size so that inode inactivation won't
@@ -971,11 +966,6 @@ xfs_growfs_rt(
i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
/*
- * Get the summary inode into the transaction.
- */
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
- /*
* Update the summary inode's size. We need to update the
* incore size so that inode inactivation won't punch what it
* thinks are "posteof" blocks.
@@ -1142,10 +1132,10 @@ xfs_rtalloc_reinit_frextents(
uint64_t val = 0;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
&val);
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
if (error)
return error;
@@ -1346,6 +1336,8 @@ xfs_bmap_rtalloc(
int error;
align = xfs_get_extsz_hint(ap->ip);
+ if (!align)
+ align = 1;
retry:
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
@@ -1382,10 +1374,7 @@ retry:
* Lock out modifications to both the RT bitmap and summary inodes
*/
if (!rtlocked) {
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_rtbitmap_lock(ap->tp, mp);
rtlocked = true;
}
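Each of the rtalloc hunks swaps four lines of open-coded locking for one wrapper call. Reconstructed purely from the removed lines, the exclusive-lock wrapper plausibly looks like the sketch below; the shared variants (xfs_rtbitmap_lock_shared/_unlock_shared with an XFS_RBMLOCK_BITMAP selector) would follow the same shape without the transaction join. The real helpers live elsewhere in XFS and may differ:

/*
 * Sketch assembled from the removed call-site code: take both realtime
 * metadata inode locks in a fixed order and join them to the transaction.
 */
static void
xfs_rtbitmap_lock_sketch(
	struct xfs_trans	*tp,
	struct xfs_mount	*mp)
{
	/* Bitmap inode first, then summary inode: a fixed order avoids ABBA. */
	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
	xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
	xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
}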
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bce020374c5e..27e9f749c4c7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -43,6 +43,8 @@
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
#include "xfs_rtbitmap.h"
+#include "xfs_exchmaps_item.h"
+#include "xfs_parent.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@@ -1051,12 +1053,18 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
- error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc_rt;
+
return 0;
+free_delalloc_rt:
+ percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
@@ -1086,6 +1094,9 @@ xfs_destroy_percpu_counters(
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
+ percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
+ percpu_counter_destroy(&mp->m_delalloc_rtextents);
+ ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
percpu_counter_destroy(&mp->m_frextents);
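The init/destroy hunks above extend the standard goto-unwind ladder for the new m_delalloc_rtextents counter: each counter initialized after a successful predecessor gets a label that tears down everything before it, in reverse order. The same pattern in miniature (a generic sketch, not the full XFS function):

static int init_two_counters(struct percpu_counter *a,
			     struct percpu_counter *b)
{
	int error;

	error = percpu_counter_init(a, 0, GFP_KERNEL);
	if (error)
		return error;

	error = percpu_counter_init(b, 0, GFP_KERNEL);
	if (error)
		goto free_a;	/* unwind in reverse order of setup */

	return 0;

free_a:
	percpu_counter_destroy(a);
	return error;
}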
@@ -1579,17 +1590,21 @@ xfs_fs_fill_super(
if (error)
goto out_free_sb;
- /* V4 support is undergoing deprecation. */
- if (!xfs_has_crc(mp)) {
-#ifdef CONFIG_XFS_SUPPORT_V4
+ /*
+ * V4 support is undergoing deprecation.
+ *
+ * Note: this has to use an open coded m_features check as xfs_has_crc
+ * always returns false for !CONFIG_XFS_SUPPORT_V4.
+ */
+ if (!(mp->m_features & XFS_FEAT_CRC)) {
+ if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) {
+ xfs_warn(mp,
+ "Deprecated V4 format (crc=0) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+ }
xfs_warn_once(mp,
"Deprecated V4 format (crc=0) will not be supported after September 2030.");
-#else
- xfs_warn(mp,
- "Deprecated V4 format (crc=0) not supported by kernel.");
- error = -EINVAL;
- goto out_free_sb;
-#endif
}
/* ASCII case insensitivity is undergoing deprecation. */
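The V4 deprecation hunk also converts an #ifdef block into an IS_ENABLED() check, so both branches are always compiled (catching bitrot under either config) and the dead branch is discarded by the optimizer. The shape of the conversion, with CONFIG_SOME_FEATURE and do_supported_thing() as placeholder names:

/*
 * Placeholder sketch of the #ifdef-to-IS_ENABLED() pattern: the disabled
 * branch is still parsed and type-checked, then optimized away.
 */
static int use_feature(void)
{
	if (!IS_ENABLED(CONFIG_SOME_FEATURE)) {
		pr_warn("feature not supported by this kernel\n");
		return -EINVAL;
	}
	do_supported_thing();	/* compiled even when the config is off */
	return 0;
}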
@@ -1727,6 +1742,14 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_exchange_range(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL exchange-range feature enabled. Use at your own risk!");
+
+ if (xfs_has_parent(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL parent pointer feature enabled. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1873,11 +1896,7 @@ xfs_remount_ro(
xfs_inodegc_stop(mp);
/* Free the per-AG metadata reservation pool. */
- error = xfs_fs_unreserve_ag_blocks(mp);
- if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
+ xfs_fs_unreserve_ag_blocks(mp);
/*
* Before we sync the metadata, we need to free up the reserve block
@@ -2185,8 +2204,32 @@ xfs_init_caches(void)
if (!xfs_iunlink_cache)
goto out_destroy_attri_cache;
+ xfs_xmd_cache = kmem_cache_create("xfs_xmd_item",
+ sizeof(struct xfs_xmd_log_item),
+ 0, 0, NULL);
+ if (!xfs_xmd_cache)
+ goto out_destroy_iul_cache;
+
+ xfs_xmi_cache = kmem_cache_create("xfs_xmi_item",
+ sizeof(struct xfs_xmi_log_item),
+ 0, 0, NULL);
+ if (!xfs_xmi_cache)
+ goto out_destroy_xmd_cache;
+
+ xfs_parent_args_cache = kmem_cache_create("xfs_parent_args",
+ sizeof(struct xfs_parent_args),
+ 0, 0, NULL);
+ if (!xfs_parent_args_cache)
+ goto out_destroy_xmi_cache;
+
return 0;
+ out_destroy_xmi_cache:
+ kmem_cache_destroy(xfs_xmi_cache);
+ out_destroy_xmd_cache:
+ kmem_cache_destroy(xfs_xmd_cache);
+ out_destroy_iul_cache:
+ kmem_cache_destroy(xfs_iunlink_cache);
out_destroy_attri_cache:
kmem_cache_destroy(xfs_attri_cache);
out_destroy_attrd_cache:
@@ -2243,6 +2286,9 @@ xfs_destroy_caches(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_cache_destroy(xfs_parent_args_cache);
+ kmem_cache_destroy(xfs_xmd_cache);
+ kmem_cache_destroy(xfs_xmi_cache);
kmem_cache_destroy(xfs_iunlink_cache);
kmem_cache_destroy(xfs_attri_cache);
kmem_cache_destroy(xfs_attrd_cache);
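As the surrounding context notes, rcu_barrier() runs before any kmem_cache_destroy() so that objects freed via RCU callbacks have actually been returned to their caches, and the three new caches come down in the reverse of their creation order, mirroring the unwind labels added to xfs_init_caches(). The teardown pattern in miniature (generic illustration only):

static void destroy_caches(struct kmem_cache *first,
			   struct kmem_cache *second)
{
	rcu_barrier();			/* flush call_rcu()/kfree_rcu() frees */
	kmem_cache_destroy(second);	/* reverse of creation order */
	kmem_cache_destroy(first);
}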
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 3e376d24c7c1..17aee806ec2e 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -25,6 +25,8 @@
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_symlink_remote.h"
+#include "xfs_parent.h"
+#include "xfs_defer.h"
int
xfs_readlink(
@@ -100,6 +102,7 @@ xfs_symlink(
struct xfs_dquot *pdqp = NULL;
uint resblks;
xfs_ino_t ino;
+ struct xfs_parent_args *ppargs;
*ipp = NULL;
@@ -130,18 +133,24 @@ xfs_symlink(
/*
* The symlink will fit into the inode data fork?
- * There can't be any attributes so we get the whole variable part.
+	 * If there are no parent pointers, then there won't be any attributes,
+	 * so we get the whole variable part and do not need to reserve extra
+	 * blocks. Otherwise, we need to reserve the blocks.
*/
- if (pathlen <= XFS_LITINO(mp))
+ if (pathlen <= XFS_LITINO(mp) && !xfs_has_parent(mp))
fs_blocks = 0;
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
- resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+ resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks);
+
+ error = xfs_parent_start(mp, &ppargs);
+ if (error)
+ goto out_release_dquots;
error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
pdqp, resblks, &tp);
if (error)
- goto out_release_dquots;
+ goto out_parent;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -161,7 +170,7 @@ xfs_symlink(
if (!error)
error = xfs_init_new_inode(idmap, tp, dp, ino,
S_IFLNK | (mode & ~S_IFMT), 1, 0, prid,
- false, &ip);
+ xfs_has_parent(mp), &ip);
if (error)
goto out_trans_cancel;
@@ -172,8 +181,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = false;
+ xfs_trans_ijoin(tp, dp, 0);
/*
* Also attach the dquot(s) to it, if applicable.
@@ -181,8 +189,8 @@ xfs_symlink(
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
resblks -= XFS_IALLOC_SPACE_RES(mp);
- error = xfs_symlink_write_target(tp, ip, target_path, pathlen,
- fs_blocks, resblks);
+ error = xfs_symlink_write_target(tp, ip, ip->i_ino, target_path,
+ pathlen, fs_blocks, resblks);
if (error)
goto out_trans_cancel;
resblks -= fs_blocks;
@@ -196,6 +204,14 @@ xfs_symlink(
goto out_trans_cancel;
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ /* Add parent pointer for the new symlink. */
+ if (ppargs) {
+ error = xfs_parent_addname(tp, ppargs, dp, link_name, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
xfs_dir_update_hook(dp, ip, 1, link_name);
/*
@@ -215,6 +231,9 @@ xfs_symlink(
xfs_qm_dqrele(pdqp);
*ipp = ip;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_parent_finish(mp, ppargs);
return 0;
out_trans_cancel:
@@ -226,9 +245,12 @@ out_release_inode:
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
+out_parent:
+ xfs_parent_finish(mp, ppargs);
out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
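Taken together, the symlink hunks bracket the whole operation with a parent-pointer lifecycle: xfs_parent_start() before the transaction is allocated, xfs_parent_addname() once the directory entry exists, and xfs_parent_finish() on both the success and error paths. A condensed sketch using only the calls visible above, with all surrounding setup elided behind comments:

static int
symlink_parent_bracket_sketch(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_inode	*dp,
	struct xfs_name		*link_name,
	struct xfs_inode	*ip)
{
	struct xfs_parent_args	*ppargs;
	int			error;

	/* ppargs stays NULL when the parent-pointer feature is disabled. */
	error = xfs_parent_start(mp, &ppargs);
	if (error)
		return error;

	/* ... allocate transaction, init the inode, create the dirent ... */

	if (ppargs) {
		error = xfs_parent_addname(tp, ppargs, dp, link_name, ip);
		if (error)
			goto out;
	}
out:
	xfs_parent_finish(mp, ppargs);	/* runs on success and error alike */
	return error;
}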
@@ -250,19 +272,12 @@ out_release_dquots:
*/
STATIC int
xfs_inactive_symlink_rmt(
- struct xfs_inode *ip)
+ struct xfs_inode *ip)
{
- struct xfs_buf *bp;
- int done;
- int error;
- int i;
- xfs_mount_t *mp;
- xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
- int nmaps;
- int size;
- xfs_trans_t *tp;
-
- mp = ip->i_mount;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
ASSERT(!xfs_need_iread_extents(&ip->i_df));
/*
* We're freeing a symlink that has some
@@ -286,44 +301,14 @@ xfs_inactive_symlink_rmt(
* locked for the second transaction. In the error paths we need it
* held so the cancel won't rele it, see below.
*/
- size = (int)ip->i_disk_size;
ip->i_disk_size = 0;
VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /*
- * Find the block(s) so we can inval and unmap them.
- */
- done = 0;
- nmaps = ARRAY_SIZE(mval);
- error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
- mval, &nmaps, 0);
- if (error)
- goto error_trans_cancel;
- /*
- * Invalidate the block(s). No validation is done.
- */
- for (i = 0; i < nmaps; i++) {
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
- XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
- &bp);
- if (error)
- goto error_trans_cancel;
- xfs_trans_binval(tp, bp);
- }
- /*
- * Unmap the dead block(s) to the dfops.
- */
- error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, &done);
+
+ error = xfs_symlink_remote_truncate(tp, ip);
if (error)
goto error_trans_cancel;
- ASSERT(done);
- /*
- * Commit the transaction. This first logs the EFI and the inode, then
- * rolls and commits the transaction that frees the extents.
- */
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
if (error) {
ASSERT(xfs_is_shutdown(mp));
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 1a963382e5e9..9c7fbaae2717 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -39,6 +39,9 @@
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
#include "xfs_bmap.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_parent.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aea97fc074f8..25ff6fe1eb6c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -31,6 +31,8 @@
* pos: file offset, in bytes
* bytecount: number of bytes
*
+ * dablk: directory or xattr block offset, in filesystem blocks
+ *
* disize: ondisk file size, in bytes
* isize: incore file size, in bytes
*
@@ -82,11 +84,18 @@ struct xfs_perag;
struct xfbtree;
struct xfs_btree_ops;
struct xfs_bmap_intent;
+struct xfs_exchmaps_intent;
+struct xfs_exchmaps_req;
+struct xfs_exchrange;
+struct xfs_getparents;
+struct xfs_parent_irec;
+struct xfs_attrlist_cursor_kern;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
{ XFS_ATTR_SECURE, "SECURE" }, \
- { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
+ { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }, \
+ { XFS_ATTR_PARENT, "PARENT" }
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -159,7 +168,7 @@ TRACE_EVENT(xlog_intent_recovery_failed,
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __assign_str(name, ops->name);
+ __assign_str(name);
__entry->error = error;
),
TP_printk("dev %d:%d optype %s error %d",
@@ -1654,7 +1663,6 @@ DEFINE_EVENT(xfs_extent_busy_class, name, \
xfs_agblock_t agbno, xfs_extlen_t len), \
TP_ARGS(mp, agno, agbno, len))
DEFINE_BUSY_EVENT(xfs_extent_busy);
-DEFINE_BUSY_EVENT(xfs_extent_busy_enomem);
DEFINE_BUSY_EVENT(xfs_extent_busy_force);
DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
@@ -1905,7 +1913,7 @@ TRACE_EVENT(xfs_alloc_cur_check,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->bno = bno;
__entry->len = len;
__entry->diff = diff;
@@ -1928,6 +1936,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__field(xfs_dahash_t, hashval)
__field(xfs_ino_t, inumber)
__field(uint32_t, op_flags)
+ __field(xfs_ino_t, owner)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1938,9 +1947,10 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__entry->hashval = args->hashval;
__entry->inumber = args->inumber;
__entry->op_flags = args->op_flags;
+ __entry->owner = args->owner;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
- "inumber 0x%llx op_flags %s",
+ "inumber 0x%llx op_flags %s owner 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -1948,7 +1958,8 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__entry->namelen,
__entry->hashval,
__entry->inumber,
- __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+ __entry->owner)
)
#define DEFINE_DIR2_EVENT(name) \
@@ -1992,7 +2003,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(int, valuelen)
__field(xfs_dahash_t, hashval)
__field(unsigned int, attr_filter)
- __field(unsigned int, attr_flags)
__field(uint32_t, op_flags)
),
TP_fast_assign(
@@ -2004,11 +2014,10 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->valuelen = args->valuelen;
__entry->hashval = args->hashval;
__entry->attr_filter = args->attr_filter;
- __entry->attr_flags = args->attr_flags;
__entry->op_flags = args->op_flags;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
- "hashval 0x%x filter %s flags %s op_flags %s",
+ "hashval 0x%x filter %s op_flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -2018,9 +2027,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->hashval,
__print_flags(__entry->attr_filter, "|",
XFS_ATTR_FILTER_FLAGS),
- __print_flags(__entry->attr_flags, "|",
- { XATTR_CREATE, "CREATE" },
- { XATTR_REPLACE, "REPLACE" }),
__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
)
@@ -2467,7 +2473,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
@@ -2517,7 +2523,7 @@ TRACE_EVENT(xfs_btree_alloc_block,
__entry->ino = 0;
break;
}
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->error = error;
if (!error && stat) {
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
@@ -2561,7 +2567,7 @@ TRACE_EVENT(xfs_btree_free_block,
__entry->ino = cur->bc_ino.ip->i_ino;
else
__entry->ino = 0;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agbno = xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp));
),
@@ -2637,7 +2643,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __assign_str(name, dfp->dfp_ops->name);
+ __assign_str(name);
__entry->intent = dfp->dfp_intent;
__entry->flags = dfp->dfp_flags;
__entry->committed = dfp->dfp_done != NULL;
@@ -2726,7 +2732,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __assign_str(name, dfp->dfp_ops->name);
+ __assign_str(name);
__entry->intent = dfp->dfp_intent;
__entry->item = item;
__entry->committed = dfp->dfp_done != NULL;
@@ -3062,7 +3068,6 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
-DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
/* refcount tracepoint classes */
@@ -4239,7 +4244,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = cur->bc_ag.afake->af_root;
__entry->levels = cur->bc_ag.afake->af_levels;
@@ -4268,7 +4273,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
cur->bc_ino.ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
@@ -4307,7 +4312,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->nr_this_level = nr_this_level;
@@ -4345,7 +4350,7 @@ TRACE_EVENT(xfs_btree_bload_block,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __assign_str(name, cur->bc_ops->name);
+ __assign_str(name);
__entry->level = level;
__entry->block_idx = block_idx;
__entry->nr_blocks = nr_blocks;
@@ -4568,7 +4573,7 @@ TRACE_EVENT(xfs_force_shutdown,
__entry->dev = mp->m_super->s_dev;
__entry->ptag = ptag;
__entry->flags = flags;
- __assign_str(fname, fname);
+ __assign_str(fname);
__entry->line_num = line_num;
),
TP_printk("dev %d:%d tag %s flags %s file %s line_num %d",
@@ -4750,7 +4755,7 @@ DECLARE_EVENT_CLASS(xfbtree_freesp_class,
),
TP_fast_assign(
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
- __assign_str(btname, cur->bc_ops->name);
+ __assign_str(btname);
__entry->nlevels = cur->bc_nlevels;
__entry->fileoff = fileoff;
),
@@ -4770,6 +4775,419 @@ DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
#endif /* CONFIG_XFS_BTREE_IN_MEM */
+/* exchmaps tracepoints */
+#define XFS_EXCHMAPS_STRINGS \
+ { XFS_EXCHMAPS_ATTR_FORK, "ATTRFORK" }, \
+ { XFS_EXCHMAPS_SET_SIZES, "SETSIZES" }, \
+ { XFS_EXCHMAPS_INO1_WRITTEN, "INO1_WRITTEN" }, \
+ { XFS_EXCHMAPS_CLEAR_INO1_REFLINK, "CLEAR_INO1_REFLINK" }, \
+ { XFS_EXCHMAPS_CLEAR_INO2_REFLINK, "CLEAR_INO2_REFLINK" }, \
+ { __XFS_EXCHMAPS_INO2_SHORTFORM, "INO2_SF" }
+
+DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1_skip);
+DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1);
+DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping2);
+DEFINE_ITRUNC_EVENT(xfs_exchmaps_update_inode_size);
+
+#define XFS_EXCHRANGE_INODES \
+ { 1, "file1" }, \
+ { 2, "file2" }
+
+DECLARE_EVENT_CLASS(xfs_exchrange_inode_class,
+ TP_PROTO(struct xfs_inode *ip, int whichfile),
+ TP_ARGS(ip, whichfile),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, whichfile)
+ __field(xfs_ino_t, ino)
+ __field(int, format)
+ __field(xfs_extnum_t, nex)
+ __field(int, broot_size)
+ __field(int, fork_off)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->whichfile = whichfile;
+ __entry->ino = ip->i_ino;
+ __entry->format = ip->i_df.if_format;
+ __entry->nex = ip->i_df.if_nextents;
+ __entry->fork_off = xfs_inode_fork_boff(ip);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfile %s format %s num_extents %llu forkoff 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfile, XFS_EXCHRANGE_INODES),
+ __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+ __entry->nex,
+ __entry->fork_off)
+)
+
+#define DEFINE_EXCHRANGE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_exchrange_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip, int whichfile), \
+ TP_ARGS(ip, whichfile))
+
+DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_before);
+DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_after);
+DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
+
+#define XFS_EXCHANGE_RANGE_FLAGS_STRS \
+ { XFS_EXCHANGE_RANGE_TO_EOF, "TO_EOF" }, \
+	{ XFS_EXCHANGE_RANGE_DSYNC, "DSYNC" }, \
+ { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \
+ { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \
+ { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \
+ { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }
+
+/* file exchange-range tracepoint class */
+DECLARE_EVENT_CLASS(xfs_exchrange_class,
+ TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1,
+ struct xfs_inode *ip2),
+ TP_ARGS(fxr, ip1, ip2),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ip1_ino)
+ __field(loff_t, ip1_isize)
+ __field(loff_t, ip1_disize)
+ __field(xfs_ino_t, ip2_ino)
+ __field(loff_t, ip2_isize)
+ __field(loff_t, ip2_disize)
+
+ __field(loff_t, file1_offset)
+ __field(loff_t, file2_offset)
+ __field(unsigned long long, length)
+ __field(unsigned long long, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip1)->i_sb->s_dev;
+ __entry->ip1_ino = ip1->i_ino;
+ __entry->ip1_isize = VFS_I(ip1)->i_size;
+ __entry->ip1_disize = ip1->i_disk_size;
+ __entry->ip2_ino = ip2->i_ino;
+ __entry->ip2_isize = VFS_I(ip2)->i_size;
+ __entry->ip2_disize = ip2->i_disk_size;
+
+ __entry->file1_offset = fxr->file1_offset;
+ __entry->file2_offset = fxr->file2_offset;
+ __entry->length = fxr->length;
+ __entry->flags = fxr->flags;
+ ),
+ TP_printk("dev %d:%d flags %s bytecount 0x%llx "
+ "ino1 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> "
+ "ino2 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags_u64(__entry->flags, "|", XFS_EXCHANGE_RANGE_FLAGS_STRS),
+ __entry->length,
+ __entry->ip1_ino,
+ __entry->ip1_isize,
+ __entry->ip1_disize,
+ __entry->file1_offset,
+ __entry->ip2_ino,
+ __entry->ip2_isize,
+ __entry->ip2_disize,
+ __entry->file2_offset)
+)
+
+#define DEFINE_EXCHRANGE_EVENT(name) \
+DEFINE_EVENT(xfs_exchrange_class, name, \
+ TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, \
+ struct xfs_inode *ip2), \
+ TP_ARGS(fxr, ip1, ip2))
+DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
+DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
+DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
+
+TRACE_EVENT(xfs_exchmaps_overhead,
+ TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
+ unsigned long long rmapbt_blocks),
+ TP_ARGS(mp, bmbt_blocks, rmapbt_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, bmbt_blocks)
+ __field(unsigned long long, rmapbt_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->bmbt_blocks = bmbt_blocks;
+ __entry->rmapbt_blocks = rmapbt_blocks;
+ ),
+ TP_printk("dev %d:%d bmbt_blocks 0x%llx rmapbt_blocks 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->bmbt_blocks,
+ __entry->rmapbt_blocks)
+);
+
+DECLARE_EVENT_CLASS(xfs_exchmaps_estimate_class,
+ TP_PROTO(const struct xfs_exchmaps_req *req),
+ TP_ARGS(req),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino1)
+ __field(xfs_ino_t, ino2)
+ __field(xfs_fileoff_t, startoff1)
+ __field(xfs_fileoff_t, startoff2)
+ __field(xfs_filblks_t, blockcount)
+ __field(uint64_t, flags)
+ __field(xfs_filblks_t, ip1_bcount)
+ __field(xfs_filblks_t, ip2_bcount)
+ __field(xfs_filblks_t, ip1_rtbcount)
+ __field(xfs_filblks_t, ip2_rtbcount)
+ __field(unsigned long long, resblks)
+ __field(unsigned long long, nr_exchanges)
+ ),
+ TP_fast_assign(
+ __entry->dev = req->ip1->i_mount->m_super->s_dev;
+ __entry->ino1 = req->ip1->i_ino;
+ __entry->ino2 = req->ip2->i_ino;
+ __entry->startoff1 = req->startoff1;
+ __entry->startoff2 = req->startoff2;
+ __entry->blockcount = req->blockcount;
+ __entry->flags = req->flags;
+ __entry->ip1_bcount = req->ip1_bcount;
+ __entry->ip2_bcount = req->ip2_bcount;
+ __entry->ip1_rtbcount = req->ip1_rtbcount;
+ __entry->ip2_rtbcount = req->ip2_rtbcount;
+ __entry->resblks = req->resblks;
+ __entry->nr_exchanges = req->nr_exchanges;
+ ),
+ TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) bcount1 0x%llx rtbcount1 0x%llx bcount2 0x%llx rtbcount2 0x%llx resblks 0x%llx nr_exchanges %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino1, __entry->startoff1,
+ __entry->ino2, __entry->startoff2,
+ __entry->blockcount,
+ __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS),
+ __entry->ip1_bcount,
+ __entry->ip1_rtbcount,
+ __entry->ip2_bcount,
+ __entry->ip2_rtbcount,
+ __entry->resblks,
+ __entry->nr_exchanges)
+);
+
+#define DEFINE_EXCHMAPS_ESTIMATE_EVENT(name) \
+DEFINE_EVENT(xfs_exchmaps_estimate_class, name, \
+ TP_PROTO(const struct xfs_exchmaps_req *req), \
+ TP_ARGS(req))
+DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_initial_estimate);
+DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_final_estimate);
+
+DECLARE_EVENT_CLASS(xfs_exchmaps_intent_class,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi),
+ TP_ARGS(mp, xmi),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino1)
+ __field(xfs_ino_t, ino2)
+ __field(uint64_t, flags)
+ __field(xfs_fileoff_t, startoff1)
+ __field(xfs_fileoff_t, startoff2)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_fsize_t, isize1)
+ __field(xfs_fsize_t, isize2)
+ __field(xfs_fsize_t, new_isize1)
+ __field(xfs_fsize_t, new_isize2)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino1 = xmi->xmi_ip1->i_ino;
+ __entry->ino2 = xmi->xmi_ip2->i_ino;
+ __entry->flags = xmi->xmi_flags;
+ __entry->startoff1 = xmi->xmi_startoff1;
+ __entry->startoff2 = xmi->xmi_startoff2;
+ __entry->blockcount = xmi->xmi_blockcount;
+ __entry->isize1 = xmi->xmi_ip1->i_disk_size;
+ __entry->isize2 = xmi->xmi_ip2->i_disk_size;
+ __entry->new_isize1 = xmi->xmi_isize1;
+ __entry->new_isize2 = xmi->xmi_isize2;
+ ),
+ TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) isize1 0x%llx newisize1 0x%llx isize2 0x%llx newisize2 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino1, __entry->startoff1,
+ __entry->ino2, __entry->startoff2,
+ __entry->blockcount,
+ __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS),
+ __entry->isize1, __entry->new_isize1,
+ __entry->isize2, __entry->new_isize2)
+);
+
+#define DEFINE_EXCHMAPS_INTENT_EVENT(name) \
+DEFINE_EVENT(xfs_exchmaps_intent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), \
+ TP_ARGS(mp, xmi))
+DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_defer);
+DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_recover);
+
+TRACE_EVENT(xfs_exchmaps_delta_nextents_step,
+ TP_PROTO(struct xfs_mount *mp,
+ const struct xfs_bmbt_irec *left,
+ const struct xfs_bmbt_irec *curr,
+ const struct xfs_bmbt_irec *new,
+ const struct xfs_bmbt_irec *right,
+ int delta, unsigned int state),
+ TP_ARGS(mp, left, curr, new, right, delta, state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_fileoff_t, loff)
+ __field(xfs_fsblock_t, lstart)
+ __field(xfs_filblks_t, lcount)
+ __field(xfs_fileoff_t, coff)
+ __field(xfs_fsblock_t, cstart)
+ __field(xfs_filblks_t, ccount)
+ __field(xfs_fileoff_t, noff)
+ __field(xfs_fsblock_t, nstart)
+ __field(xfs_filblks_t, ncount)
+ __field(xfs_fileoff_t, roff)
+ __field(xfs_fsblock_t, rstart)
+ __field(xfs_filblks_t, rcount)
+ __field(int, delta)
+ __field(unsigned int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->loff = left->br_startoff;
+ __entry->lstart = left->br_startblock;
+ __entry->lcount = left->br_blockcount;
+ __entry->coff = curr->br_startoff;
+ __entry->cstart = curr->br_startblock;
+ __entry->ccount = curr->br_blockcount;
+ __entry->noff = new->br_startoff;
+ __entry->nstart = new->br_startblock;
+ __entry->ncount = new->br_blockcount;
+ __entry->roff = right->br_startoff;
+ __entry->rstart = right->br_startblock;
+ __entry->rcount = right->br_blockcount;
+ __entry->delta = delta;
+ __entry->state = state;
+ ),
+ TP_printk("dev %d:%d left 0x%llx:0x%llx:0x%llx; curr 0x%llx:0x%llx:0x%llx <- new 0x%llx:0x%llx:0x%llx; right 0x%llx:0x%llx:0x%llx delta %d state 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->loff, __entry->lstart, __entry->lcount,
+ __entry->coff, __entry->cstart, __entry->ccount,
+ __entry->noff, __entry->nstart, __entry->ncount,
+ __entry->roff, __entry->rstart, __entry->rcount,
+ __entry->delta, __entry->state)
+);
+
+TRACE_EVENT(xfs_exchmaps_delta_nextents,
+ TP_PROTO(const struct xfs_exchmaps_req *req, int64_t d_nexts1,
+ int64_t d_nexts2),
+ TP_ARGS(req, d_nexts1, d_nexts2),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino1)
+ __field(xfs_ino_t, ino2)
+ __field(xfs_extnum_t, nexts1)
+ __field(xfs_extnum_t, nexts2)
+ __field(int64_t, d_nexts1)
+ __field(int64_t, d_nexts2)
+ ),
+ TP_fast_assign(
+ int whichfork = xfs_exchmaps_reqfork(req);
+
+ __entry->dev = req->ip1->i_mount->m_super->s_dev;
+ __entry->ino1 = req->ip1->i_ino;
+ __entry->ino2 = req->ip2->i_ino;
+ __entry->nexts1 = xfs_ifork_ptr(req->ip1, whichfork)->if_nextents;
+ __entry->nexts2 = xfs_ifork_ptr(req->ip2, whichfork)->if_nextents;
+ __entry->d_nexts1 = d_nexts1;
+ __entry->d_nexts2 = d_nexts2;
+ ),
+ TP_printk("dev %d:%d ino1 0x%llx nexts %llu ino2 0x%llx nexts %llu delta1 %lld delta2 %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino1, __entry->nexts1,
+ __entry->ino2, __entry->nexts2,
+ __entry->d_nexts1, __entry->d_nexts2)
+);
+
+DECLARE_EVENT_CLASS(xfs_getparents_rec_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi,
+ const struct xfs_attr_list_context *context,
+ const struct xfs_getparents_rec *pptr),
+ TP_ARGS(ip, ppi, context, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, firstu)
+ __field(unsigned short, reclen)
+ __field(unsigned int, bufsize)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __string(name, pptr->gpr_name)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->firstu = context->firstu;
+ __entry->reclen = pptr->gpr_reclen;
+ __entry->bufsize = ppi->gp_bufsize;
+ __entry->parent_ino = pptr->gpr_parent.ha_fid.fid_ino;
+ __entry->parent_gen = pptr->gpr_parent.ha_fid.fid_gen;
+ __assign_str(name);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx firstu %u reclen %u bufsize %u parent_ino 0x%llx parent_gen 0x%x name '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->firstu,
+ __entry->reclen,
+ __entry->bufsize,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __get_str(name))
+)
+#define DEFINE_XFS_GETPARENTS_REC_EVENT(name) \
+DEFINE_EVENT(xfs_getparents_rec_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \
+ const struct xfs_attr_list_context *context, \
+ const struct xfs_getparents_rec *pptr), \
+ TP_ARGS(ip, ppi, context, pptr))
+DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_put_listent);
+DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_expand_lastrec);
+
+DECLARE_EVENT_CLASS(xfs_getparents_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi,
+ const struct xfs_attrlist_cursor_kern *cur),
+ TP_ARGS(ip, ppi, cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned short, iflags)
+ __field(unsigned short, oflags)
+ __field(unsigned int, bufsize)
+ __field(unsigned int, hashval)
+ __field(unsigned int, blkno)
+ __field(unsigned int, offset)
+ __field(int, initted)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->iflags = ppi->gp_iflags;
+ __entry->oflags = ppi->gp_oflags;
+ __entry->bufsize = ppi->gp_bufsize;
+ __entry->hashval = cur->hashval;
+ __entry->blkno = cur->blkno;
+ __entry->offset = cur->offset;
+ __entry->initted = cur->initted;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%x oflags 0x%x bufsize %u cur_init? %d hashval 0x%x blkno %u offset %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->iflags,
+ __entry->oflags,
+ __entry->bufsize,
+ __entry->initted,
+ __entry->hashval,
+ __entry->blkno,
+ __entry->offset)
+)
+#define DEFINE_XFS_GETPARENTS_EVENT(name) \
+DEFINE_EVENT(xfs_getparents_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \
+ const struct xfs_attrlist_cursor_kern *cur), \
+ TP_ARGS(ip, ppi, cur))
+DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin);
+DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7350640059cc..828da4ac4316 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -163,7 +163,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
+ error = xfs_dec_fdblocks(mp, blocks, rsvd);
if (error != 0)
return -ENOSPC;
tp->t_blk_res += blocks;
@@ -210,7 +210,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_frextents(mp, -((int64_t)rtextents));
+ error = xfs_dec_frextents(mp, rtextents);
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -234,7 +234,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
+ xfs_add_fdblocks(mp, blocks);
tp->t_blk_res = 0;
}
return error;
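The conversions in this hunk split the old signed-delta primitives into explicit add/dec pairs: a decrement can still fail when the counter would go below zero, while giving blocks back cannot. Assuming the new helpers are thin wrappers over the old behavior (a sketch, not the real bodies):

static inline int
xfs_dec_fdblocks_sketch(struct xfs_mount *mp, uint64_t blocks, bool rsvd)
{
	/* May fail: the free-space count must not go below zero. */
	return xfs_mod_fdblocks(mp, -(int64_t)blocks, rsvd);
}

static inline void
xfs_add_fdblocks_sketch(struct xfs_mount *mp, uint64_t blocks)
{
	/* Returning blocks cannot fail, so no error is propagated. */
	xfs_mod_fdblocks(mp, (int64_t)blocks, false);
}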
@@ -593,38 +593,44 @@ xfs_trans_unreserve_and_mod_sb(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
- int64_t blkdelta = 0;
- int64_t rtxdelta = 0;
+ int64_t blkdelta = tp->t_blk_res;
+ int64_t rtxdelta = tp->t_rtx_res;
int64_t idelta = 0;
int64_t ifreedelta = 0;
- int error;
- /* calculate deltas */
- if (tp->t_blk_res > 0)
- blkdelta = tp->t_blk_res;
- if ((tp->t_fdblocks_delta != 0) &&
- (xfs_has_lazysbcount(mp) ||
- (tp->t_flags & XFS_TRANS_SB_DIRTY)))
+ /*
+ * Calculate the deltas.
+ *
+ * t_fdblocks_delta and t_frextents_delta can be positive or negative:
+ *
+ * - positive values indicate blocks freed in the transaction.
+	 * - negative values indicate blocks allocated in the transaction.
+ *
+ * Negative values can only happen if the transaction has a block
+ * reservation that covers the allocated block. The end result is
+ * that the calculated delta values must always be positive and we
+	 * can only put back previously allocated or reserved blocks here.
+ */
+ ASSERT(tp->t_blk_res || tp->t_fdblocks_delta >= 0);
+ if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
blkdelta += tp->t_fdblocks_delta;
+ ASSERT(blkdelta >= 0);
+ }
- if (tp->t_rtx_res > 0)
- rtxdelta = tp->t_rtx_res;
- if ((tp->t_frextents_delta != 0) &&
- (tp->t_flags & XFS_TRANS_SB_DIRTY))
+ ASSERT(tp->t_rtx_res || tp->t_frextents_delta >= 0);
+ if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
rtxdelta += tp->t_frextents_delta;
+ ASSERT(rtxdelta >= 0);
+ }
- if (xfs_has_lazysbcount(mp) ||
- (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+ if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
idelta = tp->t_icount_delta;
ifreedelta = tp->t_ifree_delta;
}
/* apply the per-cpu counters */
- if (blkdelta) {
- error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
- ASSERT(!error);
- }
+ if (blkdelta)
+ xfs_add_fdblocks(mp, blkdelta);
if (idelta)
percpu_counter_add_batch(&mp->m_icount, idelta,
@@ -633,10 +639,8 @@ xfs_trans_unreserve_and_mod_sb(
if (ifreedelta)
percpu_counter_add(&mp->m_ifree, ifreedelta);
- if (rtxdelta) {
- error = xfs_mod_frextents(mp, rtxdelta);
- ASSERT(!error);
- }
+ if (rtxdelta)
+ xfs_add_frextents(mp, rtxdelta);
if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
@@ -672,7 +676,6 @@ xfs_trans_unreserve_and_mod_sb(
*/
ASSERT(mp->m_sb.sb_imax_pct >= 0);
ASSERT(mp->m_sb.sb_rextslog >= 0);
- return;
}
/* Add the given log item to the transaction's list of log items. */
@@ -1291,9 +1294,9 @@ xfs_trans_reserve_more_inode(
return 0;
/* Quota failed, give back the new reservation. */
- xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE);
+ xfs_add_fdblocks(mp, dblocks);
tp->t_blk_res -= dblocks;
- xfs_mod_frextents(mp, rtx);
+ xfs_add_frextents(mp, rtx);
tp->t_rtx_res -= rtx;
return error;
}
@@ -1430,6 +1433,8 @@ out_cancel:
* The caller must ensure that the on-disk dquots attached to this inode have
* already been allocated and initialized. The ILOCKs will be dropped when the
* transaction is committed or cancelled.
+ *
+ * Caller is responsible for unlocking the inodes manually upon return.
*/
int
xfs_trans_alloc_dir(
@@ -1460,8 +1465,8 @@ retry:
xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, 0);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_qm_dqattach_locked(dp, false);
if (error) {
@@ -1484,6 +1489,9 @@ retry:
if (error == -EDQUOT || error == -ENOSPC) {
if (!retried) {
xfs_trans_cancel(tp);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ if (dp != ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_blockgc_free_quota(dp, 0);
retried = true;
goto retry;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 577b535a595c..b368e13424c4 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -379,24 +379,29 @@ xfs_trans_mod_dquot(
/*
* Given an array of dqtrx structures, lock all the dquots associated and join
- * them to the transaction, provided they have been modified. We know that the
- * highest number of dquots of one type - usr, grp and prj - involved in a
- * transaction is 3 so we don't need to make this very generic.
+ * them to the transaction, provided they have been modified.
*/
STATIC void
xfs_trans_dqlockedjoin(
struct xfs_trans *tp,
struct xfs_dqtrx *q)
{
+ unsigned int i;
ASSERT(q[0].qt_dquot != NULL);
if (q[1].qt_dquot == NULL) {
xfs_dqlock(q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
- } else {
- ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+ } else if (q[2].qt_dquot == NULL) {
xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[1].qt_dquot);
+ } else {
+ xfs_dqlockn(q);
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ if (q[i].qt_dquot == NULL)
+ break;
+ xfs_trans_dqjoin(tp, q[i].qt_dquot);
+ }
}
}
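The rewritten xfs_trans_dqlockedjoin() now handles one, two, or up to XFS_QM_TRANS_MAXDQS dquots, with the new third arm relying on xfs_dqlockn() to lock the whole array at once. A hypothetical sketch of such a helper, assuming the dqtrx array is kept sorted so that locking in array order avoids ABBA deadlocks (the real implementation may differ):

static void
xfs_dqlockn_sketch(
	struct xfs_dqtrx	*q)
{
	unsigned int		i;

	/* Array order doubles as lock order; stop at the first empty slot. */
	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
		if (q[i].qt_dquot == NULL)
			break;
		xfs_dqlock(q[i].qt_dquot);
	}
}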
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 364104e1b38a..ab3d22f662f2 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,15 +17,13 @@
#include "xfs_acl.h"
#include "xfs_log.h"
#include "xfs_xattr.h"
+#include "xfs_quota.h"
#include <linux/posix_acl_xattr.h>
/*
* Get permission to use log-assisted atomic exchange of file extents.
- *
- * Callers must not be running any transactions or hold any inode locks, and
- * they must release the permission by calling xlog_drop_incompat_feat
- * when they're done.
+ * Callers must not be running any transactions or hold any ILOCKs.
*/
static inline int
xfs_attr_grab_log_assist(
@@ -33,17 +31,8 @@ xfs_attr_grab_log_assist(
{
int error = 0;
- /*
- * Protect ourselves from an idle log clearing the logged xattrs log
- * incompat feature bit.
- */
- xlog_use_incompat_feat(mp->m_log);
-
- /*
- * If log-assisted xattrs are already enabled, the caller can use the
- * log assisted swap functions with the log-incompat reference we got.
- */
- if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ /* xattr update log intent items are already enabled */
+ if (xfs_is_using_logged_xattrs(mp))
return 0;
/*
@@ -52,31 +41,20 @@ xfs_attr_grab_log_assist(
* a V5 filesystem for the superblock field, but we'll require rmap
* or reflink to avoid having to deal with really old kernels.
*/
- if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) {
- error = -EOPNOTSUPP;
- goto drop_incompat;
- }
+ if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
/* Enable log-assisted xattrs. */
error = xfs_add_incompat_log_feature(mp,
XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
if (error)
- goto drop_incompat;
+ return error;
+ xfs_set_using_logged_xattrs(mp);
xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP,
"EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!");
return 0;
-drop_incompat:
- xlog_drop_incompat_feat(mp->m_log);
- return error;
-}
-
-static inline void
-xfs_attr_rele_log_assist(
- struct xfs_mount *mp)
-{
- xlog_drop_incompat_feat(mp->m_log);
}
static inline bool
@@ -93,17 +71,31 @@ xfs_attr_want_log_assist(
/*
* Set or remove an xattr, having grabbed the appropriate logging resources
- * prior to calling libxfs.
+ * prior to calling libxfs. Callers of this function are only required to
+ * initialize the inode, attr_filter, name, namelen, value, and valuelen fields
+ * of @args.
*/
int
xfs_attr_change(
- struct xfs_da_args *args)
+ struct xfs_da_args *args,
+ enum xfs_attr_update op)
{
struct xfs_mount *mp = args->dp->i_mount;
- bool use_logging = false;
int error;
- ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED));
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(args->dp);
+ if (error)
+ return error;
+
+ /*
+ * We have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail as well.
+ */
+ args->op_flags = XFS_DA_OP_OKNOENT;
if (xfs_attr_want_log_assist(mp)) {
error = xfs_attr_grab_log_assist(mp);
@@ -111,14 +103,14 @@ xfs_attr_change(
return error;
args->op_flags |= XFS_DA_OP_LOGGED;
- use_logging = true;
}
- error = xfs_attr_set(args);
+ args->owner = args->dp->i_ino;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ xfs_attr_sethash(args);
- if (use_logging)
- xfs_attr_rele_log_assist(mp);
- return error;
+ return xfs_attr_set(args, op, args->attr_filter & XFS_ATTR_ROOT);
}
@@ -145,6 +137,20 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
return args.valuelen;
}
+static inline enum xfs_attr_update
+xfs_xattr_flags_to_op(
+ int flags,
+ const void *value)
+{
+ if (!value)
+ return XFS_ATTRUPDATE_REMOVE;
+ if (flags & XATTR_CREATE)
+ return XFS_ATTRUPDATE_CREATE;
+ if (flags & XATTR_REPLACE)
+ return XFS_ATTRUPDATE_REPLACE;
+ return XFS_ATTRUPDATE_UPSERT;
+}
+
static int
xfs_xattr_set(const struct xattr_handler *handler,
struct mnt_idmap *idmap, struct dentry *unused,
@@ -154,7 +160,6 @@ xfs_xattr_set(const struct xattr_handler *handler,
struct xfs_da_args args = {
.dp = XFS_I(inode),
.attr_filter = handler->flags,
- .attr_flags = flags,
.name = name,
.namelen = strlen(name),
.value = (void *)value,
@@ -162,7 +167,7 @@ xfs_xattr_set(const struct xattr_handler *handler,
};
int error;
- error = xfs_attr_change(&args);
+ error = xfs_attr_change(&args, xfs_xattr_flags_to_op(flags, value));
if (!error && (handler->flags & XFS_ATTR_ROOT))
xfs_forget_acl(inode, name);
return error;
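For reference, the flags-to-operation mapping introduced above mirrors setxattr(2) semantics, read straight from xfs_xattr_flags_to_op(): a NULL value requests XFS_ATTRUPDATE_REMOVE; XATTR_CREATE requests CREATE (fail if the attribute already exists); XATTR_REPLACE requests REPLACE (fail if it does not); and neither flag requests UPSERT, i.e. create-or-replace. Passing the operation explicitly replaces the args->attr_flags plumbing removed earlier from the xfs_attr_class tracepoints and from xfs_xattr_set().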
@@ -237,6 +242,7 @@ xfs_xattr_put_listent(
int flags,
unsigned char *name,
int namelen,
+ void *value,
int valuelen)
{
char *prefix;
@@ -244,6 +250,10 @@ xfs_xattr_put_listent(
ASSERT(context->count >= 0);
+ /* Don't expose private xattr namespaces. */
+ if (flags & XFS_ATTR_PRIVATE_NSP_MASK)
+ return;
+
if (flags & XFS_ATTR_ROOT) {
#ifdef CONFIG_XFS_POSIX_ACL
if (namelen == SGI_ACL_FILE_SIZE &&
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
index cec766cad26c..c3eb858fb59e 100644
--- a/fs/xfs/xfs_xattr.h
+++ b/fs/xfs/xfs_xattr.h
@@ -6,7 +6,8 @@
#ifndef __XFS_XATTR_H__
#define __XFS_XATTR_H__
-int xfs_attr_change(struct xfs_da_args *args);
+enum xfs_attr_update;
+int xfs_attr_change(struct xfs_da_args *args, enum xfs_attr_update op);
extern const struct xattr_handler * const xfs_xattr_handlers[];