diff options
Diffstat (limited to 'fs')
50 files changed, 2047 insertions, 664 deletions
diff --git a/fs/Makefile b/fs/Makefile index 09e051fefc5b..f79cf4043e60 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o @@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_flags |= VM_DONTEXPAND; - vma->vm_ops = &generic_file_vm_ops; - return 0; -} - -static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma) { + struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; @@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) return res; } +static const struct vm_operations_struct aio_ring_vm_ops = { + .mremap = aio_ring_mremap, +#if IS_ENABLED(CONFIG_MMU) + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +#endif +}; + +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_DONTEXPAND; + vma->vm_ops = &aio_ring_vm_ops; + return 0; +} + static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, - .mremap = aio_ring_remap, }; #if IS_ENABLED(CONFIG_MIGRATION) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d1c833c321b9..7b6bfcbf801c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + seq_show_option(m, "snapdirname", fsopt->snapdir_name); return 0; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0a9fb6b53126..6a1119e87fbb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root) struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; - seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); + seq_show_option(s, "vers", tcon->ses->server->vals->version_string); cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_puts(s, ",multiuser"); else if (tcon->ses->user_name) - seq_printf(s, ",username=%s", tcon->ses->user_name); + seq_show_option(s, "username", tcon->ses->user_name); if (tcon->ses->domainName) - seq_printf(s, ",domain=%s", tcon->ses->domainName); + seq_show_option(s, "domain", tcon->ses->domainName); if (srcaddr->sa_family != AF_UNSPEC) { struct sockaddr_in *saddr4; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ee3878262a49..a63c7b0a10cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, } if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); #endif } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2982445947e1..894fb01a91da 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (is_ancestor(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); + seq_show_option(s, "lockproto", args->ar_lockproto); if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); + seq_show_option(s, "locktable", args->ar_locktable); if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); + seq_show_option(s, "hostdata", args->ar_hostdata); if (args->ar_spectator) seq_puts(s, ",spectator"); if (args->ar_localflocks) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 55c03b9e9070..4574fdd3d421 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid), from_kgid_munged(&init_user_ns, sbi->s_gid)); diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index c90b72ee676d..bb806e58c977 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, from_kuid_munged(&init_user_ns, sbi->uid), from_kgid_munged(&init_user_ns, sbi->gid)); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 059597b23f67..2ac99db3750e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) - seq_printf(seq, ",%s", root_path + offset); + seq_show_option(seq, root_path + offset, NULL); if (append) seq_puts(seq, ",append"); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 44523f4a6084..6faaf710e563 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; + bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) @@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id) /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ - if (dn_mark->dn == NULL) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); + if (dn_mark->dn == NULL) { + fsnotify_detach_mark(fsn_mark); + free = true; + } mutex_unlock(&dnotify_group->mark_mutex); + if (free) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } @@ -362,9 +367,10 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - + fsnotify_detach_mark(fsn_mark); mutex_unlock(&dnotify_group->mark_mutex); + if (destroy) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index cf275500a665..8e8e6bcd1d43 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 58b7cdb63da9..6b6f0d472ae8 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) || + !(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index dd3fb0b17be7..db39de2dd4cb 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -26,7 +26,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include "../mount.h" /* * Clear all of the marks on an inode when it is being evicted from core @@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, mnt = NULL; /* + * Optimization: srcu_read_lock() has a memory barrier which can + * be expensive. It protects walking the *_fsnotify_marks lists. + * However, if we do not walk the lists, we do not have to do + * SRCU because we have no references to any objects and do not + * need SRCU to keep them "alive". + */ + if (hlist_empty(&to_tell->i_fsnotify_marks) && + (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + return 0; + /* * if this is a modify event we may need to clear the ignored masks * otherwise return if neither the inode nor the vfsmount care about * this type of event. diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 13a00be516d2..b44c68a857e7 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -6,6 +6,8 @@ #include <linux/srcu.h> #include <linux/types.h> +#include "../mount.h" + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); -/* Destroy all marks in the given list */ -extern void fsnotify_destroy_marks(struct list_head *to_free); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, struct fsnotify_group *group); -/* run the list of all marks associated with inode and flag them to be freed */ -extern void fsnotify_clear_marks_by_inode(struct inode *inode); -/* run the list of all marks associated with vfsmount and flag them to be freed */ -extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); +/* Destroy all marks in the given list protected by 'lock' */ +extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +/* run the list of all marks associated with inode and destroy them */ +static inline void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); +} +/* run the list of all marks associated with vfsmount and destroy them */ +static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) +{ + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, + &mnt->mnt_root->d_lock); +} /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 3daf513ee99e..474a3ce1b5e1 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) } /* - * Given an inode, destroy all of the marks associated with that inode. - */ -void fsnotify_clear_marks_by_inode(struct inode *inode) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - LIST_HEAD(free_list); - - spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&inode->i_lock); - - fsnotify_destroy_marks(&free_list); -} - -/* * Given a group clear all of the inode marks associated with that group. */ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 39ddcaf0918f..fc0df4442f7b 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head) } /* - * Any time a mark is getting freed we end up here. - * The caller had better be holding a reference to this mark so we don't actually - * do the final put under the mark->lock + * Remove mark from inode / vfsmount list, group list, drop inode reference + * if we got one. + * + * Must be called with group->mark_mutex held. */ -void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group) +void fsnotify_detach_mark(struct fsnotify_mark *mark) { struct inode *inode = NULL; + struct fsnotify_group *group = mark->group; BUG_ON(!mutex_is_locked(&group->mark_mutex)); spin_lock(&mark->lock); /* something else already called this function on this mark */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { inode = mark->inode; @@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, fsnotify_destroy_vfsmount_mark(mark); else BUG(); + /* + * Note that we didn't update flags telling whether inode cares about + * what's happening with children. We update these flags from + * __fsnotify_parent() lazily when next event happens on one of our + * children. + */ list_del_init(&mark->g_list); @@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) iput(inode); - /* release lock temporarily */ - mutex_unlock(&group->mark_mutex); + + atomic_dec(&group->num_marks); +} + +/* + * Free fsnotify mark. The freeing is actually happening from a kthread which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group = mark->group; + + spin_lock(&mark->lock); + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + spin_unlock(&mark->lock); spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); - /* - * We don't necessarily have a ref on mark from caller so the above destroy - * may have actually freed it, unless this group provides a 'freeing_mark' - * function which must be holding a reference. - */ /* * Some groups like to know that marks are being freed. This is a @@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); - - /* - * __fsnotify_update_child_dentry_flags(inode); - * - * I really want to call that, but we can't, we have no idea if the inode - * still exists the second we drop the mark->lock. - * - * The next time an event arrive to this inode from one of it's children - * __fsnotify_parent will see that the inode doesn't care about it's - * children and will update all of these flags then. So really this - * is just a lazy update (and could be a perf win...) - */ - - atomic_dec(&group->num_marks); - - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - fsnotify_destroy_mark_locked(mark, group); + fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); } -/* - * Destroy all marks in the given list. The marks must be already detached from - * the original inode / vfsmount. - */ -void fsnotify_destroy_marks(struct list_head *to_free) +void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) { - struct fsnotify_mark *mark, *lmark; - struct fsnotify_group *group; - - list_for_each_entry_safe(mark, lmark, to_free, free_list) { - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); + struct fsnotify_mark *mark; - fsnotify_destroy_mark(mark, group); + while (1) { + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop 'lock', + * mark can get removed from the obj_list and destroyed. But + * we are holding mark reference so mark cannot be freed and + * calling fsnotify_destroy_mark() more than once is fine. + */ + spin_lock(lock); + if (hlist_empty(head)) { + spin_unlock(lock); + break; + } + mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); + /* + * We don't update i_fsnotify_mask / mnt_fsnotify_mask here + * since inode / mount is going away anyway. So just remove + * mark from the list. + */ + hlist_del_init_rcu(&mark->obj_list); + fsnotify_get_mark(mark); + spin_unlock(lock); + fsnotify_destroy_mark(mark, mark->group); fsnotify_put_mark(mark); - fsnotify_put_group(group); } } @@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * inode->i_lock */ spin_lock(&mark->lock); - mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; fsnotify_get_group(group); mark->group = group; @@ -438,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); - fsnotify_destroy_mark_locked(mark, group); + fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 326b148e623c..a8fcab68faef 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -28,25 +28,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include "../mount.h" - -void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - struct mount *m = real_mount(mnt); - LIST_HEAD(free_list); - - spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&mnt->mnt_root->d_lock); - - fsnotify_destroy_marks(&free_list); -} void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index c1128bcbeb5e..d1a853585b53 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2204,17 +2204,12 @@ get_ctx_vol_failed: return true; #ifdef NTFS_RW iput_usnjrnl_err_out: - if (vol->usnjrnl_j_ino) - iput(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - iput(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - iput(vol->usnjrnl_ino); + iput(vol->usnjrnl_j_ino); + iput(vol->usnjrnl_max_ino); + iput(vol->usnjrnl_ino); iput_quota_err_out: - if (vol->quota_q_ino) - iput(vol->quota_q_ino); - if (vol->quota_ino) - iput(vol->quota_ino); + iput(vol->quota_q_ino); + iput(vol->quota_ino); iput(vol->extend_ino); #endif /* NTFS_RW */ iput_sec_err_out: @@ -2223,8 +2218,7 @@ iput_root_err_out: iput(vol->root_ino); iput_logfile_err_out: #ifdef NTFS_RW - if (vol->logfile_ino) - iput(vol->logfile_ino); + iput(vol->logfile_ino); iput_vol_err_out: #endif /* NTFS_RW */ iput(vol->vol_ino); @@ -2254,8 +2248,7 @@ iput_mftbmp_err_out: iput(vol->mftbmp_ino); iput_mirr_err_out: #ifdef NTFS_RW - if (vol->mftmirr_ino) - iput(vol->mftmirr_ino); + iput(vol->mftmirr_ino); #endif /* NTFS_RW */ return false; } diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index c58a1bcfda0f..0cdf497c91ef 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle, int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); + struct buffer_head *bh = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); + ocfs2_inode_unlock(inode, 1); + brelse(bh); + return status; } struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) @@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; - int ret = -EAGAIN; + int ret; osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - - ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + if (ret != -ENOENT) + mlog_errno(ret); return ERR_PTR(ret); + } acl = ocfs2_get_acl_nolock(inode, type, di_bh); + ocfs2_inode_unlock(inode, 0); brelse(di_bh); - return acl; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5997c00a1515..86181d6526dc 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ocfs2_error(sb, - "Extent block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - eb->h_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extent block #%llu has an invalid h_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(eb->h_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto bail; } if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extent block #%llu has an invalid " - "h_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(eb->h_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + goto bail; } - - return 0; +bail: + return rc; } int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, @@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty " - "extent list (next_free_rec == 0)", + "Owner %llu has empty extent list (next_free_rec == 0)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; @@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent " - "list where extent # %d has no physical " - "block start", + "Owner %llu has extent list where extent # %d has no physical block start\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; @@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at " - "depth %u\n", + "Owner %llu has empty extent list at depth %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; @@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad blkno in extent list " - "at depth %u (index %d)\n", + "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; @@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad count in extent list " - "at block %llu (next free=%u, count=%u)\n", + "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), @@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, if (left_el->l_next_free_rec != left_el->l_count) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Inode %llu has non-full interior leaf node %llu" - "(next free = %u)", + "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); @@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -3131,6 +3120,30 @@ out: return ret; } +static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + handle_t *handle; + int ret; + int credits = path->p_tree_depth * 2 + 1; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + return ret; +} + /* * Left rotation of btree records. * @@ -3200,7 +3213,7 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu", + "Owner %llu has empty extent block at %llu\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; @@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has a bad extent list", + "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; @@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of " - "%d. It should have " - "matched the l_count of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); @@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; @@ -4970,10 +4979,9 @@ leftright: split_index = ocfs2_search_extent_list(el, cpos); if (split_index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u " - "which can no longer be found.\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(sb, - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long) - ocfs2_metadata_cache_owner(et->et_ci), cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode, cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = -EROFS; goto out; @@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", + "Owner %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: split at cpos %u lost record.", + "Owner %llu: split at cpos %u lost record\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle, ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: error after split at cpos %u" - "trunc len %u, existing record is (%u,%u)", + "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -6175,7 +6178,7 @@ bail: iput(tl_inode); brelse(tl_bh); - if (status < 0 && (*tl_copy)) { + if (status < 0) { kfree(*tl_copy); *tl_copy = NULL; mlog_errno(status); @@ -7108,15 +7111,23 @@ start: * to check it up here before changing the tree. */ if (root_el->l_tree_depth && rec->e_int_clusters == 0) { - ocfs2_error(inode->i_sb, "Inode %lu has an empty " + mlog(ML_ERROR, "Inode %lu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); - status = -EROFS; - goto bail; + status = ocfs2_remove_rightmost_empty_extent(osb, + &et, path, &dealloc); + if (status) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + goto start; + } else { + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; } - trunc_cpos = le32_to_cpu(rec->e_cpos); - trunc_len = 0; - blkno = 0; } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { /* * Truncate entire record. @@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || !ocfs2_supports_inline_data(osb)) { ocfs2_error(inode->i_sb, - "Inline data flags for inode %llu don't agree! " - "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le16_to_cpu(di->i_dyn_features), OCFS2_I(inode)->ip_dyn_features, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0f5fd9db8194..64b11d90eca6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { - ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); return -EROFS; } @@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size > PAGE_CACHE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %Lu", + "Inode %llu has with inline data has bad size: %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)size); return -EROFS; @@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + down_read(&OCFS2_I(inode)->ip_alloc_sem); + /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); @@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, alloc_locked = 1; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + /* fill hole, allocate blocks can't be larger than the size * of the hole */ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); @@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extend_allocation(inode, cpos, clusters_to_alloc, 0); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog_errno(ret); goto bail; } @@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); ret = -EIO; goto bail; } + up_write(&OCFS2_I(inode)->ip_alloc_sem); } /* @@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } - ocfs2_iocb_clear_rw_locked(iocb); + /* Let rw unlock to be done later to protect append direct io write */ + if (offset + bytes <= i_size_read(inode)) { + ocfs2_iocb_clear_rw_locked(iocb); - level = ocfs2_iocb_rw_locked_level(iocb); - ocfs2_rw_unlock(inode, level); + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + } } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, /* zeroing out the previously allocated cluster tail * that but not zeroed */ - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, zero_len_tail, cluster_align_tail); - else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + } if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); @@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); - if (unlikely(written < 0)) { + /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ + if ((written < 0) && (written != -EIOCBQUEUED)) { loff_t i_size = i_size_read(inode); if (offset + count > i_size) { @@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; goto clean_orphan; } } ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; ret = jbd2_journal_force_commit(journal); if (ret < 0) @@ -936,10 +956,12 @@ clean_orphan: if (tmp_ret < 0) { ret = tmp_ret; mlog_errno(ret); + brelse(di_bh); goto out; } ocfs2_inode_unlock(inode, 1); + brelse(di_bh); tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { @@ -2185,10 +2207,7 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. - */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int i; + int i, ret; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, handle_t *handle = wc->w_handle; struct page *tmppage; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); goto out_write_size; @@ -2409,6 +2436,7 @@ out_write_size: ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); +out: /* unlock pages before dealloc since it needs acquiring j_trans_barrier * lock, or it will cause a deadlock since journal commit threads holds * this lock and will ask for the page lock when flushing the data. diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1edcb141f639..fe50ded1b4ce 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { + if (status) { + /* Clear the rest of the buffers on error */ + put_bh(bh); + bhs[i] = NULL; + continue; + } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 140de3c93d2e..fa15debcc02b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -36,7 +36,7 @@ #include <linux/debugfs.h> #include <linux/slab.h> #include <linux/bitmap.h> - +#include <linux/ktime.h> #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -1060,37 +1060,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. */ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. before the region * dir is removed and drops it ref it will wait to tear down this @@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); @@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + after_hb = ktime_get_real(); + + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec, ret); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb.tv64, after_hb.tv64, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { @@ -1619,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); - if (reg->hr_tmp_block == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_tmp_block == NULL) return -ENOMEM; - } reg->hr_slots = kcalloc(reg->hr_blocks, sizeof(struct o2hb_disk_slot), GFP_KERNEL); - if (reg->hr_slots == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_slots == NULL) return -ENOMEM; - } for(i = 0; i < reg->hr_blocks; i++) { slot = ®->hr_slots[i]; @@ -1645,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), GFP_KERNEL); - if (!reg->hr_slot_data) { - mlog_errno(-ENOMEM); + if (!reg->hr_slot_data) return -ENOMEM; - } for(i = 0; i < reg->hr_num_pages; i++) { page = alloc_page(GFP_KERNEL); - if (!page) { - mlog_errno(-ENOMEM); + if (!page) return -ENOMEM; - } reg->hr_slot_data[i] = page; @@ -1687,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_heartbeat_block *hb_block; ret = o2hb_read_slots(reg, reg->hr_blocks); - if (ret) { - mlog_errno(ret); + if (ret) goto out; - } /* We only want to get an idea of the values initially in each * slot, so we do no verification - o2hb_check_slot will diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 02878a83f0b4..ffecf89c8c1c 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - trailer->db_signature); + rc = ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid db_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: @@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { - ocfs2_error(sb, - "Dir Index Root # %llu has bad signature %.*s", - (unsigned long long)le64_to_cpu(dx_root->dr_blkno), - 7, dx_root->dr_signature); - return -EINVAL; + ret = ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s\n", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); } - return 0; + return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, @@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", - 7, dx_leaf->dl_signature); - return -EROFS; + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", + 7, dx_leaf->dl_signature); } - return 0; + return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, @@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "btree tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in btree tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in btree", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7df88a6dd626..6918f30d02cd 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm, if (status == -ENOPROTOOPT) { status = 0; *response = JOIN_OK_NO_MAP; - } else if (packet.code == JOIN_DISALLOW || - packet.code == JOIN_OK_NO_MAP) { - *response = packet.code; - } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { - mlog(ML_NOTICE, - "This node requested DLM locking protocol %u.%u and " - "filesystem locking protocol %u.%u. At least one of " - "the protocol versions on node %d is not compatible, " - "disconnecting\n", - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor, - node); - status = -EPROTO; - *response = packet.code; - } else if (packet.code == JOIN_OK) { - *response = packet.code; - /* Use the same locking protocol as the remote node */ - dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; - dlm->fs_locking_proto.pv_minor = packet.fs_minor; - mlog(0, - "Node %d responds JOIN_OK with DLM locking protocol " - "%u.%u and fs locking protocol %u.%u\n", - node, - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor); } else { - status = -EINVAL; - mlog(ML_ERROR, "invalid response %d from node %u\n", - packet.code, node); + *response = packet.code; + switch (packet.code) { + case JOIN_DISALLOW: + case JOIN_OK_NO_MAP: + break; + case JOIN_PROTOCOL_MISMATCH: + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + break; + case JOIN_OK: + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + break; + default: + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + /* Reset response to JOIN_DISALLOW */ + *response = JOIN_DISALLOW; + break; + } } mlog(0, "status %d, node %d response is %d\n", status, node, @@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); if (status) goto bail; - o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, - dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); if (status) goto bail; @@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); - if (status) - goto bail; bail: if (status) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fdf4b41d0609..46b8b2bbc95a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", - res->lockname.len, res->lockname.name); - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->track_lock); - atomic_dec(&dlm->res_cur_count); if (!hlist_unhashed(&res->hash_node) || @@ -795,8 +785,18 @@ lookup: dlm_lockres_grab_inflight_ref(dlm, tmpres); spin_unlock(&tmpres->spinlock); - if (res) + if (res) { + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else + mlog(ML_ERROR, "Resource %.*s not " + "on the Tracking list\n", + res->lockname.len, + res->lockname.name); + spin_unlock(&dlm->track_lock); dlm_lockres_put(res); + } res = tmpres; goto leave; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ce12e0b1a31f..d0e436dc6437 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue, *iter; + struct list_head *queue; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; @@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each(iter, tmpq) { - lock = list_entry(iter, - struct dlm_lock, list); + list_for_each_entry(lock, tmpq, list) { if (lock->ml.cookie == ml->cookie) break; lock = NULL; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 69aac6f088ad..2e5e6d5fffe8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, __dlm_unhash_lockres(dlm, res); + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 23157e40dd74..1c91103c1333 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3035,8 +3035,6 @@ local: ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; - - status = 0; bail: if (status < 0) { ocfs2_dlm_shutdown_debug(osb); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 767370b656ca..e4719e0a3f99 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0)\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7210583b472f..0e5b4515f92e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1130,6 +1130,7 @@ out: int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; + int inode_locked = 0; struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); @@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock_rw; } + inode_locked = 1; if (size_change) { status = inode_newsize_ok(inode, attr->ia_size); @@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_inode_unlock(inode, 1); + if (status) { + ocfs2_inode_unlock(inode, 1); + inode_locked = 0; + } bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); @@ -1274,6 +1279,8 @@ bail: if (status < 0) mlog_errno(status); } + if (inode_locked) + ocfs2_inode_unlock(inode, 1); return status; } @@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from), orig_count; - loff_t old_size; - u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, OCFS2_MOUNT_COHERENCY_BUFFERED); int unaligned_dio = 0; int dropped_dio = 0; + int append_write = ((iocb->ki_pos + count) >= + i_size_read(inode) ? 1 : 0); trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2290,8 +2297,9 @@ relock: /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". + * For append write, we must take rw EX. */ - rw_level = (!direct_io || full_coherency); + rw_level = (!direct_io || full_coherency || append_write); ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { @@ -2364,13 +2372,6 @@ relock: ocfs2_iocb_set_unaligned_aio(iocb); } - /* - * To later detect whether a journal commit for sync writes is - * necessary, we sample i_size, and cluster count here. - */ - old_size = i_size_read(inode); - old_clusters = OCFS2_I(inode)->ip_clusters; - /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); @@ -2378,6 +2379,20 @@ relock: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. + */ + if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + rw_level = -1; + unaligned_dio = 0; + } + if (unlikely(written <= 0)) goto no_sync; @@ -2402,21 +2417,7 @@ relock: } no_sync: - /* - * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io - * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. - * Unfortunately there are error cases which call end_io and others - * that don't. so we don't have to unlock the rw_lock if either an - * async dio is going to do it in the future or an end_io after an - * error has already done it. - */ - if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { - rw_level = -1; - unaligned_dio = 0; - } - - if (unaligned_dio) { + if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b254416dc8d9..8f87e05ee25d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode) int wipe, status; sigset_t oldset; struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_delete_inode(inode->i_ino, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail_unlock_nfs_sync; } + di = (struct ocfs2_dinode *)di_bh->b_data; + /* Skip inode deletion and wait for dio orphan entry recovered + * first */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_inode; + } + /* Query the cluster. This will be the final decision made * before we go ahead and wipe the inode. */ status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); @@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode) int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); + assert_spin_locked(&inode->i_lock); + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; - return res; + return 1; } /* @@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb, rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { - ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - di->i_signature); + rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { - ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - ocfs2_error(sb, - "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", - (unsigned long long)bh->b_blocknr); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Invalid dinode #%llu: fs_generation is %u\n", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_fs_generation)); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); goto bail; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5e86b247c821..ca3431ee7f24 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,8 +81,6 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; - wait_queue_head_t append_dio_wq; - struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7c099f7032fd..ff82b28462a6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) mlog_errno(PTR_ERR(handle)); if (is_journal_aborted(journal)) { - ocfs2_abort(osb->sb, "Detected aborted journal"); + ocfs2_abort(osb->sb, "Detected aborted journal\n"); handle = ERR_PTR(-EROFS); } } else { @@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle, mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); mlog(ML_ERROR, "b_blocknr=%llu\n", (unsigned long long)bh->b_blocknr); - BUG(); + + lock_buffer(bh); + /* + * A previous attempt to write this buffer head failed. + * Nothing we can do but to retry the write and hope for + * the best. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + if (!buffer_uptodate(bh)) { + unlock_buffer(bh); + return -EIO; + } + unlock_buffer(bh); } /* Set the current transaction information on the ci so @@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + mutex_lock(&inode->i_mutex); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + } + + if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); @@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); - - wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ unlock_inode: ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; unlock_rw: ocfs2_rw_unlock(inode, 1); next: + mutex_unlock(&inode->i_mutex); iput(inode); - brelse(di_bh); - di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 857bbbcd39f3..0a4457fb0711 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u used bits, but a count shows %u", + ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 56a768d06aa6..124471d26a73 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 948681e37cfd..b7dfac226b1e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1035,11 +1035,6 @@ leave: if (handle) ocfs2_commit_trans(osb, handle); - if (child_locked) - ocfs2_inode_unlock(inode, 1); - - ocfs2_inode_unlock(dir, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1047,6 +1042,11 @@ leave: iput(orphan_dir); } + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + brelse(fe_bh); brelse(parent_node_bh); @@ -1309,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir, } parents_locked = 1; + if (!new_dir->i_nlink) { + status = -EACCES; + goto bail; + } + /* make sure both dirs have bhs * get an extra ref on old_dir_bh if old==new */ if (!new_dir_bh) { @@ -1569,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_find_entry(old_dentry->d_name.name, old_dentry->d_name.len, old_dir, &old_entry_lookup); - if (status) + if (status) { + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; + } status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; } @@ -1633,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir, ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: - if (rename_lock) - ocfs2_rename_unlock(osb); - if (handle) ocfs2_commit_trans(osb, handle); - if (parents_locked) - ocfs2_double_unlock(old_dir, new_dir); - - if (old_child_locked) - ocfs2_inode_unlock(old_inode, 1); - - if (new_child_locked) - ocfs2_inode_unlock(new_inode, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1655,6 +1661,18 @@ bail: iput(orphan_dir); } + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (rename_lock) + ocfs2_rename_unlock(osb); + if (new_inode) sync_mapping_buffers(old_inode->i_mapping); @@ -2601,27 +2619,6 @@ leave: return status; } -static int ocfs2_dio_orphan_recovered(struct inode *inode) -{ - int ret; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - return 0; - } - - di = (struct ocfs2_dinode *) di_bh->b_data; - ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - - return ret; -} - -#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode) { @@ -2633,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, handle_t *handle = NULL; struct ocfs2_dinode *di = NULL; -restart: status = ocfs2_inode_lock(inode, &di_bh, 1); if (status < 0) { mlog_errno(status); @@ -2643,15 +2639,21 @@ restart: di = (struct ocfs2_dinode *) di_bh->b_data; /* * Another append dio crashed? - * If so, wait for recovery first. + * If so, manually recover it first. */ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, - ocfs2_dio_orphan_recovered(inode), - msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); - goto restart; + status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail_unlock_inode; + } + + status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } } status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 690ddc60189b..7a0126267847 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ + OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ + OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index bb07004df72a..8a54fd8a4fa5 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", + "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_block, (unsigned long long)i_size_read(inode)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7dc818b87cd8..e5d57cd32505 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { - ocfs2_error(sb, - "Refcount block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - rb->rf_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid rf_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(rb->rf_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid " - "rf_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(rb->rf_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + goto out; } - - return 0; +out: + return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, @@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(sb, + "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } @@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - return -EROFS; + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4479029630bb..d83d2602cf2b 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; - if (ac->ac_find_loc_priv) { - kfree(ac->ac_find_loc_priv); - ac->ac_find_loc_priv = NULL; - } + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } #define do_error(fmt, ...) \ - do{ \ - if (resize) \ - mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ - else \ - ocfs2_error(sb, fmt, ##__VA_ARGS__); \ - } while (0) +do { \ + if (resize) \ + mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ + else \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ +} while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, @@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - do_error("Group descriptor #%llu has bad signature %.*s", + do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); - return -EINVAL; } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { - do_error("Group descriptor #%llu has an invalid bg_blkno " - "of %llu", + do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); - return -EINVAL; } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { - do_error("Group descriptor #%llu has an invalid " - "fs_generation of #%u", + do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); - return -EINVAL; } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - do_error("Group descriptor #%llu has bit count %u but " - "claims that %u are free", + do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); - return -EINVAL; } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - do_error("Group descriptor #%llu has bit count %u but " - "max bitmap bits of %u", + do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); - return -EINVAL; } return 0; @@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { - do_error("Group descriptor #%llu has bad parent " - "pointer (%llu, expected %llu)", + do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - do_error("Group descriptor #%llu has bit count of %u", + do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); - return -EINVAL; } /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ @@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { - do_error("Group descriptor #%llu has bad chain %u", + do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); - return -EINVAL; } return 0; @@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " - "b_blocknr (%llu)", - (unsigned long long)group_blkno, - (unsigned long long) bg_bh->b_blocknr); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "group block (%llu) != b_blocknr (%llu)\n", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); goto bail; } @@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", - (unsigned long long)le64_to_cpu(fe->i_blkno)); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "Invalid chain allocator %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } @@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(ac->ac_inode->i_sb, - "Chain allocator dinode %llu has %u used " - "bits but only %u total.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->id1.bitmap1.i_used), - le32_to_cpu(fe->id1.bitmap1.i_total)); - status = -EIO; + status = ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used bits but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } @@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } if (undo_fn) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 403c5660b306..2de4c8a9340c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -192,6 +192,7 @@ enum { Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, + Opt_err_cont, Opt_err, }; @@ -224,6 +225,7 @@ static const match_table_t tokens = { {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_err_cont, "errors=continue"}, {Opt_err, NULL} }; @@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NOINTR; break; case Opt_err_panic: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; break; case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; + break; + case Opt_err_cont: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; break; case Opt_data_ordered: mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; @@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); + else if (opts & OCFS2_MOUNT_ERRORS_CONT) + seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); @@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, - osb->osb_cluster_stack); + seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, + OCFS2_STACK_LABEL_LEN); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) @@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); - init_waitqueue_head(&oi->append_dio_wq); - ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); @@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) memset(osb, 0, sizeof(struct ocfs2_super)); } -/* Put OCFS2 into a readonly state, or (if the user specifies it), - * panic(). We do not support continue-on-error operation. */ -static void ocfs2_handle_error(struct super_block *sb) +/* Depending on the mount option passed, perform one of the following: + * Put OCFS2 into a readonly state (default) + * Return EIO so that only the process errs + * Fix the error as if fsck.ocfs2 -y + * panic + */ +static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) - panic("OCFS2: (device %s): panic forced after error\n", - sb->s_id); + int rv = 0; ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + pr_crit("On-disk corruption discovered. " + "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) - return; - - printk(KERN_CRIT "File system is now read-only due to the potential " - "of on-disk corruption. Please run fsck.ocfs2 once the file " - "system is unmounted.\n"); - sb->s_flags |= MS_RDONLY; - ocfs2_set_ro_flag(osb, 0); + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { + pr_crit("OCFS2: Returning error to the calling process.\n"); + rv = -EIO; + } else { /* default option */ + rv = -EROFS; + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return rv; + + pr_crit("OCFS2: File system is now read-only.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); + } + + return rv; } -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; @@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function, /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); - ocfs2_handle_error(sb); + return ocfs2_handle_error(sb); } /* Handle critical errors. This is intentionally more drastic than @@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..b477d0b1c7b6 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); __printf(3, 4) -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_error(sb, fmt, ...) \ + __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) __printf(3, 4) void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_abort(sb, fmt, ...) \ + __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) /* * Void signal blockers, because in-kernel sigprocmask() only fails diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 889f3796a0d7..ebfdea78659b 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ocfs2_error(sb, - "Extended attribute block #%llu has bad " - "signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - xb->xb_signature); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extended attribute block #%llu has an " - "invalid xb_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(xb->xb_blkno)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extended attribute block #%llu has an invalid " - "xb_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(xb->xb_fs_generation)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); } return 0; @@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in xattr tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; + if (!capable(CAP_SYS_ADMIN)) + return 0; + if (list && total_len <= list_size) { memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); memcpy(list + prefix_len, name, name_len); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7466ff339c66..79073d68b475 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct super_block *sb = dentry->d_sb; struct ovl_fs *ufs = sb->s_fs_info; - seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_show_option(m, "lowerdir", ufs->config.lowerdir); if (ufs->config.upperdir) { - seq_printf(m, ",upperdir=%s", ufs->config.upperdir); - seq_printf(m, ",workdir=%s", ufs->config.workdir); + seq_show_option(m, "upperdir", ufs->config.upperdir); + seq_show_option(m, "workdir", ufs->config.workdir); } return 0; } diff --git a/fs/proc/array.c b/fs/proc/array.c index ce065cf3104f..f60f0121e331 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); @@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ca1e091881d4..3b4d8255e806 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", + [ilog2(VM_UFFD_MISSING)]= "um", + [ilog2(VM_UFFD_WP)] = "uw", }; size_t i; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0e4cf728126f..4a62fe8cc3bf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",acl"); if (REISERFS_SB(s)->s_jdev) - seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); if (journal->j_max_commit_age != journal->j_default_max_commit_age) seq_printf(seq, ",commit=%d", journal->j_max_commit_age); #ifdef CONFIG_QUOTA if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", + REISERFS_SB(s)->s_qf_names[USRQUOTA]); else if (opts & (1 << REISERFS_USRQUOTA)) seq_puts(seq, ",usrquota"); if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", + REISERFS_SB(s)->s_qf_names[GRPQUOTA]); else if (opts & (1 << REISERFS_GRPQUOTA)) seq_puts(seq, ",grpquota"); if (REISERFS_SB(s)->s_jquota_fmt) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c new file mode 100644 index 000000000000..634e676072cb --- /dev/null +++ b/fs/userfaultfd.c @@ -0,0 +1,1330 @@ +/* + * fs/userfaultfd.c + * + * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> + * Copyright (C) 2008-2009 Red Hat, Inc. + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Some part derived from fs/eventfd.c (anon inode setup) and + * mm/ksm.c (mm hashing). + */ + +#include <linux/hashtable.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/file.h> +#include <linux/bug.h> +#include <linux/anon_inodes.h> +#include <linux/syscalls.h> +#include <linux/userfaultfd_k.h> +#include <linux/mempolicy.h> +#include <linux/ioctl.h> +#include <linux/security.h> + +static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; + +enum userfaultfd_state { + UFFD_STATE_WAIT_API, + UFFD_STATE_RUNNING, +}; + +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + struct seqcount refile_seq; + /* pseudo fd refcounting */ + atomic_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* state machine */ + enum userfaultfd_state state; + /* released */ + bool released; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + +struct userfaultfd_wait_queue { + struct uffd_msg msg; + wait_queue_t wq; + struct userfaultfd_ctx *ctx; +}; + +struct userfaultfd_wake_range { + unsigned long start; + unsigned long len; +}; + +static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, + int wake_flags, void *key) +{ + struct userfaultfd_wake_range *range = key; + int ret; + struct userfaultfd_wait_queue *uwq; + unsigned long start, len; + + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + ret = 0; + /* len == 0 means wake all */ + start = range->start; + len = range->len; + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) + goto out; + ret = wake_up_state(wq->private, mode); + if (ret) + /* + * Wake only once, autoremove behavior. + * + * After the effect of list_del_init is visible to the + * other CPUs, the waitqueue may disappear from under + * us, see the !list_empty_careful() in + * handle_userfault(). try_to_wake_up() has an + * implicit smp_mb__before_spinlock, and the + * wq->private is read before calling the extern + * function "wake_up_state" (which in turns calls + * try_to_wake_up). While the spin_lock;spin_unlock; + * wouldn't be enough, the smp_mb__before_spinlock is + * enough to avoid an explicit smp_mb() here. + */ + list_del_init(&wq->task_list); +out: + return ret; +} + +/** + * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to the userfaultfd context. + * + * Returns: In case of success, returns not zero. + */ +static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) +{ + if (!atomic_inc_not_zero(&ctx->refcount)) + BUG(); +} + +/** + * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to userfaultfd context. + * + * The userfaultfd context reference must have been previously acquired either + * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). + */ +static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); + mmput(ctx->mm); + kmem_cache_free(userfaultfd_ctx_cachep, ctx); + } +} + +static inline void msg_init(struct uffd_msg *msg) +{ + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. + */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned int flags, + unsigned long reason) +{ + struct uffd_msg msg; + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + msg.arg.pagefault.address = address; + if (flags & FAULT_FLAG_WRITE) + /* + * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE + * was not set in a UFFD_EVENT_PAGEFAULT, it means it + * was a read fault, otherwise if set it means it's + * a write fault. + */ + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; + if (reason & VM_UFFD_WP) + /* + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was + * not set in a UFFD_EVENT_PAGEFAULT, it means it was + * a missing fault, otherwise if set it means it's a + * write protect fault. + */ + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + return msg; +} + +/* + * Verify the pagetables are still not ok after having reigstered into + * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any + * userfault that has already been resolved, if userfaultfd_read and + * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different + * threads. + */ +static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + pmd = pmd_offset(pud, address); + /* + * READ_ONCE must function as a barrier with narrower scope + * and it must be equivalent to: + * _pmd = *pmd; barrier(); + * + * This is to deal with the instability (as in + * pmd_trans_unstable) of the pmd. + */ + _pmd = READ_ONCE(*pmd); + if (!pmd_present(_pmd)) + goto out; + + ret = false; + if (pmd_trans_huge(_pmd)) + goto out; + + /* + * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it + * and use the standard pte_offset_map() instead of parsing _pmd. + */ + pte = pte_offset_map(pmd, address); + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + if (pte_none(*pte)) + ret = true; + pte_unmap(pte); + +out: + return ret; +} + +/* + * The locking rules involved in returning VM_FAULT_RETRY depending on + * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and + * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" + * recommendation in __lock_page_or_retry is not an understatement. + * + * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released + * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is + * not set. + * + * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not + * set, VM_FAULT_RETRY can still be returned if and only if there are + * fatal_signal_pending()s, and the mmap_sem must be released before + * returning it. + */ +int handle_userfault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long reason) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue uwq; + int ret; + bool must_wait, return_to_userland; + + BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + ret = VM_FAULT_SIGBUS; + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx) + goto out; + + BUG_ON(ctx->mm != mm); + + VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); + VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + + /* + * If it's already released don't get it. This avoids to loop + * in __get_user_pages if userfaultfd_release waits on the + * caller of handle_userfault to release the mmap_sem. + */ + if (unlikely(ACCESS_ONCE(ctx->released))) + goto out; + + /* + * Check that we can return VM_FAULT_RETRY. + * + * NOTE: it should become possible to return VM_FAULT_RETRY + * even if FAULT_FLAG_TRIED is set without leading to gup() + * -EBUSY failures, if the userfaultfd is to be extended for + * VM_UFFD_WP tracking and we intend to arm the userfault + * without first stopping userland access to the memory. For + * VM_UFFD_MISSING userfaults this is enough for now. + */ + if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + /* + * Validate the invariant that nowait must allow retry + * to be sure not to return SIGBUS erroneously on + * nowait invocations. + */ + BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); +#ifdef CONFIG_DEBUG_VM + if (printk_ratelimit()) { + printk(KERN_WARNING + "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + dump_stack(); + } +#endif + goto out; + } + + /* + * Handle nowait, not much to do other than tell it to retry + * and wait. + */ + ret = VM_FAULT_RETRY; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out; + + /* take the reference before dropping the mmap_sem */ + userfaultfd_ctx_get(ctx); + + init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); + uwq.wq.private = current; + uwq.msg = userfault_msg(address, flags, reason); + uwq.ctx = ctx; + + return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + + spin_lock(&ctx->fault_pending_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); + /* + * The smp_mb() after __set_current_state prevents the reads + * following the spin_unlock to happen before the list_add in + * __add_wait_queue. + */ + set_current_state(return_to_userland ? TASK_INTERRUPTIBLE : + TASK_KILLABLE); + spin_unlock(&ctx->fault_pending_wqh.lock); + + must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + up_read(&mm->mmap_sem); + + if (likely(must_wait && !ACCESS_ONCE(ctx->released) && + (return_to_userland ? !signal_pending(current) : + !fatal_signal_pending(current)))) { + wake_up_poll(&ctx->fd_wqh, POLLIN); + schedule(); + ret |= VM_FAULT_MAJOR; + } + + __set_current_state(TASK_RUNNING); + + if (return_to_userland) { + if (signal_pending(current) && + !fatal_signal_pending(current)) { + /* + * If we got a SIGSTOP or SIGCONT and this is + * a normal userland page fault, just let + * userland return so the signal will be + * handled and gdb debugging works. The page + * fault code immediately after we return from + * this function is going to release the + * mmap_sem and it's not depending on it + * (unlike gup would if we were not to return + * VM_FAULT_RETRY). + * + * If a fatal signal is pending we still take + * the streamlined VM_FAULT_RETRY failure path + * and there's no need to retake the mmap_sem + * in such case. + */ + down_read(&mm->mmap_sem); + ret = 0; + } + } + + /* + * Here we race with the list_del; list_add in + * userfaultfd_ctx_read(), however because we don't ever run + * list_del_init() to refile across the two lists, the prev + * and next pointers will never point to self. list_add also + * would never let any of the two pointers to point to + * self. So list_empty_careful won't risk to see both pointers + * pointing to self at any time during the list refile. The + * only case where list_del_init() is called is the full + * removal in the wake function and there we don't re-list_add + * and it's fine not to block on the spinlock. The uwq on this + * kernel stack can be released after the list_del_init. + */ + if (!list_empty_careful(&uwq.wq.task_list)) { + spin_lock(&ctx->fault_pending_wqh.lock); + /* + * No need of list_del_init(), the uwq on the stack + * will be freed shortly anyway. + */ + list_del(&uwq.wq.task_list); + spin_unlock(&ctx->fault_pending_wqh.lock); + } + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. + */ + userfaultfd_ctx_put(ctx); + +out: + return ret; +} + +static int userfaultfd_release(struct inode *inode, struct file *file) +{ + struct userfaultfd_ctx *ctx = file->private_data; + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev; + /* len == 0 means wake all */ + struct userfaultfd_wake_range range = { .len = 0, }; + unsigned long new_flags; + + ACCESS_ONCE(ctx->released) = true; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_sem. So + * it's critical that released is set to true (above), before + * taking the mmap_sem for writing. + */ + down_write(&mm->mmap_sem); + prev = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + } + up_write(&mm->mmap_sem); + + /* + * After no new page faults can wait on this fault_*wqh, flush + * the last page faults that may have been already waiting on + * the fault_*wqh. + */ + spin_lock(&ctx->fault_pending_wqh.lock); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); + spin_unlock(&ctx->fault_pending_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLHUP); + userfaultfd_ctx_put(ctx); + return 0; +} + +/* fault_pending_wqh.lock must be hold by the caller */ +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + wait_queue_t *wq; + struct userfaultfd_wait_queue *uwq; + + VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); + + uwq = NULL; + if (!waitqueue_active(&ctx->fault_pending_wqh)) + goto out; + /* walk in reverse to provide FIFO behavior to read userfaults */ + wq = list_last_entry(&ctx->fault_pending_wqh.task_list, + typeof(*wq), task_list); + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); +out: + return uwq; +} + +static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) +{ + struct userfaultfd_ctx *ctx = file->private_data; + unsigned int ret; + + poll_wait(file, &ctx->fd_wqh, wait); + + switch (ctx->state) { + case UFFD_STATE_WAIT_API: + return POLLERR; + case UFFD_STATE_RUNNING: + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) + return POLLERR; + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = POLLIN; + return ret; + default: + BUG(); + } +} + +static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, + struct uffd_msg *msg) +{ + ssize_t ret; + DECLARE_WAITQUEUE(wait, current); + struct userfaultfd_wait_queue *uwq; + + /* always take the fd_wqh lock before the fault_pending_wqh lock */ + spin_lock(&ctx->fd_wqh.lock); + __add_wait_queue(&ctx->fd_wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock(&ctx->fault_pending_wqh.lock); + uwq = find_userfault(ctx); + if (uwq) { + /* + * Use a seqcount to repeat the lockless check + * in wake_userfault() to avoid missing + * wakeups because during the refile both + * waitqueue could become empty if this is the + * only userfault. + */ + write_seqcount_begin(&ctx->refile_seq); + + /* + * The fault_pending_wqh.lock prevents the uwq + * to disappear from under us. + * + * Refile this userfault from + * fault_pending_wqh to fault_wqh, it's not + * pending anymore after we read it. + * + * Use list_del() by hand (as + * userfaultfd_wake_function also uses + * list_del_init() by hand) to be sure nobody + * changes __remove_wait_queue() to use + * list_del_init() in turn breaking the + * !list_empty_careful() check in + * handle_userfault(). The uwq->wq.task_list + * must never be empty at any time during the + * refile, or the waitqueue could disappear + * from under us. The "wait_queue_head_t" + * parameter of __remove_wait_queue() is unused + * anyway. + */ + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->fault_wqh, &uwq->wq); + + write_seqcount_end(&ctx->refile_seq); + + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; + spin_unlock(&ctx->fault_pending_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (no_wait) { + ret = -EAGAIN; + break; + } + spin_unlock(&ctx->fd_wqh.lock); + schedule(); + spin_lock(&ctx->fd_wqh.lock); + } + __remove_wait_queue(&ctx->fd_wqh, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->fd_wqh.lock); + + return ret; +} + +static ssize_t userfaultfd_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct userfaultfd_ctx *ctx = file->private_data; + ssize_t _ret, ret = 0; + struct uffd_msg msg; + int no_wait = file->f_flags & O_NONBLOCK; + + if (ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + + for (;;) { + if (count < sizeof(msg)) + return ret ? ret : -EINVAL; + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); + if (_ret < 0) + return ret ? ret : _ret; + if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) + return ret ? ret : -EFAULT; + ret += sizeof(msg); + buf += sizeof(msg); + count -= sizeof(msg); + /* + * Allow to read more than one fault at time but only + * block if waiting for the very first one. + */ + no_wait = O_NONBLOCK; + } +} + +static void __wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned long start, end; + + start = range->start; + end = range->start + range->len; + + spin_lock(&ctx->fault_pending_wqh.lock); + /* wake all in the range and autoremove */ + if (waitqueue_active(&ctx->fault_pending_wqh)) + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, + range); + if (waitqueue_active(&ctx->fault_wqh)) + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + spin_unlock(&ctx->fault_pending_wqh.lock); +} + +static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned seq; + bool need_wakeup; + + /* + * To be sure waitqueue_active() is not reordered by the CPU + * before the pagetable update, use an explicit SMP memory + * barrier here. PT lock release or up_read(mmap_sem) still + * have release semantics that can allow the + * waitqueue_active() to be reordered before the pte update. + */ + smp_mb(); + + /* + * Use waitqueue_active because it's very frequent to + * change the address space atomically even if there are no + * userfaults yet. So we take the spinlock only when we're + * sure we've userfaults to wake. + */ + do { + seq = read_seqcount_begin(&ctx->refile_seq); + need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || + waitqueue_active(&ctx->fault_wqh); + cond_resched(); + } while (read_seqcount_retry(&ctx->refile_seq, seq)); + if (need_wakeup) + __wake_userfault(ctx, range); +} + +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + __u64 task_size = mm->task_size; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; + if (start < mmap_min_addr) + return -EINVAL; + if (start >= task_size) + return -EINVAL; + if (len > task_size - start) + return -EINVAL; + return 0; +} + +static int userfaultfd_register(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_register uffdio_register; + struct uffdio_register __user *user_uffdio_register; + unsigned long vm_flags, new_flags; + bool found; + unsigned long start, end, vma_end; + + user_uffdio_register = (struct uffdio_register __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_register, user_uffdio_register, + sizeof(uffdio_register)-sizeof(__u64))) + goto out; + + ret = -EINVAL; + if (!uffdio_register.mode) + goto out; + if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| + UFFDIO_REGISTER_MODE_WP)) + goto out; + vm_flags = 0; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) + vm_flags |= VM_UFFD_MISSING; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + vm_flags |= VM_UFFD_WP; + /* + * FIXME: remove the below error constraint by + * implementing the wprotect tracking mode. + */ + ret = -EINVAL; + goto out; + } + + ret = validate_range(mm, uffdio_register.range.start, + uffdio_register.range.len); + if (ret) + goto out; + + start = uffdio_register.range.start; + end = start + uffdio_register.range.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* check not compatible vmas */ + ret = -EINVAL; + if (cur->vm_ops) + goto out_unlock; + + /* + * Check that this vma isn't already owned by a + * different userfaultfd. We can't allow more than one + * userfaultfd to own a single vma simultaneously or we + * wouldn't know which one to deliver the userfaults to. + */ + ret = -EBUSY; + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + ((struct vm_userfaultfd_ctx){ ctx })); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx.ctx = ctx; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); + if (!ret) { + /* + * Now that we scanned all vmas we can already tell + * userland which ioctls methods are guaranteed to + * succeed on this range. + */ + if (put_user(UFFD_API_RANGE_IOCTLS, + &user_uffdio_register->ioctls)) + ret = -EFAULT; + } +out: + return ret; +} + +static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_range uffdio_unregister; + unsigned long new_flags; + bool found; + unsigned long start, end, vma_end; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) + goto out; + + ret = validate_range(mm, uffdio_unregister.start, + uffdio_unregister.len); + if (ret) + goto out; + + start = uffdio_unregister.start; + end = start + uffdio_unregister.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + ret = -EINVAL; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* + * Check not compatible vmas, not strictly required + * here as not compatible vmas cannot have an + * userfaultfd_ctx registered on them, but this + * provides for more strict behavior to notice + * unregistration errors. + */ + if (cur->vm_ops) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (!vma->vm_userfaultfd_ctx.ctx) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); +out: + return ret; +} + +/* + * userfaultfd_wake may be used in combination with the + * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. + */ +static int userfaultfd_wake(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_range uffdio_wake; + struct userfaultfd_wake_range range; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) + goto out; + + ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + if (ret) + goto out; + + range.start = uffdio_wake.start; + range.len = uffdio_wake.len; + + /* + * len == 0 means wake all and we don't want to wake all here, + * so check it again to be sure. + */ + VM_BUG_ON(!range.len); + + wake_userfault(ctx, &range); + ret = 0; + +out: + return ret; +} + +static int userfaultfd_copy(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_copy uffdio_copy; + struct uffdio_copy __user *user_uffdio_copy; + struct userfaultfd_wake_range range; + + user_uffdio_copy = (struct uffdio_copy __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_copy, user_uffdio_copy, + /* don't copy "copy" last field */ + sizeof(uffdio_copy)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + if (ret) + goto out; + /* + * double check for wraparound just in case. copy_from_user() + * will later check uffdio_copy.src + uffdio_copy.len to fit + * in the userland range. + */ + ret = -EINVAL; + if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) + goto out; + if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + goto out; + + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + if (ret < 0) + goto out; + BUG_ON(!ret); + /* len == 0 would wake all */ + range.len = ret; + if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { + range.start = uffdio_copy.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_zeropage uffdio_zeropage; + struct uffdio_zeropage __user *user_uffdio_zeropage; + struct userfaultfd_wake_range range; + + user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, + /* don't copy "zeropage" last field */ + sizeof(uffdio_zeropage)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (ret) + goto out; + ret = -EINVAL; + if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) + goto out; + + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + if (ret < 0) + goto out; + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { + range.start = uffdio_zeropage.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; +out: + return ret; +} + +/* + * userland asks for a certain API version and we return which bits + * and ioctl commands are implemented in this kernel for such API + * version or -EINVAL if unknown. + */ +static int userfaultfd_api(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct uffdio_api uffdio_api; + void __user *buf = (void __user *)arg; + int ret; + + ret = -EINVAL; + if (ctx->state != UFFD_STATE_WAIT_API) + goto out; + ret = -EFAULT; + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) + goto out; + if (uffdio_api.api != UFFD_API || uffdio_api.features) { + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ret = -EINVAL; + goto out; + } + uffdio_api.features = UFFD_API_FEATURES; + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ctx->state = UFFD_STATE_RUNNING; + ret = 0; +out: + return ret; +} + +static long userfaultfd_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct userfaultfd_ctx *ctx = file->private_data; + + if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + + switch(cmd) { + case UFFDIO_API: + ret = userfaultfd_api(ctx, arg); + break; + case UFFDIO_REGISTER: + ret = userfaultfd_register(ctx, arg); + break; + case UFFDIO_UNREGISTER: + ret = userfaultfd_unregister(ctx, arg); + break; + case UFFDIO_WAKE: + ret = userfaultfd_wake(ctx, arg); + break; + case UFFDIO_COPY: + ret = userfaultfd_copy(ctx, arg); + break; + case UFFDIO_ZEROPAGE: + ret = userfaultfd_zeropage(ctx, arg); + break; + } + return ret; +} + +#ifdef CONFIG_PROC_FS +static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct userfaultfd_ctx *ctx = f->private_data; + wait_queue_t *wq; + struct userfaultfd_wait_queue *uwq; + unsigned long pending = 0, total = 0; + + spin_lock(&ctx->fault_pending_wqh.lock); + list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + pending++; + total++; + } + list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + total++; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + + /* + * If more protocols will be added, there will be all shown + * separated by a space. Like this: + * protocols: aa:... bb:... + */ + seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", + pending, total, UFFD_API, UFFD_API_FEATURES, + UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); +} +#endif + +static const struct file_operations userfaultfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = userfaultfd_show_fdinfo, +#endif + .release = userfaultfd_release, + .poll = userfaultfd_poll, + .read = userfaultfd_read, + .unlocked_ioctl = userfaultfd_ioctl, + .compat_ioctl = userfaultfd_ioctl, + .llseek = noop_llseek, +}; + +static void init_once_userfaultfd_ctx(void *mem) +{ + struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; + + init_waitqueue_head(&ctx->fault_pending_wqh); + init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->fd_wqh); + seqcount_init(&ctx->refile_seq); +} + +/** + * userfaultfd_file_create - Creates an userfaultfd file pointer. + * @flags: Flags for the userfaultfd file. + * + * This function creates an userfaultfd file pointer, w/out installing + * it into the fd table. This is useful when the userfaultfd file is + * used during the initialization of data structures that require + * extra setup after the userfaultfd creation. So the userfaultfd + * creation is split into the file pointer creation phase, and the + * file descriptor installation phase. In this way races with + * userspace closing the newly installed file descriptor can be + * avoided. Returns an userfaultfd file pointer, or a proper error + * pointer. + */ +static struct file *userfaultfd_file_create(int flags) +{ + struct file *file; + struct userfaultfd_ctx *ctx; + + BUG_ON(!current->mm); + + /* Check the UFFD_* constants for consistency. */ + BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); + + file = ERR_PTR(-EINVAL); + if (flags & ~UFFD_SHARED_FCNTL_FLAGS) + goto out; + + file = ERR_PTR(-ENOMEM); + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto out; + + atomic_set(&ctx->refcount, 1); + ctx->flags = flags; + ctx->state = UFFD_STATE_WAIT_API; + ctx->released = false; + ctx->mm = current->mm; + /* prevent the mm struct to be freed */ + atomic_inc(&ctx->mm->mm_users); + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) + kmem_cache_free(userfaultfd_ctx_cachep, ctx); +out: + return file; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = userfaultfd_file_create(flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; +} + +static int __init userfaultfd_init(void) +{ + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", + sizeof(struct userfaultfd_ctx), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + init_once_userfaultfd_ctx); + return 0; +} +__initcall(userfaultfd_init); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1fb16562c159..bbd9b1f10ffb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -511,9 +511,9 @@ xfs_showargs( seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", |