diff options
Diffstat (limited to 'fs')
468 files changed, 16900 insertions, 13433 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 6489e1fc1afd..11045d8e356a 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -25,9 +25,6 @@ config 9P_FS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N endif diff --git a/fs/Kconfig b/fs/Kconfig index 7aee6d699fd6..bc821a86d965 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -58,6 +58,13 @@ config FS_DAX_PMD depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE +# Selected by DAX drivers that do not expect filesystem DAX to support +# get_user_pages() of DAX mappings. I.e. "limited" indicates no support +# for fork() of processes with MAP_SHARED mappings or support for +# direct-I/O to a DAX mapping. +config FS_DAX_LIMITED + bool + endif # BLOCK # Posix ACL utility routines @@ -167,17 +174,13 @@ config TMPFS_POSIX_ACL files for sound to work properly. In short, if you're not sure, say Y. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - config TMPFS_XATTR bool "Tmpfs extended attributes" depends on TMPFS default n help Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). + the kernel or by users (see the attr(5) manual page for details). Currently this enables support for the trusted.* and security.* namespaces. @@ -298,7 +301,6 @@ config NFS_COMMON source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" -source "fs/ncpfs/Kconfig" source "fs/coda/Kconfig" source "fs/afs/Kconfig" source "fs/9p/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index ef772f1eaff8..add789ea270a 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -92,7 +92,6 @@ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_CIFS) += cifs/ -obj-$(CONFIG_NCP_FS) += ncpfs/ obj-$(CONFIG_HPFS_FS) += hpfs/ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 0f0e6925e97d..14a6c1b90c9f 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -10,6 +10,7 @@ */ #include <linux/math64.h> +#include <linux/iversion.h> #include "affs.h" /* @@ -60,7 +61,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) affs_brelse(dir_bh); dir->i_mtime = dir->i_ctime = current_time(dir); - dir->i_version++; + inode_inc_iversion(dir); mark_inode_dirty(dir); return 0; @@ -114,7 +115,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) affs_brelse(bh); dir->i_mtime = dir->i_ctime = current_time(dir); - dir->i_version++; + inode_inc_iversion(dir); mark_inode_dirty(dir); return retval; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index a105e77df2c1..d180b46453cf 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -14,6 +14,7 @@ * */ +#include <linux/iversion.h> #include "affs.h" static int affs_readdir(struct file *, struct dir_context *); @@ -80,7 +81,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) * we can jump directly to where we left off. */ ino = (u32)(long)file->private_data; - if (ino && file->f_version == inode->i_version) { + if (ino && inode_cmp_iversion(inode, file->f_version) == 0) { pr_debug("readdir() left off=%d\n", ino); goto inside; } @@ -130,7 +131,7 @@ inside: } while (ino); } done: - file->f_version = inode->i_version; + file->f_version = inode_query_iversion(inode); file->private_data = (void *)(long)ino; affs_brelse(fh_bh); diff --git a/fs/affs/super.c b/fs/affs/super.c index 1117e36134cc..e602619aed9d 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -21,6 +21,7 @@ #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/seq_file.h> +#include <linux/iversion.h> #include "affs.h" static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -102,7 +103,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb) if (!i) return NULL; - i->vfs_inode.i_version = 1; + inode_set_iversion(&i->vfs_inode, 1); i->i_lc = NULL; i->i_ext_bh = NULL; i->i_pa_cnt = 0; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ff8d5bf4354f..23c7f395d718 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -895,20 +895,38 @@ error: * However, if we didn't have a callback promise outstanding, or it was * outstanding on a different server, then it won't break it either... */ -static int afs_dir_remove_link(struct dentry *dentry, struct key *key) +static int afs_dir_remove_link(struct dentry *dentry, struct key *key, + unsigned long d_version_before, + unsigned long d_version_after) { + bool dir_valid; int ret = 0; + /* There were no intervening changes on the server if the version + * number we got back was incremented by exactly 1. + */ + dir_valid = (d_version_after == d_version_before + 1); + if (d_really_is_positive(dentry)) { struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - kdebug("AFS_VNODE_DELETED"); - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); - - ret = afs_validate(vnode, key); - if (ret == -ESTALE) + if (dir_valid) { + drop_nlink(&vnode->vfs_inode); + if (vnode->vfs_inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } ret = 0; + } else { + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + kdebug("AFS_VNODE_DELETED"); + + ret = afs_validate(vnode, key); + if (ret == -ESTALE) + ret = 0; + } _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); } @@ -923,6 +941,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) struct afs_fs_cursor fc; struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct key *key; + unsigned long d_version = (unsigned long)dentry->d_fsdata; int ret; _enter("{%x:%u},{%pd}", @@ -955,7 +974,9 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_vnode_commit_status(&fc, dvnode, fc.cb_break); ret = afs_end_vnode_operation(&fc); if (ret == 0) - ret = afs_dir_remove_link(dentry, key); + ret = afs_dir_remove_link( + dentry, key, d_version, + (unsigned long)dvnode->status.data_version); } error_key: diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index b90ef39ae914..88ec38c2d83c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/circ_buf.h> +#include <linux/iversion.h> #include "internal.h" #include "afs_fs.h" @@ -124,7 +125,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client; vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; - vnode->vfs_inode.i_version = data_version; + inode_set_iversion_raw(&vnode->vfs_inode, data_version); } expected_version = status->data_version; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3415eb7484f6..c7f17c44c7ce 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/mount.h> #include <linux/namei.h> +#include <linux/iversion.h> #include "internal.h" static const struct inode_operations afs_symlink_inode_operations = { @@ -89,7 +90,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; inode->i_generation = vnode->fid.unique; - inode->i_version = vnode->status.data_version; + inode_set_iversion_raw(inode, vnode->status.data_version); inode->i_mapping->a_ops = &afs_fs_aops; read_sequnlock_excl(&vnode->cb_lock); @@ -218,7 +219,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_version = 0; + inode_set_iversion_raw(inode, 0); inode->i_generation = 0; set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); @@ -377,6 +378,10 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) } read_sequnlock_excl(&vnode->cb_lock); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + clear_nlink(&vnode->vfs_inode); + if (valid) goto valid; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index ea1460b9b71a..e1126659f043 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -885,7 +885,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, { struct afs_net *net = call->net; enum afs_call_state state; - u32 remote_abort; + u32 remote_abort = 0; int ret; _enter("{%s,%zu},,%zu,%d", diff --git a/fs/afs/write.c b/fs/afs/write.c index cb5f8a3df577..9370e2feb999 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -198,7 +198,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, ret = afs_fill_page(vnode, key, pos + copied, len - copied, page); if (ret < 0) - return ret; + goto out; } SetPageUptodate(page); } @@ -206,10 +206,12 @@ int afs_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); + ret = copied; + +out: unlock_page(page); put_page(page); - - return copied; + return ret; } /* diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 961a12dc6dc8..a0c57c37fa21 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -442,8 +442,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_cred()->uid; - wq->gid = current_cred()->gid; + wq->uid = current_uid(); + wq->gid = current_gid(); wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index ee236231cafa..af2832aaeec5 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -444,11 +444,15 @@ unacquire_none: static int __init befs_init_inodecache(void) { - befs_inode_cachep = kmem_cache_create("befs_inode_cache", - sizeof (struct befs_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); + befs_inode_cachep = kmem_cache_create_usercopy("befs_inode_cache", + sizeof(struct befs_inode_info), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT), + offsetof(struct befs_inode_info, + i_data.symlink), + sizeof_field(struct befs_inode_info, + i_data.symlink), + init_once); if (befs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 83732fef510d..bdb201230bae 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1599,6 +1599,8 @@ static int fill_files_note(struct memelfnote *note) /* *Estimated* file count and total data size needed */ count = current->mm->map_count; + if (count > UINT_MAX / 64) + return -EINVAL; size = count * 64; names_ofs = (2 + 3 * count) * sizeof(data[0]); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 2e558227931a..273351ee4c46 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -38,9 +38,6 @@ config BTRFS_FS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config BTRFS_FS_CHECK_INTEGRITY diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6fe881d5cb38..0c4373628eb4 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -19,4 +19,4 @@ btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7d0dc100a09a..e4054e533f6d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -216,7 +216,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1, return 0; } -void update_share_count(struct share_check *sc, int oldcount, int newcount) +static void update_share_count(struct share_check *sc, int oldcount, + int newcount) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5982c8a71f02..07d049c0c20f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -33,7 +33,6 @@ #include <linux/bit_spinlock.h> #include <linux/slab.h> #include <linux/sched/mm.h> -#include <linux/sort.h> #include <linux/log2.h> #include "ctree.h" #include "disk-io.h" @@ -45,6 +44,21 @@ #include "extent_io.h" #include "extent_map.h" +static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; + +const char* btrfs_compress_type2str(enum btrfs_compression_type type) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: + case BTRFS_COMPRESS_LZO: + case BTRFS_COMPRESS_ZSTD: + case BTRFS_COMPRESS_NONE: + return btrfs_compress_types[type]; + } + + return NULL; +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, @@ -348,8 +362,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, page->mapping = NULL; if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_get(bio); - /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -372,8 +384,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } - bio_put(bio); - bio = btrfs_bio_alloc(bdev, first_byte); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; @@ -389,7 +399,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, first_byte += PAGE_SIZE; cond_resched(); } - bio_get(bio); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -405,13 +414,12 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } - bio_put(bio); return 0; } static u64 bio_end_offset(struct bio *bio) { - struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; + struct bio_vec *last = bio_last_bvec_all(bio); return page_offset(last->bv_page) + last->bv_len + last->bv_offset; } @@ -563,7 +571,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, - page_offset(bio->bi_io_vec->bv_page), + page_offset(bio_first_page_all(bio)), PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) @@ -638,8 +646,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -666,8 +672,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - bio_put(comp_bio); - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; @@ -677,7 +681,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } cur_disk_byte += PAGE_SIZE; } - bio_get(comp_bio); ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -693,7 +696,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - bio_put(comp_bio); return 0; fail2: @@ -752,6 +754,8 @@ struct heuristic_ws { u32 sample_size; /* Buckets store counters for each byte value */ struct bucket_item *bucket; + /* Sorting buffer */ + struct bucket_item *bucket_b; struct list_head list; }; @@ -763,6 +767,7 @@ static void free_heuristic_ws(struct list_head *ws) kvfree(workspace->sample); kfree(workspace->bucket); + kfree(workspace->bucket_b); kfree(workspace); } @@ -782,6 +787,10 @@ static struct list_head *alloc_heuristic_ws(void) if (!ws->bucket) goto fail; + ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL); + if (!ws->bucket_b) + goto fail; + INIT_LIST_HEAD(&ws->list); return &ws->list; fail: @@ -1278,13 +1287,103 @@ static u32 shannon_entropy(struct heuristic_ws *ws) return entropy_sum * 100 / entropy_max; } -/* Compare buckets by size, ascending */ -static int bucket_comp_rev(const void *lv, const void *rv) +#define RADIX_BASE 4U +#define COUNTERS_SIZE (1U << RADIX_BASE) + +static u8 get4bits(u64 num, int shift) { + u8 low4bits; + + num >>= shift; + /* Reverse order */ + low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE); + return low4bits; +} + +/* + * Use 4 bits as radix base + * Use 16 u32 counters for calculating new possition in buf array + * + * @array - array that will be sorted + * @array_buf - buffer array to store sorting results + * must be equal in size to @array + * @num - array size + */ +static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf, + int num) { - const struct bucket_item *l = (const struct bucket_item *)lv; - const struct bucket_item *r = (const struct bucket_item *)rv; + u64 max_num; + u64 buf_num; + u32 counters[COUNTERS_SIZE]; + u32 new_addr; + u32 addr; + int bitlen; + int shift; + int i; - return r->count - l->count; + /* + * Try avoid useless loop iterations for small numbers stored in big + * counters. Example: 48 33 4 ... in 64bit array + */ + max_num = array[0].count; + for (i = 1; i < num; i++) { + buf_num = array[i].count; + if (buf_num > max_num) + max_num = buf_num; + } + + buf_num = ilog2(max_num); + bitlen = ALIGN(buf_num, RADIX_BASE * 2); + + shift = 0; + while (shift < bitlen) { + memset(counters, 0, sizeof(counters)); + + for (i = 0; i < num; i++) { + buf_num = array[i].count; + addr = get4bits(buf_num, shift); + counters[addr]++; + } + + for (i = 1; i < COUNTERS_SIZE; i++) + counters[i] += counters[i - 1]; + + for (i = num - 1; i >= 0; i--) { + buf_num = array[i].count; + addr = get4bits(buf_num, shift); + counters[addr]--; + new_addr = counters[addr]; + array_buf[new_addr] = array[i]; + } + + shift += RADIX_BASE; + + /* + * Normal radix expects to move data from a temporary array, to + * the main one. But that requires some CPU time. Avoid that + * by doing another sort iteration to original array instead of + * memcpy() + */ + memset(counters, 0, sizeof(counters)); + + for (i = 0; i < num; i ++) { + buf_num = array_buf[i].count; + addr = get4bits(buf_num, shift); + counters[addr]++; + } + + for (i = 1; i < COUNTERS_SIZE; i++) + counters[i] += counters[i - 1]; + + for (i = num - 1; i >= 0; i--) { + buf_num = array_buf[i].count; + addr = get4bits(buf_num, shift); + counters[addr]--; + new_addr = counters[addr]; + array[new_addr] = array_buf[i]; + } + + shift += RADIX_BASE; + } } /* @@ -1314,7 +1413,7 @@ static int byte_core_set_size(struct heuristic_ws *ws) struct bucket_item *bucket = ws->bucket; /* Sort in reverse order */ - sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL); + radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE); for (i = 0; i < BYTE_CORE_SET_LOW; i++) coreset_sum += bucket[i].count; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 0868cc554f14..677fa4aa0bd7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -75,7 +75,7 @@ struct compressed_bio { u32 sums; }; -void btrfs_init_compress(void); +void __init btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, @@ -137,6 +137,8 @@ extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op btrfs_zstd_compress; +const char* btrfs_compress_type2str(enum btrfs_compression_type type); + int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1e74cf826532..b88a79e69ddf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1807,8 +1807,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb, * simple bin_search frontend that does the right thing for * leaves vs nodes */ -static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int level, int *slot) +int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, + int level, int *slot) { if (level == 0) return generic_bin_search(eb, @@ -1824,12 +1824,6 @@ static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key, slot); } -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int level, int *slot) -{ - return bin_search(eb, key, level, slot); -} - static void root_add_used(struct btrfs_root *root, u32 size) { spin_lock(&root->accounting_lock); @@ -2614,7 +2608,7 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key, int level, int *prev_cmp, int *slot) { if (*prev_cmp != 0) { - *prev_cmp = bin_search(b, key, level, slot); + *prev_cmp = btrfs_bin_search(b, key, level, slot); return *prev_cmp; } @@ -2660,17 +2654,29 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, } /* - * look for key in the tree. path is filled in with nodes along the way - * if key is found, we return zero and you can find the item in the leaf - * level of the path (level 0) + * btrfs_search_slot - look for a key in a tree and perform necessary + * modifications to preserve tree invariants. + * + * @trans: Handle of transaction, used when modifying the tree + * @p: Holds all btree nodes along the search path + * @root: The root node of the tree + * @key: The key we are looking for + * @ins_len: Indicates purpose of search, for inserts it is 1, for + * deletions it's -1. 0 for plain searches + * @cow: boolean should CoW operations be performed. Must always be 1 + * when modifying the tree. + * + * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. + * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) * - * If the key isn't found, the path points to the slot where it should - * be inserted, and 1 is returned. If there are other errors during the - * search a negative error number is returned. + * If @key is found, 0 is returned and you can find the item in the leaf level + * of the path (level 0) * - * if ins_len > 0, nodes and leaves will be split as we walk down the - * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if - * possible) + * If @key isn't found, 1 is returned and the leaf level of the path (level 0) + * points to the slot where it should be inserted + * + * If an error is encountered while searching the tree a negative error number + * is returned */ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, @@ -2774,6 +2780,8 @@ again: * contention with the cow code */ if (cow) { + bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); + /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -2798,9 +2806,13 @@ again: } btrfs_set_path_blocking(p); - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); + if (last_level) + err = btrfs_cow_block(trans, root, b, NULL, 0, + &b); + else + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); if (err) { ret = err; goto done; @@ -5175,7 +5187,7 @@ again: while (1) { nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); - sret = bin_search(cur, min_key, level, &slot); + sret = btrfs_bin_search(cur, min_key, level, &slot); /* at the lowest level, we're done, setup the path and exit */ if (level == path->lowest_level) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 13c260b525a1..1a462ab85c49 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -679,7 +679,6 @@ enum btrfs_orphan_cleanup_state { /* used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; - wait_queue_head_t wait; spinlock_t lock; }; @@ -3060,15 +3059,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, int slot, - struct btrfs_dir_item *dir_item); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); -bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, - unsigned long start, u16 name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3197,7 +3191,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); -int btrfs_init_cachep(void); +int __init btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, @@ -3248,7 +3242,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff); /* file.c */ -int btrfs_auto_defrag_init(void); +int __init btrfs_auto_defrag_init(void); void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); @@ -3283,7 +3277,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* sysfs.c */ -int btrfs_init_sysfs(void); +int __init btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 5d73f79ded8b..0530f6f2e4ba 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -18,6 +18,7 @@ */ #include <linux/slab.h> +#include <linux/iversion.h> #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" @@ -87,6 +88,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_lock(&root->inode_lock); node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ @@ -94,9 +96,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_unlock(&root->inode_lock); return node; } - btrfs_inode->delayed_node = node; - /* can be accessed and cached in the inode */ - refcount_add(2, &node->refs); + + /* + * It's possible that we're racing into the middle of removing + * this node from the radix tree. In this case, the refcount + * was zero and it should never go back to one. Just return + * NULL like it was never in the radix at all; our release + * function is in the process of removing it. + * + * Some implementations of refcount_inc refuse to bump the + * refcount once it has hit zero. If we don't do this dance + * here, refcount_inc() may decide to just WARN_ONCE() instead + * of actually bumping the refcount. + * + * If this node is properly in the radix, we want to bump the + * refcount twice, once for the inode and once for this get + * operation. + */ + if (refcount_inc_not_zero(&node->refs)) { + refcount_inc(&node->refs); + btrfs_inode->delayed_node = node; + } else { + node = NULL; + } + spin_unlock(&root->inode_lock); return node; } @@ -254,17 +277,18 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (refcount_dec_and_test(&delayed_node->refs)) { - bool free = false; struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); - if (refcount_read(&delayed_node->refs) == 0) { - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); - free = true; - } + /* + * Once our refcount goes to zero, nobody is allowed to bump it + * back up. We can delete it now. + */ + ASSERT(refcount_read(&delayed_node->refs) == 0); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); - if (free) - kmem_cache_free(delayed_node_cache, delayed_node); + kmem_cache_free(delayed_node_cache, delayed_node); } } @@ -1279,40 +1303,42 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) if (!path) goto out; -again: - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) - goto free_path; + do { + if (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND / 2) + break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); - if (!delayed_node) - goto free_path; + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + break; - path->leave_spinning = 1; - root = delayed_node->root; + path->leave_spinning = 1; + root = delayed_node->root; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto release_path; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; + continue; + } - block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->delayed_block_rsv; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; - __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - trans->block_rsv = block_rsv; - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty_nodelay(root->fs_info); + trans->block_rsv = block_rsv; + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty_nodelay(root->fs_info); -release_path: - btrfs_release_path(path); - total_done++; + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; - btrfs_release_prepared_delayed_node(delayed_node); - if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || - total_done < async_work->nr) - goto again; + } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) + || total_done < async_work->nr); -free_path: btrfs_free_path(path); out: wake_up(&delayed_root->wait); @@ -1325,10 +1351,6 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, { struct btrfs_async_delayed_work *async_work; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || - btrfs_workqueue_normal_congested(fs_info->delayed_workers)) - return 0; - async_work = kmalloc(sizeof(*async_work), GFP_NOFS); if (!async_work) return -ENOMEM; @@ -1364,7 +1386,8 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) return; if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { @@ -1610,28 +1633,18 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index) { - struct btrfs_delayed_item *curr, *next; - int ret; - - if (list_empty(del_list)) - return 0; + struct btrfs_delayed_item *curr; + int ret = 0; - list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_for_each_entry(curr, del_list, readdir_list) { if (curr->key.offset > index) break; - - list_del(&curr->readdir_list); - ret = (curr->key.offset == index); - - if (refcount_dec_and_test(&curr->refs)) - kfree(curr); - - if (ret) - return 1; - else - continue; + if (curr->key.offset == index) { + ret = 1; + break; + } } - return 0; + return ret; } /* @@ -1700,7 +1713,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); btrfs_set_stack_inode_generation(inode_item, BTRFS_I(inode)->generation); - btrfs_set_stack_inode_sequence(inode_item, inode->i_version); + btrfs_set_stack_inode_sequence(inode_item, + inode_peek_iversion(inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); @@ -1754,7 +1768,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - inode->i_version = btrfs_stack_inode_sequence(inode_item); + inode_set_iversion_queried(inode, + btrfs_stack_inode_sequence(inode_item)); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 83be8f9fd906..a1a40cf382e3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -937,7 +937,7 @@ void btrfs_delayed_ref_exit(void) kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } -int btrfs_delayed_ref_init(void) +int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a43af432f859..c4f625e5a691 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -203,7 +203,7 @@ extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; extern struct kmem_cache *btrfs_delayed_data_ref_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; -int btrfs_delayed_ref_init(void); +int __init btrfs_delayed_ref_init(void); void btrfs_delayed_ref_exit(void); static inline struct btrfs_delayed_extent_op * diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7c655f9a7a50..7efbc4d1128b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -172,7 +172,8 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev->commit_bytes_used = dev_replace->srcdev->commit_bytes_used; } - dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &dev_replace->tgtdev->dev_state); btrfs_init_dev_replace_tgtdev_for_resume(fs_info, dev_replace->tgtdev); } @@ -304,6 +305,14 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } +static char* btrfs_dev_name(struct btrfs_device *device) +{ + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + return "<missing disk>"; + else + return rcu_str_deref(device->name); +} + int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) @@ -363,8 +372,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s started", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), + btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name)); @@ -538,8 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } else { btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), + btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace, 1); @@ -557,11 +564,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s finished", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), + btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name)); - tgt_device->is_tgtdev_for_dev_replace = 0; + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); @@ -814,12 +820,10 @@ static int btrfs_dev_replace_kthread(void *data) progress = btrfs_dev_replace_progress(fs_info); progress = div_u64(progress, 10); btrfs_info_in_rcu(fs_info, - "continuing dev_replace from %s (devid %llu) to %s @%u%%", - dev_replace->srcdev->missing ? "<missing disk>" - : rcu_str_deref(dev_replace->srcdev->name), + "continuing dev_replace from %s (devid %llu) to target %s @%u%%", + btrfs_dev_name(dev_replace->srcdev), dev_replace->srcdev->devid, - dev_replace->tgtdev ? rcu_str_deref(dev_replace->tgtdev->name) - : "<missing target disk>", + btrfs_dev_name(dev_replace->tgtdev), (unsigned int)progress); btrfs_dev_replace_continue_on_mount(fs_info); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 41cb9196eaa8..cbe421605cd5 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -403,8 +403,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); - if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item)) - return NULL; if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; @@ -450,109 +448,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, } return ret; } - -int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - int slot, - struct btrfs_dir_item *dir_item) -{ - u16 namelen = BTRFS_NAME_LEN; - int ret; - u8 type = btrfs_dir_type(leaf, dir_item); - - if (type >= BTRFS_FT_MAX) { - btrfs_crit(fs_info, "invalid dir item type: %d", (int)type); - return 1; - } - - if (type == BTRFS_FT_XATTR) - namelen = XATTR_NAME_MAX; - - if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned)btrfs_dir_name_len(leaf, dir_item)); - return 1; - } - - namelen = btrfs_dir_name_len(leaf, dir_item); - ret = btrfs_is_name_len_valid(leaf, slot, - (unsigned long)(dir_item + 1), namelen); - if (!ret) - return 1; - - /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ - if ((btrfs_dir_data_len(leaf, dir_item) + - btrfs_dir_name_len(leaf, dir_item)) > - BTRFS_MAX_XATTR_SIZE(fs_info)) { - btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u", - (unsigned)btrfs_dir_name_len(leaf, dir_item), - (unsigned)btrfs_dir_data_len(leaf, dir_item)); - return 1; - } - - return 0; -} - -bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, - unsigned long start, u16 name_len) -{ - struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_key key; - u32 read_start; - u32 read_end; - u32 item_start; - u32 item_end; - u32 size; - bool ret = true; - - ASSERT(start > BTRFS_LEAF_DATA_OFFSET); - - read_start = start - BTRFS_LEAF_DATA_OFFSET; - read_end = read_start + name_len; - item_start = btrfs_item_offset_nr(leaf, slot); - item_end = btrfs_item_end_nr(leaf, slot); - - btrfs_item_key_to_cpu(leaf, &key, slot); - - switch (key.type) { - case BTRFS_DIR_ITEM_KEY: - case BTRFS_XATTR_ITEM_KEY: - case BTRFS_DIR_INDEX_KEY: - size = sizeof(struct btrfs_dir_item); - break; - case BTRFS_INODE_REF_KEY: - size = sizeof(struct btrfs_inode_ref); - break; - case BTRFS_INODE_EXTREF_KEY: - size = sizeof(struct btrfs_inode_extref); - break; - case BTRFS_ROOT_REF_KEY: - case BTRFS_ROOT_BACKREF_KEY: - size = sizeof(struct btrfs_root_ref); - break; - default: - ret = false; - goto out; - } - - if (read_start < item_start) { - ret = false; - goto out; - } - if (read_end > item_end) { - ret = false; - goto out; - } - - /* there shall be item(s) before name */ - if (read_start - item_start < size) { - ret = false; - goto out; - } - -out: - if (!ret) - btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned int)name_len); - return ret; -} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8ecccfc36de..21f34ad0d411 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -30,6 +30,7 @@ #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> +#include <linux/error-injection.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -61,7 +62,8 @@ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ BTRFS_SUPER_FLAG_SEEDING |\ - BTRFS_SUPER_FLAG_METADUMP) + BTRFS_SUPER_FLAG_METADUMP |\ + BTRFS_SUPER_FLAG_METADUMP_V2) static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -220,7 +222,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ -static struct extent_map *btree_get_extent(struct btrfs_inode *inode, +struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { @@ -285,7 +287,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, int verify) { u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - char *result = NULL; + char result[BTRFS_CSUM_SIZE]; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -294,7 +296,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, unsigned long map_len; int err; u32 crc = ~(u32)0; - unsigned long inline_result; len = buf->len - offset; while (len > 0) { @@ -308,13 +309,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len -= cur_len; offset += cur_len; } - if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size, GFP_NOFS); - if (!result) - return -ENOMEM; - } else { - result = (char *)&inline_result; - } + memset(result, 0, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); @@ -329,15 +324,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, "%s checksum verify failed on %llu wanted %X found %X level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); - if (result != (char *)&inline_result) - kfree(result); return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); } - if (result != (char *)&inline_result) - kfree(result); + return 0; } @@ -391,7 +383,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state, GFP_NOFS); + &cached_state); if (need_lock) btrfs_tree_read_unlock_blocking(eb); return ret; @@ -455,7 +447,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, - btree_get_extent, mirror_num); + mirror_num); if (!ret) { if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) @@ -1012,7 +1004,7 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, btree_get_extent, 0); + buf, WAIT_NONE, 0); free_extent_buffer(buf); } @@ -1031,7 +1023,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, - btree_get_extent, mirror_num); + mirror_num); if (ret) { free_extent_buffer(buf); return ret; @@ -1243,7 +1235,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - uuid_le uuid; + uuid_le uuid = NULL_UUID_LE; root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) @@ -1284,7 +1276,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - uuid_le_gen(&uuid); + if (is_fstree(objectid)) + uuid_le_gen(&uuid); memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0; @@ -2875,7 +2868,7 @@ retry_root_backup: goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) { + if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writeable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@ -3123,6 +3116,7 @@ recovery_tree_root: goto fail_block_groups; goto retry_root_backup; } +ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { @@ -3357,7 +3351,7 @@ static void write_dev_flush(struct btrfs_device *device) bio->bi_private = &device->flush_wait; btrfsic_submit_bio(bio); - device->flush_bio_sent = 1; + set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } /* @@ -3367,10 +3361,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) { struct bio *bio = device->flush_bio; - if (!device->flush_bio_sent) + if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; - device->flush_bio_sent = 0; + clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); return bio->bi_status; @@ -3378,7 +3372,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) static int check_barrier_error(struct btrfs_fs_info *fs_info) { - if (!btrfs_check_rw_degradable(fs_info)) + if (!btrfs_check_rw_degradable(fs_info, NULL)) return -EIO; return 0; } @@ -3394,14 +3388,16 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int errors_wait = 0; blk_status_t ret; + lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ head = &info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; write_dev_flush(dev); @@ -3409,14 +3405,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) } /* wait for all the barriers */ - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) { errors_wait++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_flush(dev); @@ -3508,12 +3505,13 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } } - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; btrfs_set_stack_device_generation(dev_item, 0); @@ -3549,10 +3547,11 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } total_errors = 0; - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_supers(dev, max_mirrors); @@ -3910,9 +3909,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) - btrfs_warn(fs_info, "unrecognized super flag: %llu", + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { + btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 7f7c35d6347a..301151a50ac1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -149,6 +149,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); +struct extent_map *btree_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, u64 start, u64 len, + int create); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 3aeb5770f896..ddaccad469f8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -283,11 +283,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, name_len = btrfs_inode_ref_name_len(leaf, iref); } - ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len); - if (!ret) { - btrfs_free_path(path); - return -EIO; - } read_extent_buffer(leaf, name, name_ptr, name_len); btrfs_free_path(path); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f4328511ac8..05751a677da4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2145,7 +2145,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; - if (!stripe->dev->can_discard) + struct request_queue *req_q; + + req_q = bdev_get_queue(stripe->dev->bdev); + if (!blk_queue_discard(req_q)) continue; ret = btrfs_issue_discard(stripe->dev->bdev, @@ -2894,7 +2897,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; @@ -4945,12 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, bytes = 0; else bytes -= delayed_rsv->size; + spin_unlock(&delayed_rsv->lock); + if (percpu_counter_compare(&space_info->total_bytes_pinned, bytes) < 0) { - spin_unlock(&delayed_rsv->lock); return -ENOSPC; } - spin_unlock(&delayed_rsv->lock); commit: trans = btrfs_join_transaction(fs_info->extent_root); @@ -5738,8 +5741,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, * or return if we already have enough space. This will also handle the resreve * tracepoint for the reserved amount. */ -int btrfs_inode_rsv_refill(struct btrfs_inode *inode, - enum btrfs_reserve_flush_enum flush) +static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, + enum btrfs_reserve_flush_enum flush) { struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; @@ -5770,7 +5773,7 @@ int btrfs_inode_rsv_refill(struct btrfs_inode *inode, * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. */ -void btrfs_inode_rsv_release(struct btrfs_inode *inode) +static void btrfs_inode_rsv_release(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -9690,7 +9693,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free && - !device->is_tgtdev_for_dev_replace) { + !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = find_free_dev_extent(trans, device, min_free, &dev_offset, NULL); if (!ret) @@ -10875,7 +10878,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, *trimmed = 0; /* Not writeable = nothing to do. */ - if (!device->writeable) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; /* No free space = nothing to do. */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 012d63870b99..dfeb74a0be77 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -21,6 +21,7 @@ #include "locking.h" #include "rcu-string.h" #include "backref.h" +#include "disk-io.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -109,8 +110,6 @@ struct tree_entry { struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; - get_extent_t *get_extent; - /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -139,7 +138,8 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits, BUG_ON(ret < 0); } -static noinline void flush_write_bio(void *data); +static void flush_write_bio(struct extent_page_data *epd); + static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { @@ -581,7 +581,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask, struct extent_changeset *changeset) @@ -1295,10 +1295,10 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask) + struct extent_state **cached) { return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, mask, NULL); + cached, GFP_NOFS, NULL); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1348,7 +1348,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); + EXTENT_LOCKED, 1, 0, NULL); return 0; } return 1; @@ -1648,7 +1648,7 @@ again: EXTENT_DELALLOC, 1, cached_state); if (!ret) { unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state, GFP_NOFS); + &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); @@ -1744,7 +1744,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, unsigned long page_ops) { clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, - NULL, GFP_NOFS); + NULL); __process_pages_contig(inode->i_mapping, locked_page, start >> PAGE_SHIFT, end >> PAGE_SHIFT, @@ -2027,7 +2027,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio->bi_iter.bi_sector = sector; dev = bbio->stripes[bbio->mirror_num - 1].dev; btrfs_put_bbio(bbio); - if (!dev || !dev->bdev || !dev->writeable) { + if (!dev || !dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; @@ -2257,7 +2258,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, return 0; } -bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, +bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int failed_mirror) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2281,7 +2282,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, * a) deliver good data to the caller * b) correct the bad sectors on disk */ - if (failed_bio->bi_vcnt > 1) { + if (failed_bio_pages > 1) { /* * to fulfill b), we need to know the exact failing sectors, as * we don't want to rewrite any more than the failed ones. thus, @@ -2374,6 +2375,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode = 0; blk_status_t status; int ret; + unsigned failed_bio_pages = bio_pages_all(failed_bio); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2381,13 +2383,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, if (ret) return ret; - if (!btrfs_check_repairable(inode, failed_bio, failrec, + if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); return -EIO; } - if (failed_bio->bi_vcnt > 1) + if (failed_bio_pages > 1) read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; @@ -2492,7 +2494,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, if (uptodate && tree->track_uptodate) set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); - unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(tree, start, end, &cached); } /* @@ -2724,7 +2726,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { blk_status_t ret = 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio_last_bvec_all(bio); struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; u64 start; @@ -2732,7 +2734,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - bio_get(bio); if (tree->ops) ret = tree->ops->submit_bio_hook(tree->private_data, bio, @@ -2740,7 +2741,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, else btrfsic_submit_bio(bio); - bio_put(bio); return blk_status_to_errno(ret); } @@ -2942,8 +2942,7 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + cur + iosize - 1, &cached); break; } em = __get_extent_map(inode, page, pg_offset, cur, @@ -3036,8 +3035,7 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + cur + iosize - 1, &cached); cur = cur + iosize; pg_offset += iosize; continue; @@ -3092,9 +3090,8 @@ out: static inline void __do_contiguous_readpages(struct extent_io_tree *tree, struct page *pages[], int nr_pages, u64 start, u64 end, - get_extent_t *get_extent, struct extent_map **em_cached, - struct bio **bio, int mirror_num, + struct bio **bio, unsigned long *bio_flags, u64 *prev_em_start) { @@ -3115,18 +3112,17 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, } for (index = 0; index < nr_pages; index++) { - __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, 0, prev_em_start); + __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, + bio, 0, bio_flags, 0, prev_em_start); put_page(pages[index]); } } static void __extent_readpages(struct extent_io_tree *tree, struct page *pages[], - int nr_pages, get_extent_t *get_extent, + int nr_pages, struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, + struct bio **bio, unsigned long *bio_flags, u64 *prev_em_start) { u64 start = 0; @@ -3146,8 +3142,8 @@ static void __extent_readpages(struct extent_io_tree *tree, } else { __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, - end, get_extent, em_cached, - bio, mirror_num, bio_flags, + end, em_cached, + bio, bio_flags, prev_em_start); start = page_start; end = start + PAGE_SIZE - 1; @@ -3158,9 +3154,8 @@ static void __extent_readpages(struct extent_io_tree *tree, if (end) __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, - end, get_extent, em_cached, bio, - mirror_num, bio_flags, - prev_em_start); + end, em_cached, bio, + bio_flags, prev_em_start); } static int __extent_read_full_page(struct extent_io_tree *tree, @@ -3375,7 +3370,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, NULL, 1); break; } - em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); @@ -3458,10 +3453,9 @@ done: * and the end_io handler clears the writeback ranges */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) + struct extent_page_data *epd) { struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; int ret; @@ -3895,8 +3889,7 @@ retry: * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function + * @data: data passed to __extent_writepage function * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, @@ -3908,8 +3901,7 @@ retry: */ static int extent_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, - writepage_t writepage, void *data, - void (*flush_fn)(void *)) + struct extent_page_data *epd) { struct inode *inode = mapping->host; int ret = 0; @@ -3973,7 +3965,7 @@ retry: * mapping */ if (!trylock_page(page)) { - flush_fn(data); + flush_write_bio(epd); lock_page(page); } @@ -3984,7 +3976,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - flush_fn(data); + flush_write_bio(epd); wait_on_page_writeback(page); } @@ -3994,7 +3986,7 @@ retry: continue; } - ret = (*writepage)(page, wbc, data); + ret = __extent_writepage(page, wbc, epd); if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { unlock_page(page); @@ -4042,7 +4034,7 @@ retry: return ret; } -static void flush_epd_write_bio(struct extent_page_data *epd) +static void flush_write_bio(struct extent_page_data *epd) { if (epd->bio) { int ret; @@ -4053,37 +4045,28 @@ static void flush_epd_write_bio(struct extent_page_data *epd) } } -static noinline void flush_write_bio(void *data) -{ - struct extent_page_data *epd = data; - flush_epd_write_bio(epd); -} - -int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc) +int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; struct extent_page_data epd = { .bio = NULL, - .tree = tree, - .get_extent = get_extent, + .tree = &BTRFS_I(page->mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = __extent_writepage(page, wbc, &epd); - flush_epd_write_bio(&epd); + flush_write_bio(&epd); return ret; } -int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode) { int ret = 0; struct address_space *mapping = inode->i_mapping; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *page; unsigned long nr_pages = (end - start + PAGE_SIZE) >> PAGE_SHIFT; @@ -4091,7 +4074,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, struct extent_page_data epd = { .bio = NULL, .tree = tree, - .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4117,34 +4099,30 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_SIZE; } - flush_epd_write_bio(&epd); + flush_write_bio(&epd); return ret; } int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, - get_extent_t *get_extent, struct writeback_control *wbc) { int ret = 0; struct extent_page_data epd = { .bio = NULL, .tree = tree, - .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; - ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, - flush_write_bio); - flush_epd_write_bio(&epd); + ret = extent_write_cache_pages(mapping, wbc, &epd); + flush_write_bio(&epd); return ret; } int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent) + struct list_head *pages, unsigned nr_pages) { struct bio *bio = NULL; unsigned page_idx; @@ -4170,13 +4148,13 @@ int extent_readpages(struct extent_io_tree *tree, pagepool[nr++] = page; if (nr < ARRAY_SIZE(pagepool)) continue; - __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, &prev_em_start); + __extent_readpages(tree, pagepool, nr, &em_cached, &bio, + &bio_flags, &prev_em_start); nr = 0; } if (nr) - __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, &prev_em_start); + __extent_readpages(tree, pagepool, nr, &em_cached, &bio, + &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); @@ -4209,7 +4187,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state, GFP_NOFS); + 1, 1, &cached_state); return 0; } @@ -4234,9 +4212,9 @@ static int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - ret = clear_extent_bit(tree, start, end, + ret = __clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask); + 0, 0, NULL, mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -4302,9 +4280,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, * This maps until we find something past 'last' */ static struct extent_map *get_extent_skip_holes(struct inode *inode, - u64 offset, - u64 last, - get_extent_t *get_extent) + u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; @@ -4318,15 +4294,14 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset, + len, 0); if (IS_ERR_OR_NULL(em)) return em; /* if this isn't a hole return it */ - if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && - em->block_start != EXTENT_MAP_HOLE) { + if (em->block_start != EXTENT_MAP_HOLE) return em; - } /* this is a hole, advance to the next extent */ offset = extent_map_end(em); @@ -4451,7 +4426,7 @@ static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, } int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent) + __u64 start, __u64 len) { int ret = 0; u64 off = start; @@ -4533,8 +4508,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); - em = get_extent_skip_holes(inode, start, last_for_get_extent, - get_extent); + em = get_extent_skip_holes(inode, start, last_for_get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -4622,8 +4596,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent, - get_extent); + em = get_extent_skip_holes(inode, off, last_for_get_extent); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -4647,7 +4620,7 @@ out_free: out: btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, - &cached_state, GFP_NOFS); + &cached_state); return ret; } @@ -5263,8 +5236,7 @@ int extent_buffer_uptodate(struct extent_buffer *eb) } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, - get_extent_t *get_extent, int mirror_num) + struct extent_buffer *eb, int wait, int mirror_num) { unsigned long i; struct page *page; @@ -5324,7 +5296,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, - get_extent, &bio, + btree_get_extent, &bio, mirror_num, &bio_flags, REQ_META); if (err) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 93dcae0c3183..a7a850abd600 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -300,19 +300,29 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask); + struct extent_state **cached); +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); } static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached, gfp_t mask) + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_NOFS, NULL); +} + +static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, + u64 start, u64 end, struct extent_state **cached) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_ATOMIC, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, @@ -323,8 +333,7 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, - GFP_NOFS); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -340,10 +349,10 @@ static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, GFP_NOFS, NULL); } static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -358,7 +367,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + EXTENT_DO_ACCOUNTING, 0, 0, NULL); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -401,24 +410,19 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); -int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc); -int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, +int extent_write_full_page(struct page *page, struct writeback_control *wbc); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, - get_extent_t *get_extent, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent); + struct list_head *pages, unsigned nr_pages); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent); + __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, @@ -437,7 +441,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, int wait, - get_extent_t *get_extent, int mirror_num); + int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); static inline unsigned long num_extent_pages(u64 start, u64 len) @@ -540,7 +544,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, struct io_failure_record **failrec_ret); -bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, +bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int fail_mirror); struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2e348fb0b280..d3bd02105d1c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -454,3 +454,135 @@ void replace_extent_mapping(struct extent_map_tree *tree, setup_extent_mapping(tree, new, modified); } + +static struct extent_map *next_extent_map(struct extent_map *em) +{ + struct rb_node *next; + + next = rb_next(&em->rb_node); + if (!next) + return NULL; + return container_of(next, struct extent_map, rb_node); +} + +static struct extent_map *prev_extent_map(struct extent_map *em) +{ + struct rb_node *prev; + + prev = rb_prev(&em->rb_node); + if (!prev) + return NULL; + return container_of(prev, struct extent_map, rb_node); +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * the existing extent is the nearest extent to map_start, + * and an extent that you want to insert, deal with overlap and insert + * the best fitted new extent into the tree. + */ +static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start) +{ + struct extent_map *prev; + struct extent_map *next; + u64 start; + u64 end; + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + + if (existing->start > map_start) { + next = existing; + prev = prev_extent_map(next); + } else { + prev = existing; + next = next_extent_map(prev); + } + + start = prev ? extent_map_end(prev) : em->start; + start = max_t(u64, start, em->start); + end = next ? next->start : extent_map_end(em); + end = min_t(u64, end, extent_map_end(em)); + start_diff = start - em->start; + em->start = start; + em->len = end - start; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len = em->len; + } + return add_extent_mapping(em_tree, em, 0); +} + +/** + * btrfs_add_extent_mapping - add extent mapping into em_tree + * @em_tree - the extent tree into which we want to insert the extent mapping + * @em_in - extent we are inserting + * @start - start of the logical range btrfs_get_extent() is requesting + * @len - length of the logical range btrfs_get_extent() is requesting + * + * Note that @em_in's range may be different from [start, start+len), + * but they must be overlapped. + * + * Insert @em_in into @em_tree. In case there is an overlapping range, handle + * the -EEXIST by either: + * a) Returning the existing extent in @em_in if @start is within the + * existing em. + * b) Merge the existing extent with @em_in passed in. + * + * Return 0 on success, otherwise -EEXIST. + * + */ +int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map **em_in, u64 start, u64 len) +{ + int ret; + struct extent_map *em = *em_in; + + ret = add_extent_mapping(em_tree, em, 0); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = search_extent_mapping(em_tree, start, len); + /* + * existing will always be non-NULL, since there must be + * extent causing the -EEXIST. + */ + if (start >= existing->start && + start < extent_map_end(existing)) { + free_extent_map(em); + *em_in = existing; + ret = 0; + } else { + u64 orig_start = em->start; + u64 orig_len = em->len; + + /* + * The existing extent map is the one nearest to + * the [start, start + len) range which overlaps + */ + ret = merge_extent_mapping(em_tree, existing, + em, start); + if (ret) { + free_extent_map(em); + *em_in = NULL; + WARN_ONCE(ret, +"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", + ret, existing->start, existing->len, + orig_start, orig_len); + } + free_extent_map(existing); + } + } + + ASSERT(ret == 0 || ret == -EEXIST); + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 64365bbc9b16..b29f77bc0732 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,7 +13,6 @@ /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ #define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ @@ -92,4 +91,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); +int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map **em_in, u64 start, u64 len); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eb1bac7c8553..41ab9073d1d4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/btrfs.h> #include <linux/uio.h> +#include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1504,7 +1505,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered->file_offset + ordered->len > start_pos && ordered->file_offset <= last_pos) { unlock_extent_cached(&inode->io_tree, start_pos, - last_pos, cached_state, GFP_NOFS); + last_pos, cached_state); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); @@ -1519,7 +1520,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, clear_extent_bit(&inode->io_tree, start_pos, last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + 0, 0, cached_state); *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -1755,11 +1756,10 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, - pos, copied, NULL); + pos, copied, &cached_state); if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state, - GFP_NOFS); + lockstart, lockend, &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); @@ -2019,10 +2019,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp) static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) { int ret; + struct blk_plug plug; + /* + * This is only called in fsync, which would do synchronous writes, so + * a plug can merge adjacent IOs as much as possible. Esp. in case of + * multiple disks using raid profile, a large IO can be split to + * several segments of stripe length (currently 64K). + */ + blk_start_plug(&plug); atomic_inc(&BTRFS_I(inode)->sync_writers); ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); + blk_finish_plug(&plug); return ret; } @@ -2450,6 +2459,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) return ret; } +static int btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) +{ + while (1) { + struct btrfs_ordered_extent *ordered; + int ret; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len <= lockstart || + ordered->file_offset > lockend)) && + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, cached_state); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) + return ret; + } + return 0; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2566,38 +2615,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - while (1) { - struct btrfs_ordered_extent *ordered; - - truncate_pagecache_range(inode, lockstart, lockend); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); - - /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. - */ - if ((!ordered || - (ordered->file_offset + ordered->len <= lockstart || - ordered->file_offset > lockend)) && - !btrfs_page_exists_in_range(inode, lockstart, lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state, GFP_NOFS); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) { - inode_unlock(inode); - return ret; - } + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) { + inode_unlock(inode); + goto out_only_mutex; } path = btrfs_alloc_path(); @@ -2742,7 +2764,7 @@ out_free: btrfs_free_block_rsv(fs_info, rsv); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state, GFP_NOFS); + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret && !err) { /* @@ -2806,6 +2828,234 @@ insert: return 0; } +static int btrfs_fallocate_update_isize(struct inode *inode, + const u64 end, + const int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + int ret2; + + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) + return 0; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->i_ctime = current_time(inode); + i_size_write(inode, end); + btrfs_ordered_update_i_size(inode, end, NULL); + ret = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_end_transaction(trans); + + return ret ? ret : ret2; +} + +enum { + RANGE_BOUNDARY_WRITTEN_EXTENT = 0, + RANGE_BOUNDARY_PREALLOC_EXTENT = 1, + RANGE_BOUNDARY_HOLE = 2, +}; + +static int btrfs_zero_range_check_range_boundary(struct inode *inode, + u64 offset) +{ + const u64 sectorsize = btrfs_inode_sectorsize(inode); + struct extent_map *em; + int ret; + + offset = round_down(offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start == EXTENT_MAP_HOLE) + ret = RANGE_BOUNDARY_HOLE; + else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + ret = RANGE_BOUNDARY_PREALLOC_EXTENT; + else + ret = RANGE_BOUNDARY_WRITTEN_EXTENT; + + free_extent_map(em); + return ret; +} + +static int btrfs_zero_range(struct inode *inode, + loff_t offset, + loff_t len, + const int mode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_map *em; + struct extent_changeset *data_reserved = NULL; + int ret; + u64 alloc_hint = 0; + const u64 sectorsize = btrfs_inode_sectorsize(inode); + u64 alloc_start = round_down(offset, sectorsize); + u64 alloc_end = round_up(offset + len, sectorsize); + u64 bytes_to_reserve = 0; + bool space_reserved = false; + + inode_dio_wait(inode); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, alloc_end - alloc_start, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + /* + * Avoid hole punching and extent allocation for some cases. More cases + * could be considered, but these are unlikely common and we keep things + * as simple as possible for now. Also, intentionally, if the target + * range contains one or more prealloc extents together with regular + * extents and holes, we drop all the existing extents and allocate a + * new prealloc extent, so that we get a larger contiguous disk extent. + */ + if (em->start <= alloc_start && + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + const u64 em_end = em->start + em->len; + + if (em_end >= offset + len) { + /* + * The whole range is already a prealloc extent, + * do nothing except updating the inode's i_size if + * needed. + */ + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + /* + * Part of the range is already a prealloc extent, so operate + * only on the remaining part of the range. + */ + alloc_start = em_end; + ASSERT(IS_ALIGNED(alloc_start, sectorsize)); + len = offset + len - alloc_start; + offset = alloc_start; + alloc_hint = em->block_start + em->len; + } + free_extent_map(em); + + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, sectorsize, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { + free_extent_map(em); + ret = btrfs_truncate_block(inode, offset, len, 0); + if (!ret) + ret = btrfs_fallocate_update_isize(inode, + offset + len, + mode); + return ret; + } + free_extent_map(em); + alloc_start = round_down(offset, sectorsize); + alloc_end = alloc_start + sectorsize; + goto reserve_space; + } + + alloc_start = round_up(offset, sectorsize); + alloc_end = round_down(offset + len, sectorsize); + + /* + * For unaligned ranges, check the pages at the boundaries, they might + * map to an extent, in which case we need to partially zero them, or + * they might map to a hole, in which case we need our allocation range + * to cover them. + */ + if (!IS_ALIGNED(offset, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, offset); + if (ret < 0) + goto out; + if (ret == RANGE_BOUNDARY_HOLE) { + alloc_start = round_down(offset, sectorsize); + ret = 0; + } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { + ret = btrfs_truncate_block(inode, offset, 0, 0); + if (ret) + goto out; + } else { + ret = 0; + } + } + + if (!IS_ALIGNED(offset + len, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, + offset + len); + if (ret < 0) + goto out; + if (ret == RANGE_BOUNDARY_HOLE) { + alloc_end = round_up(offset + len, sectorsize); + ret = 0; + } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { + ret = btrfs_truncate_block(inode, offset + len, 0, 1); + if (ret) + goto out; + } else { + ret = 0; + } + } + +reserve_space: + if (alloc_start < alloc_end) { + struct extent_state *cached_state = NULL; + const u64 lockstart = alloc_start; + const u64 lockend = alloc_end - 1; + + bytes_to_reserve = alloc_end - alloc_start; + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + bytes_to_reserve); + if (ret < 0) + goto out; + space_reserved = true; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + alloc_start, bytes_to_reserve); + if (ret) + goto out; + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) + goto out; + ret = btrfs_prealloc_file_range(inode, mode, alloc_start, + alloc_end - alloc_start, + i_blocksize(inode), + offset + len, &alloc_hint); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); + /* btrfs_prealloc_file_range releases reserved space on error */ + if (ret) { + space_reserved = false; + goto out; + } + } + ret = btrfs_fallocate_update_isize(inode, offset + len, mode); + out: + if (ret && space_reserved) + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, bytes_to_reserve); + extent_changeset_free(data_reserved); + + return ret; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2831,7 +3081,8 @@ static long btrfs_fallocate(struct file *file, int mode, cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -2842,10 +3093,12 @@ static long btrfs_fallocate(struct file *file, int mode, * * For qgroup space, it will be checked later. */ - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + alloc_end - alloc_start); + if (ret < 0) + return ret; + } inode_lock(inode); @@ -2887,6 +3140,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = btrfs_zero_range(inode, offset, len, mode); + inode_unlock(inode); + return ret; + } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2896,15 +3155,15 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); + ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + if (ordered && ordered->file_offset + ordered->len > alloc_start && ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_KERNEL); + &cached_state); /* * we can't wait on the range with the transaction * running or with the extent lock held @@ -2922,7 +3181,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - while (1) { + while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR(em)) { @@ -2958,8 +3217,6 @@ static long btrfs_fallocate(struct file *file, int mode, } free_extent_map(em); cur_offset = last_byte; - if (cur_offset >= alloc_end) - break; } /* @@ -2982,37 +3239,18 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) goto out_unlock; - if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = current_time(inode); - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans); - else - ret = btrfs_end_transaction(trans); - } - } + /* + * We didn't need to allocate any more space, but we still extended the + * size of the file so we need to update i_size and the inode item. + */ + ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_KERNEL); + &cached_state); out: inode_unlock(inode); /* Let go of our reservation. */ - if (ret != 0) + if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, alloc_start, alloc_end - cur_offset); extent_changeset_free(data_reserved); @@ -3081,7 +3319,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) *offset = min_t(loff_t, start, inode->i_size); } unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state, GFP_NOFS); + &cached_state); return ret; } @@ -3145,7 +3383,7 @@ void btrfs_auto_defrag_exit(void) kmem_cache_destroy(btrfs_inode_defrag_cachep); } -int btrfs_auto_defrag_init(void) +int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4426d1c73e50..a9f22ac50d6a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -22,6 +22,7 @@ #include <linux/slab.h> #include <linux/math64.h> #include <linux/ratelimit.h> +#include <linux/error-injection.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -332,6 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return 0; } +ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { @@ -993,8 +995,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@ -1008,7 +1009,7 @@ update_cache_item(struct btrfs_trans_handle *trans, clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); + NULL); btrfs_release_path(path); goto fail; } @@ -1105,8 +1106,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); return ret; } @@ -1127,8 +1127,7 @@ cleanup_write_cache_enospc(struct inode *inode, { io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, cached_state, - GFP_NOFS); + i_size_read(inode) - 1, cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1322,7 +1321,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + i_size_read(inode) - 1, &cached_state); /* * at this point the pages are under IO and we're happy, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e1a7f3cb5be9..53ca025655fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,6 +43,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/uio.h> #include <linux/magic.h> +#include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -536,9 +537,14 @@ again: * * If the compression fails for any reason, we set the pages * dirty again later on. + * + * Note that the remaining part is redirtied, the start pointer + * has moved, the end is the original one. */ - extent_range_clear_dirty_for_io(inode, start, end); - redirty = 1; + if (!redirty) { + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; + } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( @@ -765,11 +771,10 @@ retry: * all those pages down to the drive. */ if (!page_started && !ret) - extent_write_locked_range(io_tree, - inode, async_extent->start, + extent_write_locked_range(inode, + async_extent->start, async_extent->start + async_extent->ram_size - 1, - btrfs_get_extent, WB_SYNC_ALL); else if (ret) unlock_page(async_cow->locked_page); @@ -1203,7 +1208,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, - 1, 0, NULL, GFP_NOFS); + 1, 0, NULL); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ @@ -1951,7 +1956,21 @@ static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, /* * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read + * on write, or reading the csums from the tree before a read. + * + * Rules about async/sync submit, + * a) read: sync submit + * + * b) write without checksum: sync submit + * + * c) write with checksum: + * c-1) if bio is issued by fsync: sync submit + * (sync_writers != 0) + * + * c-2) if root is reloc root: sync submit + * (only in case of buffered IO) + * + * c-3) otherwise: async submit */ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, @@ -2023,10 +2042,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { - trans->adding_csums = 1; + trans->adding_csums = true; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); - trans->adding_csums = 0; + trans->adding_csums = false; } return 0; } @@ -2082,7 +2101,7 @@ again: PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, - page_end, &cached_state, GFP_NOFS); + page_end, &cached_state); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); @@ -2098,14 +2117,21 @@ again: goto out; } - btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state, - 0); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, + &cached_state, 0); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + ClearPageChecked(page); set_page_dirty(page); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); out_page: unlock_page(page); put_page(page); @@ -2697,7 +2723,7 @@ out_free_path: btrfs_end_transaction(trans); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached, GFP_NOFS); + &cached); iput(inode); return ret; } @@ -2986,7 +3012,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) clear_extent_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + EXTENT_DEFRAG, 0, 0, &cached_state); } if (nolock) @@ -3056,7 +3082,7 @@ out: ordered_extent->len - 1, clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); } if (trans) @@ -3070,7 +3096,7 @@ out: else start = ordered_extent->file_offset; end = ordered_extent->file_offset + ordered_extent->len - 1; - clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + clear_extent_uptodate(io_tree, start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); @@ -3777,7 +3803,8 @@ static int btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); - inode->i_version = btrfs_inode_sequence(leaf, inode_item); + inode_set_iversion_queried(inode, + btrfs_inode_sequence(leaf, inode_item)); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); @@ -3945,7 +3972,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, &token); btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, &token); - btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode), + &token); btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); @@ -4744,8 +4772,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, u64 block_start; u64 block_end; - if ((offset & (blocksize - 1)) == 0 && - (!len || ((len & (blocksize - 1)) == 0))) + if (IS_ALIGNED(offset, blocksize) && + (!len || IS_ALIGNED(len, blocksize))) goto out; block_start = round_down(from, blocksize); @@ -4787,7 +4815,7 @@ again: ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, - &cached_state, GFP_NOFS); + &cached_state); unlock_page(page); put_page(page); btrfs_start_ordered_extent(inode, ordered, 1); @@ -4798,13 +4826,13 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, - &cached_state, GFP_NOFS); + &cached_state); goto out_unlock; } @@ -4823,8 +4851,7 @@ again: } ClearPageChecked(page); set_page_dirty(page); - unlock_extent_cached(io_tree, block_start, block_end, &cached_state, - GFP_NOFS); + unlock_extent_cached(io_tree, block_start, block_end, &cached_state); out_unlock: if (ret) @@ -4925,7 +4952,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (!ordered) break; unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state, GFP_NOFS); + &cached_state); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -4990,8 +5017,7 @@ next: break; } free_extent_map(em); - unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, - GFP_NOFS); + unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); return err; } @@ -5234,8 +5260,7 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); + EXTENT_DEFRAG, 1, 1, &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5894,7 +5919,6 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_private *private = file->private_data; struct btrfs_dir_item *di; @@ -5962,9 +5986,6 @@ again: if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, slot, di)) - goto next; - name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { @@ -6104,19 +6125,20 @@ static int btrfs_update_time(struct inode *inode, struct timespec *now, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; + bool dirty = flags & ~S_VERSION; if (btrfs_root_readonly(root)) return -EROFS; if (flags & S_VERSION) - inode_inc_iversion(inode); + dirty |= inode_maybe_inc_iversion(inode, dirty); if (flags & S_CTIME) inode->i_ctime = *now; if (flags & S_MTIME) inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; - return btrfs_dirty_inode(inode); + return dirty ? btrfs_dirty_inode(inode) : 0; } /* @@ -6297,7 +6319,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, } /* * index_cnt is ignored for everything but a dir, - * btrfs_get_inode_index_count has an explanation for the magic + * btrfs_set_inode_index_count has an explanation for the magic * number */ BTRFS_I(inode)->index_cnt = 2; @@ -6560,7 +6582,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, out_unlock: btrfs_end_transaction(trans); - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); if (drop_inode) { inode_dec_link_count(inode); @@ -6641,7 +6662,6 @@ out_unlock: inode_dec_link_count(inode); iput(inode); } - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return err; @@ -6716,7 +6736,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } - btrfs_balance_delayed_items(fs_info); fail: if (trans) btrfs_end_transaction(trans); @@ -6794,7 +6813,6 @@ out_fail: inode_dec_link_count(inode); iput(inode); } - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return err; @@ -6803,68 +6821,6 @@ out_fail_inode: goto out_fail; } -/* Find next extent map of a given extent map, caller needs to ensure locks */ -static struct extent_map *next_extent_map(struct extent_map *em) -{ - struct rb_node *next; - - next = rb_next(&em->rb_node); - if (!next) - return NULL; - return container_of(next, struct extent_map, rb_node); -} - -static struct extent_map *prev_extent_map(struct extent_map *em) -{ - struct rb_node *prev; - - prev = rb_prev(&em->rb_node); - if (!prev) - return NULL; - return container_of(prev, struct extent_map, rb_node); -} - -/* helper for btfs_get_extent. Given an existing extent in the tree, - * the existing extent is the nearest extent to map_start, - * and an extent that you want to insert, deal with overlap and insert - * the best fitted new extent into the tree. - */ -static int merge_extent_mapping(struct extent_map_tree *em_tree, - struct extent_map *existing, - struct extent_map *em, - u64 map_start) -{ - struct extent_map *prev; - struct extent_map *next; - u64 start; - u64 end; - u64 start_diff; - - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); - - if (existing->start > map_start) { - next = existing; - prev = prev_extent_map(next); - } else { - prev = existing; - next = next_extent_map(prev); - } - - start = prev ? extent_map_end(prev) : em->start; - start = max_t(u64, start, em->start); - end = next ? next->start : extent_map_end(em); - end = min_t(u64, end, extent_map_end(em)); - start_diff = start - em->start; - em->start = start; - em->len = end - start; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - em->block_start += start_diff; - em->block_len -= start_diff; - } - return add_extent_mapping(em_tree, em, 0); -} - static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, size_t pg_offset, u64 extent_offset, @@ -6939,10 +6895,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_trans_handle *trans = NULL; const bool new_inline = !page || create; -again: read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) @@ -6981,8 +6935,7 @@ again: path->reada = READA_FORWARD; } - ret = btrfs_lookup_file_extent(trans, root, path, - objectid, start, trans != NULL); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { err = ret; goto out; @@ -7083,7 +7036,7 @@ next: em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; - if (create == 0 && !PageUptodate(page)) { + if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, @@ -7104,25 +7057,6 @@ next: kunmap(page); } flush_dcache_page(page); - } else if (create && PageUptodate(page)) { - BUG(); - if (!trans) { - kunmap(page); - free_extent_map(em); - em = NULL; - - btrfs_release_path(path); - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) - return ERR_CAST(trans); - goto again; - } - map = kmap(page); - write_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - kunmap(page); - btrfs_mark_buffer_dirty(leaf); } set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); @@ -7134,7 +7068,6 @@ not_found: em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; - set_bit(EXTENT_FLAG_VACANCY, &em->flags); insert: btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { @@ -7147,62 +7080,13 @@ insert: err = 0; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - /* it is possible that someone inserted the extent into the tree - * while we had the lock dropped. It is also possible that - * an overlapping map exists in the tree - */ - if (ret == -EEXIST) { - struct extent_map *existing; - - ret = 0; - - existing = search_extent_mapping(em_tree, start, len); - /* - * existing will always be non-NULL, since there must be - * extent causing the -EEXIST. - */ - if (existing->start == em->start && - extent_map_end(existing) >= extent_map_end(em) && - em->block_start == existing->block_start) { - /* - * The existing extent map already encompasses the - * entire extent map we tried to add. - */ - free_extent_map(em); - em = existing; - err = 0; - - } else if (start >= extent_map_end(existing) || - start <= existing->start) { - /* - * The existing extent map is the one nearest to - * the [start, start + len) range which overlaps - */ - err = merge_extent_mapping(em_tree, existing, - em, start); - free_extent_map(existing); - if (err) { - free_extent_map(em); - em = NULL; - } - } else { - free_extent_map(em); - em = existing; - err = 0; - } - } + err = btrfs_add_extent_mapping(em_tree, &em, start, len); write_unlock(&em_tree->lock); out: trace_btrfs_get_extent(root, inode, em); btrfs_free_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - if (!err) - err = ret; - } if (err) { free_extent_map(em); return ERR_PTR(err); @@ -7324,7 +7208,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em->block_start = EXTENT_MAP_DELALLOC; em->block_len = found; } - } else if (hole_em) { + } else { return hole_em; } out: @@ -7641,7 +7525,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state, GFP_NOFS); + cached_state); if (ordered) { /* @@ -7926,7 +7810,7 @@ unlock: if (lockstart < lockend) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); + &cached_state); } else { free_extent_state(cached_state); } @@ -7937,7 +7821,7 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state, GFP_NOFS); + unlock_bits, 1, 0, &cached_state); err: if (dio_data) current->journal_info = dio_data; @@ -7953,15 +7837,12 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode, BUG_ON(bio_op(bio) == REQ_OP_WRITE); - bio_get(bio); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); if (ret) - goto err; + return ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); -err: - bio_put(bio); + return ret; } @@ -8015,6 +7896,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, int segs; int ret; blk_status_t status; + struct bio_vec bvec; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -8030,8 +7912,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, } segs = bio_segments(failed_bio); + bio_get_first_bvec(failed_bio, &bvec); if (segs > 1 || - (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) + (bvec.bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; @@ -8074,7 +7957,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) ASSERT(bio->bi_vcnt == 1); io_tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); + ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -8164,7 +8047,7 @@ static void btrfs_retry_endio(struct bio *bio) uptodate = 1; ASSERT(bio->bi_vcnt == 1); - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); io_tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -8460,11 +8343,10 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, bool write = bio_op(bio) == REQ_OP_WRITE; blk_status_t ret; + /* Check btrfs_submit_bio_hook() for rules about async submit. */ if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - bio_get(bio); - if (!write) { ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) @@ -8497,7 +8379,6 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, map: ret = btrfs_map_bio(fs_info, bio, 0, 0); err: - bio_put(bio); return ret; } @@ -8854,7 +8735,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); + return extent_fiemap(inode, fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) @@ -8866,7 +8747,6 @@ int btrfs_readpage(struct file *file, struct page *page) static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { - struct extent_io_tree *tree; struct inode *inode = page->mapping->host; int ret; @@ -8885,8 +8765,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } - tree = &BTRFS_I(page->mapping->host)->io_tree; - ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(page, wbc); btrfs_add_delayed_iput(inode); return ret; } @@ -8897,7 +8776,7 @@ static int btrfs_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; - return extent_writepages(tree, mapping, btrfs_get_extent, wbc); + return extent_writepages(tree, mapping, wbc); } static int @@ -8906,8 +8785,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping, { struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; - return extent_readpages(tree, mapping, pages, nr_pages, - btrfs_get_extent); + return extent_readpages(tree, mapping, pages, nr_pages); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { @@ -8978,8 +8856,7 @@ again: EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, - GFP_NOFS); + EXTENT_DEFRAG, 1, 0, &cached_state); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -9036,7 +8913,7 @@ again: EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); + &cached_state); __btrfs_releasepage(page, GFP_NOFS); } @@ -9137,7 +9014,7 @@ again: PAGE_SIZE); if (ordered) { unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); @@ -9164,13 +9041,13 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } @@ -9196,7 +9073,7 @@ again: BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; - unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state); out_unlock: if (!ret) { @@ -9421,7 +9298,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -9573,7 +9450,7 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_free_space_cachep); } -int btrfs_init_cachep(void) +int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, @@ -10688,7 +10565,6 @@ out: btrfs_end_transaction(trans); if (ret) iput(inode); - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2ef8acaac688..111ee282b777 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include <linux/uuid.h> #include <linux/btrfs.h> #include <linux/uaccess.h> +#include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -307,12 +308,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; - if (fs_info->compress_type == BTRFS_COMPRESS_LZO) - comp = "lzo"; - else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB) - comp = "zlib"; - else - comp = "zstd"; + comp = btrfs_compress_type2str(fs_info->compress_type); + if (!comp || comp[0] == 0) + comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); + ret = btrfs_set_prop(inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) @@ -979,7 +978,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); - unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); + unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1130,7 +1129,7 @@ again: ordered = btrfs_lookup_ordered_extent(inode, page_start); unlock_extent_cached(tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); if (!ordered) break; @@ -1190,7 +1189,7 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - &cached_state, GFP_NOFS); + &cached_state); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1206,8 +1205,7 @@ again: &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state, - GFP_NOFS); + page_start, page_end - 1, &cached_state); for (i = 0; i < i_done; i++) { clear_page_dirty_for_io(pages[i]); @@ -1503,7 +1501,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - if (!device->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_info(fs_info, "resizer unable to apply on readonly device %llu", devid); @@ -1528,7 +1526,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } } - if (device->is_tgtdev_for_dev_replace) { + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -EPERM; goto out_free; } @@ -2675,14 +2673,12 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - mutex_lock(&fs_info->volume_mutex); if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); } else { vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { @@ -2726,9 +2722,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&fs_info->volume_mutex); ret = btrfs_rm_device(fs_info, vol_args->name, 0); - mutex_unlock(&fs_info->volume_mutex); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); @@ -2753,16 +2747,16 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, if (!fi_args) return -ENOMEM; - mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; - memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); + memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; @@ -2779,7 +2773,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, { struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int ret = 0; char *s_uuid = NULL; @@ -2790,7 +2783,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, if (!btrfs_is_empty_uuid(di_args->uuid)) s_uuid = di_args->uuid; - mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); if (!dev) { @@ -2805,17 +2798,15 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, if (dev->name) { struct rcu_string *name; - rcu_read_lock(); name = rcu_dereference(dev->name); - strncpy(di_args->path, name->str, sizeof(di_args->path)); - rcu_read_unlock(); + strncpy(di_args->path, name->str, sizeof(di_args->path) - 1); di_args->path[sizeof(di_args->path) - 1] = 0; } else { di_args->path[0] = '\0'; } out: - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ret = -EFAULT; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f6a05f836629..b30a056963ab 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -164,7 +164,6 @@ static int iterate_object_props(struct btrfs_root *root, size_t), void *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *name_buf = NULL; char *value_buf = NULL; @@ -215,12 +214,6 @@ static int iterate_object_props(struct btrfs_root *root, name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; - if (verify_dir_item(fs_info, leaf, - path->slots[0], di)) { - ret = -EIO; - goto out; - } - if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, @@ -430,11 +423,11 @@ static const char *prop_compression_extract(struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { case BTRFS_COMPRESS_ZLIB: - return "zlib"; case BTRFS_COMPRESS_LZO: - return "lzo"; case BTRFS_COMPRESS_ZSTD: - return "zstd"; + return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + default: + break; } return NULL; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 168fd03ca3ac..9e61dd624f7b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2883,8 +2883,7 @@ cleanup: ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, - unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, - GFP_NOFS); + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); extent_changeset_release(reserved); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a7f79254ecca..dec0907dfb8a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -231,7 +231,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); - init_waitqueue_head(&cur->wait); } x = cmpxchg(&info->stripe_hash_table, NULL, table); @@ -595,14 +594,31 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, * bio list here, anyone else that wants to * change this stripe needs to do their own rmw. */ - if (last->operation == BTRFS_RBIO_PARITY_SCRUB || - cur->operation == BTRFS_RBIO_PARITY_SCRUB) + if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING || - cur->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING) return 0; + if (last->operation == BTRFS_RBIO_READ_REBUILD) { + int fa = last->faila; + int fb = last->failb; + int cur_fa = cur->faila; + int cur_fb = cur->failb; + + if (last->faila >= last->failb) { + fa = last->failb; + fb = last->faila; + } + + if (cur->faila >= cur->failb) { + cur_fa = cur->failb; + cur_fb = cur->faila; + } + + if (fa != cur_fa || fb != cur_fb) + return 0; + } return 1; } @@ -670,7 +686,6 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; unsigned long flags; - DEFINE_WAIT(wait); struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; @@ -816,15 +831,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } goto done_nolock; - /* - * The barrier for this waitqueue_active is not needed, - * we're protected by h->lock and can't miss a wakeup. - */ - } else if (waitqueue_active(&h->wait)) { - spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); - wake_up(&h->wait); - goto done_nolock; } } done: @@ -858,10 +864,17 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) kfree(rbio); } -static void free_raid_bio(struct btrfs_raid_bio *rbio) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { - unlock_stripe(rbio); - __free_raid_bio(rbio); + struct bio *next; + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + cur->bi_status = err; + bio_endio(cur); + cur = next; + } } /* @@ -871,20 +884,26 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); - struct bio *next; + struct bio *extra; if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); - free_raid_bio(rbio); + /* + * At this moment, rbio->bio_list is empty, however since rbio does not + * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the + * hash list, rbio may be merged with others so that rbio->bio_list + * becomes non-empty. + * Once unlock_stripe() is done, rbio->bio_list will not be updated any + * more and we can call bio_endio() on all queued bios. + */ + unlock_stripe(rbio); + extra = bio_list_get(&rbio->bio_list); + __free_raid_bio(rbio); - while (cur) { - next = cur->bi_next; - cur->bi_next = NULL; - cur->bi_status = err; - bio_endio(cur); - cur = next; - } + rbio_endio_bio_list(cur, err); + if (extra) + rbio_endio_bio_list(extra, err); } /* @@ -1435,14 +1454,13 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct bio *bio) { - struct bio_vec bvec; - struct bvec_iter iter; + struct bio_vec *bvec; + int i; - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_io_bio(bio)->iter; + ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment(bvec, bio, iter) - SetPageUptodate(bvec.bv_page); + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); } /* @@ -1969,7 +1987,22 @@ cleanup: cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == BLK_STS_OK) + /* + * - In case of two failures, where rbio->failb != -1: + * + * Do not cache this rbio since the above read reconstruction + * (raid6_datap_recov() or raid6_2data_recov()) may have + * changed some content of stripes which are not identical to + * on-disk content any more, otherwise, a later write/recover + * may steal stripe_pages from this rbio and end up with + * corruptions or rebuild failures. + * + * - In case of single failure, where rbio->failb == -1: + * + * Cache this rbio iff the above read reconstruction is + * excuted without problems. + */ + if (err == BLK_STS_OK && rbio->failb < 0) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -2170,11 +2203,21 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, } /* - * reconstruct from the q stripe if they are - * asking for mirror 3 + * Loop retry: + * for 'mirror == 2', reconstruct from all other stripes. + * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num == 3) - rbio->failb = rbio->real_stripes - 2; + if (mirror_num > 2) { + /* + * 'mirror == 3' is to fail the p stripe and + * reconstruct from the q stripe. 'mirror > 3' is to + * fail a data stripe and reconstruct from p+q stripe. + */ + rbio->failb = rbio->real_stripes - (mirror_num - 1); + ASSERT(rbio->failb > 0); + if (rbio->failb <= rbio->faila) + rbio->failb--; + } ret = lock_stripe_add(rbio); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 34878699d363..171f3cce30e6 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -606,8 +606,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, } /* Walk up to the next node that needs to be processed */ -static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) +static int walk_up_tree(struct btrfs_path *path, int *level) { int l; @@ -984,7 +983,6 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; - struct btrfs_root *root; struct extent_buffer *eb; u64 bytenr = 0, num_bytes = 0; int ret, level; @@ -1014,7 +1012,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) &bytenr, &num_bytes); if (ret) break; - ret = walk_up_tree(root, path, &level); + ret = walk_up_tree(path, &level); if (ret < 0) break; if (ret > 0) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3338407ef0f0..aab0194efe46 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -387,13 +387,6 @@ again: WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); - ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr, - name_len); - if (!ret) { - err = -EIO; - goto out; - } - WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); *sequence = btrfs_root_ref_sequence(leaf, ref); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b2f871d80982..ec56f33feea9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -301,6 +301,11 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); +static inline int scrub_is_page_on_raid56(struct scrub_page *page) +{ + return page->recover && + (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); +} static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { @@ -1323,15 +1328,34 @@ nodatasum_case: * could happen otherwise that a correct page would be * overwritten by a bad one). */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { + for (mirror_index = 0; ;mirror_index++) { struct scrub_block *sblock_other; if (mirror_index == failed_mirror_index) continue; - sblock_other = sblocks_for_recheck + mirror_index; + + /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ + if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (mirror_index >= BTRFS_MAX_MIRRORS) + break; + if (!sblocks_for_recheck[mirror_index].page_count) + break; + + sblock_other = sblocks_for_recheck + mirror_index; + } else { + struct scrub_recover *r = sblock_bad->pagev[0]->recover; + int max_allowed = r->bbio->num_stripes - + r->bbio->num_tgtdevs; + + if (mirror_index >= max_allowed) + break; + if (!sblocks_for_recheck[1].page_count) + break; + + ASSERT(failed_mirror_index == 0); + sblock_other = sblocks_for_recheck + 1; + sblock_other->pagev[0]->mirror_num = 1 + mirror_index; + } /* build and submit the bios, check checksums */ scrub_recheck_block(fs_info, sblock_other, 0); @@ -1666,49 +1690,32 @@ leave_nomem: return 0; } -struct scrub_bio_ret { - struct completion event; - blk_status_t status; -}; - static void scrub_bio_wait_endio(struct bio *bio) { - struct scrub_bio_ret *ret = bio->bi_private; - - ret->status = bio->bi_status; - complete(&ret->event); -} - -static inline int scrub_is_page_on_raid56(struct scrub_page *page) -{ - return page->recover && - (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + complete(bio->bi_private); } static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, struct scrub_page *page) { - struct scrub_bio_ret done; + DECLARE_COMPLETION_ONSTACK(done); int ret; + int mirror_num; - init_completion(&done.event); - done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; + mirror_num = page->sblock->pagev[0]->mirror_num; ret = raid56_parity_recover(fs_info, bio, page->recover->bbio, page->recover->map_length, - page->mirror_num, 0); + mirror_num, 0); if (ret) return ret; - wait_for_completion_io(&done.event); - if (done.status) - return -EIO; - - return 0; + wait_for_completion_io(&done); + return blk_status_to_errno(bio->bi_status); } /* @@ -2535,7 +2542,7 @@ leave_nomem: } WARN_ON(sblock->page_count == 0); - if (dev->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { /* * This case should only be hit for RAID 5/6 device replace. See * the comment in scrub_missing_raid56_pages() for details. @@ -2870,7 +2877,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; - if (dev->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { scrub_parity_mark_sectors_error(sparity, logical, len); return 0; } @@ -4112,12 +4119,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!dev || (dev->missing && !is_dev_replace)) { + if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && + !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -ENODEV; } - if (!is_dev_replace && !readonly && !dev->writeable) { + if (!is_dev_replace && !readonly && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); rcu_read_lock(); name = rcu_dereference(dev->name); @@ -4128,14 +4137,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } mutex_lock(&fs_info->scrub_lock); - if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -EIO; } btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (dev->scrub_device || + if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -4160,7 +4170,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return PTR_ERR(sctx); } sctx->readonly = readonly; - dev->scrub_device = sctx; + dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* @@ -4195,7 +4205,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_lock(&fs_info->scrub_lock); - dev->scrub_device = NULL; + dev->scrub_ctx = NULL; scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); @@ -4252,16 +4262,16 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); - sctx = dev->scrub_device; + sctx = dev->scrub_ctx; if (!sctx) { mutex_unlock(&fs_info->scrub_lock); return -ENOTCONN; } atomic_inc(&sctx->cancel_req); - while (dev->scrub_device) { + while (dev->scrub_ctx) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, - dev->scrub_device == NULL); + dev->scrub_ctx == NULL); mutex_lock(&fs_info->scrub_lock); } mutex_unlock(&fs_info->scrub_lock); @@ -4278,7 +4288,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (dev) - sctx = dev->scrub_device; + sctx = dev->scrub_ctx; if (sctx) memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4478,8 +4488,7 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, free_extent_map(em); out_unlock: - unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, - GFP_NOFS); + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state); return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 20d3300bd268..f306c608dc28 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1059,12 +1059,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } } - ret = btrfs_is_name_len_valid(eb, path->slots[0], - (unsigned long)(di + 1), name_len + data_len); - if (!ret) { - ret = -EIO; - goto out; - } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a4dce153645..6e71a2a78363 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -61,12 +61,21 @@ #include "tests/btrfs-tests.h" #include "qgroup.h" -#include "backref.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; + +/* + * Types for mounting the default subvolume and a subvolume explicitly + * requested by subvol=/path. That way the callchain is straightforward and we + * don't have to play tricks with the mount options and recursive calls to + * btrfs_mount. + * + * The new btrfs_root_fs_type also servers as a tag for the bdev_holder. + */ static struct file_system_type btrfs_fs_type; +static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); @@ -98,30 +107,6 @@ const char *btrfs_decode_error(int errno) return errstr; } -/* btrfs handle error by forcing the filesystem readonly */ -static void btrfs_handle_error(struct btrfs_fs_info *fs_info) -{ - struct super_block *sb = fs_info->sb; - - if (sb_rdonly(sb)) - return; - - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - sb->s_flags |= SB_RDONLY; - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not - * canceled here although there is no way to update - * the progress. It would add the risk of a deadlock, - * therefore the canceling is omitted. The only penalty - * is that some I/O remains active until the procedure - * completes. The next time when the filesystem is - * mounted writeable again, the device replace - * operation continues. - */ - } -} - /* * __btrfs_handle_fs_error decodes expected errors from the caller and * invokes the approciate error response. @@ -168,8 +153,23 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* Don't go through full error handling during mount */ - if (sb->s_flags & SB_BORN) - btrfs_handle_error(fs_info); + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + /* btrfs handle error by forcing the filesystem readonly */ + sb->s_flags |= SB_RDONLY; + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writeable + * again, the device replace operation continues. + */ } #ifdef CONFIG_PRINTK @@ -405,7 +405,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags) { substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig = NULL; + char *p, *num; u64 cache_gen; int intarg; int ret = 0; @@ -428,16 +428,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, if (!options) goto check; - /* - * strsep changes the string, duplicate it because parse_options - * gets called twice - */ - options = kstrdup(options, GFP_KERNEL); - if (!options) - return -ENOMEM; - - orig = options; - while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) @@ -454,7 +444,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_subvolrootid: case Opt_device: /* - * These are parsed by btrfs_parse_early_options + * These are parsed by btrfs_parse_subvol_options + * and btrfs_parse_early_options * and can be happily ignored here. */ break; @@ -877,7 +868,6 @@ out: btrfs_info(info, "disk space caching is enabled"); if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) btrfs_info(info, "using free space tree"); - kfree(orig); return ret; } @@ -888,11 +878,60 @@ out: * only when we need to allocate a new super block. */ static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, char **subvol_name, u64 *subvol_objectid, - struct btrfs_fs_devices **fs_devices) + void *holder, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; + int error = 0; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because btrfs_parse_options + * gets called later + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + if (token == Opt_device) { + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, + flags, holder, fs_devices); + kfree(device_name); + if (error) + goto out; + } + } + +out: + kfree(orig); + return error; +} + +/* + * Parse mount options that are related to subvolume id + * + * The value is later passed to mount_subvol() + */ +static int btrfs_parse_subvol_options(const char *options, fmode_t flags, + char **subvol_name, u64 *subvol_objectid) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *orig, *p; char *num = NULL; int error = 0; @@ -900,8 +939,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, return 0; /* - * strsep changes the string, duplicate it because parse_options - * gets called twice + * strsep changes the string, duplicate it because + * btrfs_parse_early_options gets called later */ opts = kstrdup(options, GFP_KERNEL); if (!opts) @@ -940,18 +979,6 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, case Opt_subvolrootid: pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); break; - case Opt_device: - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - error = btrfs_scan_one_device(device_name, - flags, holder, fs_devices); - kfree(device_name); - if (error) - goto out; - break; default: break; } @@ -1243,7 +1270,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); - char *compress_type; + const char *compress_type; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1259,12 +1286,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); if (btrfs_test_opt(info, COMPRESS)) { - if (info->compress_type == BTRFS_COMPRESS_ZLIB) - compress_type = "zlib"; - else if (info->compress_type == BTRFS_COMPRESS_LZO) - compress_type = "lzo"; - else - compress_type = "zstd"; + compress_type = btrfs_compress_type2str(info->compress_type); if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else @@ -1365,86 +1387,12 @@ static inline int is_subvolume_inode(struct inode *inode) return 0; } -/* - * This will add subvolid=0 to the argument string while removing any subvol= - * and subvolid= arguments to make sure we get the top-level root for path - * walking to the subvol we want. - */ -static char *setup_root_args(char *args) -{ - char *buf, *dst, *sep; - - if (!args) - return kstrdup("subvolid=0", GFP_KERNEL); - - /* The worst case is that we add ",subvolid=0" to the end. */ - buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, - GFP_KERNEL); - if (!buf) - return NULL; - - while (1) { - sep = strchrnul(args, ','); - if (!strstarts(args, "subvol=") && - !strstarts(args, "subvolid=")) { - memcpy(dst, args, sep - args); - dst += sep - args; - *dst++ = ','; - } - if (*sep) - args = sep + 1; - else - break; - } - strcpy(dst, "subvolid=0"); - - return buf; -} - static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, - int flags, const char *device_name, - char *data) + const char *device_name, struct vfsmount *mnt) { struct dentry *root; - struct vfsmount *mnt = NULL; - char *newargs; int ret; - newargs = setup_root_args(data); - if (!newargs) { - root = ERR_PTR(-ENOMEM); - goto out; - } - - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & SB_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY, - device_name, newargs); - } else { - mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY, - device_name, newargs); - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - - down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); - up_write(&mnt->mnt_sb->s_umount); - if (ret < 0) { - root = ERR_PTR(ret); - goto out; - } - } - } - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - if (!subvol_name) { if (!subvol_objectid) { ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), @@ -1500,7 +1448,6 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, out: mntput(mnt); - kfree(newargs); kfree(subvol_name); return root; } @@ -1558,11 +1505,11 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, /* * Find a superblock for the given device / mount point. * - * Note: This is based on get_sb_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. + * Note: This is based on mount_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) +static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, + int flags, const char *device_name, void *data) { struct block_device *bdev = NULL; struct super_block *s; @@ -1570,27 +1517,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; fmode_t mode = FMODE_READ; - char *subvol_name = NULL; - u64 subvol_objectid = 0; int error = 0; if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, - &subvol_name, &subvol_objectid, &fs_devices); if (error) { - kfree(subvol_name); return ERR_PTR(error); } - if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { - /* mount_subvol() will free subvol_name. */ - return mount_subvol(subvol_name, subvol_objectid, flags, - device_name, data); - } - security_init_mnt_opts(&new_sec_opts); if (data) { error = parse_security_options(data, &new_sec_opts); @@ -1674,6 +1611,84 @@ error_sec_opts: return ERR_PTR(error); } +/* + * Mount function which is called by VFS layer. + * + * In order to allow mounting a subvolume directly, btrfs uses mount_subtree() + * which needs vfsmount* of device's root (/). This means device's root has to + * be mounted internally in any case. + * + * Operation flow: + * 1. Parse subvol id related options for later use in mount_subvol(). + * + * 2. Mount device's root (/) by calling vfs_kern_mount(). + * + * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the + * first place. In order to avoid calling btrfs_mount() again, we use + * different file_system_type which is not registered to VFS by + * register_filesystem() (btrfs_root_fs_type). As a result, + * btrfs_mount_root() is called. The return value will be used by + * mount_subtree() in mount_subvol(). + * + * 3. Call mount_subvol() to get the dentry of subvolume. Since there is + * "btrfs subvolume set-default", mount_subvol() is called always. + */ +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct vfsmount *mnt_root; + struct dentry *root; + fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + int error = 0; + + if (!(flags & SB_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_subvol_options(data, mode, + &subvol_name, &subvol_objectid); + if (error) { + kfree(subvol_name); + return ERR_PTR(error); + } + + /* mount device's root (/) */ + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); + if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { + if (flags & SB_RDONLY) { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, + flags & ~SB_RDONLY, device_name, data); + } else { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, + flags | SB_RDONLY, device_name, data); + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + goto out; + } + + down_write(&mnt_root->mnt_sb->s_umount); + error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); + up_write(&mnt_root->mnt_sb->s_umount); + if (error < 0) { + root = ERR_PTR(error); + mntput(mnt_root); + goto out; + } + } + } + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + goto out; + } + + /* mount_subvol() will free subvol_name and mnt_root */ + root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root); + +out: + return root; +} + static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, int new_pool_size, int old_pool_size) { @@ -1820,7 +1835,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } - if (!btrfs_check_rw_degradable(fs_info)) { + if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; @@ -1972,8 +1987,10 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev || - device->is_tgtdev_for_dev_replace) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + !device->bdev || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (i >= nr_devices) @@ -2174,6 +2191,15 @@ static struct file_system_type btrfs_fs_type = { .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; + +static struct file_system_type btrfs_root_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = btrfs_mount_root, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, +}; + MODULE_ALIAS_FS("btrfs"); static int btrfs_control_open(struct inode *inode, struct file *file) @@ -2207,11 +2233,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_fs_type, &fs_devices); + &btrfs_root_fs_type, &fs_devices); break; case BTRFS_IOC_DEVICES_READY: ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_fs_type, &fs_devices); + &btrfs_root_fs_type, &fs_devices); if (ret) break; ret = !(fs_devices->num_devices == fs_devices->total_devices); @@ -2269,7 +2295,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) while (cur_devices) { head = &cur_devices->devices; list_for_each_entry(dev, head, dev_list) { - if (dev->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->name) continue; @@ -2324,7 +2350,7 @@ static struct miscdevice btrfs_misc = { MODULE_ALIAS_MISCDEV(BTRFS_MINOR); MODULE_ALIAS("devname:btrfs-control"); -static int btrfs_interface_init(void) +static int __init btrfs_interface_init(void) { return misc_register(&btrfs_misc); } @@ -2334,7 +2360,7 @@ static void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void btrfs_print_mod_info(void) +static void __init btrfs_print_mod_info(void) { pr_info("Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a28bba801264..a8bafed931f4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -897,7 +897,7 @@ static int btrfs_init_debugfs(void) return 0; } -int btrfs_init_sysfs(void) +int __init btrfs_init_sysfs(void) { int ret; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d3f25376a0f8..9786d8cd0aa6 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -277,6 +277,9 @@ int btrfs_run_sanity_tests(void) goto out; } } + ret = btrfs_test_extent_map(); + if (ret) + goto out; out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 266f1e3d1784..bc0615bac3cc 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -33,6 +33,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_map(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c new file mode 100644 index 000000000000..70c993f01670 --- /dev/null +++ b/fs/btrfs/tests/extent-map-tests.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2017 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/types.h> +#include "btrfs-tests.h" +#include "../ctree.h" + +static void free_extent_map_tree(struct extent_map_tree *em_tree) +{ + struct extent_map *em; + struct rb_node *node; + + while (!RB_EMPTY_ROOT(&em_tree->map)) { + node = rb_first(&em_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + remove_extent_mapping(em_tree, em); + +#ifdef CONFIG_BTRFS_DEBUG + if (refcount_read(&em->refs) != 1) { + test_msg( +"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n", + em->start, em->len, em->block_start, + em->block_len, refcount_read(&em->refs)); + + refcount_set(&em->refs, 1); + } +#endif + free_extent_map(em); + } +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet, there is a file + * extent [0, 16K), followed by another file extent [16K, 20K), two dio reads + * are entering btrfs_get_extent() concurrently, t1 is reading [8K, 16K), t2 is + * reading [0, 8K) + * + * t1 t2 + * btrfs_get_extent() btrfs_get_extent() + * -> lookup_extent_mapping() ->lookup_extent_mapping() + * -> add_extent_mapping(0, 16K) + * -> return em + * ->add_extent_mapping(0, 16K) + * -> #handle -EEXIST + */ +static void test_case_1(struct extent_map_tree *em_tree) +{ + struct extent_map *em; + u64 start = 0; + u64 len = SZ_8K; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip the test on error. */ + return; + + /* Add [0, 16K) */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_16K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + /* Add [16K, 20K) following [0, 16K) */ + em = alloc_extent_map(); + if (!em) + goto out; + + em->start = SZ_16K; + em->len = SZ_4K; + em->block_start = SZ_32K; /* avoid merging */ + em->block_len = SZ_4K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [0, 8K), should return [0, 16K) instead. */ + em->start = start; + em->len = len; + em->block_start = start; + em->block_len = len; + ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + if (ret) + test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret); + if (em && + (em->start != 0 || extent_map_end(em) != SZ_16K || + em->block_start != 0 || em->block_len != SZ_16K)) + test_msg( +"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + start, start + len, ret, em->start, em->len, + em->block_start, em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +/* + * Test scenario: + * + * Reading the inline ending up with EEXIST, ie. read an inline + * extent and discard page cache and read it again. + */ +static void test_case_2(struct extent_map_tree *em_tree) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip the test on error. */ + return; + + /* Add [0, 1K) */ + em->start = 0; + em->len = SZ_1K; + em->block_start = EXTENT_MAP_INLINE; + em->block_len = (u64)-1; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + /* Add [4K, 4K) following [0, 1K) */ + em = alloc_extent_map(); + if (!em) + goto out; + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_4K; + em->block_len = SZ_4K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [0, 1K) */ + em->start = 0; + em->len = SZ_1K; + em->block_start = EXTENT_MAP_INLINE; + em->block_len = (u64)-1; + ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + if (ret) + test_msg("case2 [0 1K]: ret %d\n", ret); + if (em && + (em->start != 0 || extent_map_end(em) != SZ_1K || + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) + test_msg( +"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + ret, em->start, em->len, em->block_start, + em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +static void __test_case_3(struct extent_map_tree *em_tree, u64 start) +{ + struct extent_map *em; + u64 len = SZ_4K; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip this test on error. */ + return; + + /* Add [4K, 8K) */ + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_4K; + em->block_len = SZ_4K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [0, 16K) */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_16K; + ret = btrfs_add_extent_mapping(em_tree, &em, start, len); + if (ret) + test_msg("case3 [0x%llx 0x%llx): ret %d\n", + start, start + len, ret); + /* + * Since bytes within em are contiguous, em->block_start is identical to + * em->start. + */ + if (em && + (start < em->start || start + len > extent_map_end(em) || + em->start != em->block_start || em->len != em->block_len)) + test_msg( +"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + start, start + len, ret, em->start, em->len, + em->block_start, em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet. + * There is a file extent [0, 16K), two jobs are running concurrently + * against it, t1 is buffered writing to [4K, 8K) and t2 is doing dio + * read from [0, 4K) or [8K, 12K) or [12K, 16K). + * + * t1 goes ahead of t2 and adds em [4K, 8K) into tree. + * + * t1 t2 + * cow_file_range() btrfs_get_extent() + * -> lookup_extent_mapping() + * -> add_extent_mapping() + * -> add_extent_mapping() + */ +static void test_case_3(struct extent_map_tree *em_tree) +{ + __test_case_3(em_tree, 0); + __test_case_3(em_tree, SZ_8K); + __test_case_3(em_tree, (12 * 1024ULL)); +} + +static void __test_case_4(struct extent_map_tree *em_tree, u64 start) +{ + struct extent_map *em; + u64 len = SZ_4K; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip this test on error. */ + return; + + /* Add [0K, 8K) */ + em->start = 0; + em->len = SZ_8K; + em->block_start = 0; + em->block_len = SZ_8K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [8K, 24K) */ + em->start = SZ_8K; + em->len = 24 * 1024ULL; + em->block_start = SZ_16K; /* avoid merging */ + em->block_len = 24 * 1024ULL; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + /* Add [0K, 32K) */ + em->start = 0; + em->len = SZ_32K; + em->block_start = 0; + em->block_len = SZ_32K; + ret = btrfs_add_extent_mapping(em_tree, &em, start, len); + if (ret) + test_msg("case4 [0x%llx 0x%llx): ret %d\n", + start, len, ret); + if (em && + (start < em->start || start + len > extent_map_end(em))) + test_msg( +"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + start, len, ret, em->start, em->len, em->block_start, + em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet. + * There is a file extent [0, 32K), two jobs are running concurrently + * against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio + * read from [0, 4K) or [4K, 8K). + * + * t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K 32K). + * + * t1 t2 + * btrfs_get_blocks_direct() btrfs_get_blocks_direct() + * -> btrfs_get_extent() -> btrfs_get_extent() + * -> lookup_extent_mapping() + * -> add_extent_mapping() -> lookup_extent_mapping() + * # load [0, 32K) + * -> btrfs_new_extent_direct() + * -> btrfs_drop_extent_cache() + * # split [0, 32K) + * -> add_extent_mapping() + * # add [8K, 32K) + * -> add_extent_mapping() + * # handle -EEXIST when adding + * # [0, 32K) + */ +static void test_case_4(struct extent_map_tree *em_tree) +{ + __test_case_4(em_tree, 0); + __test_case_4(em_tree, SZ_4K); +} + +int btrfs_test_extent_map() +{ + struct extent_map_tree *em_tree; + + test_msg("Running extent_map tests\n"); + + em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); + if (!em_tree) + /* Skip the test on error. */ + return 0; + + extent_map_tree_init(em_tree); + + test_case_1(em_tree); + test_case_2(em_tree); + test_case_3(em_tree); + test_case_4(em_tree); + + kfree(em_tree); + return 0; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 30affb60da51..13420cd19ef0 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -288,10 +288,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - test_msg("Vacancy flag wasn't set properly\n"); - goto out; - } free_extent_map(em); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); @@ -1001,8 +997,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1070,8 +1065,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1104,8 +1098,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1121,8 +1114,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -1134,7 +1126,6 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) int ret; set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); - set_bit(EXTENT_FLAG_VACANCY, &vacancy_only); set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5a8c2649af2f..04f07144b45c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -495,8 +495,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; - h->use_count++; - WARN_ON(h->use_count > 2); + refcount_inc(&h->use_count); + WARN_ON(refcount_read(&h->use_count) > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; @@ -567,7 +567,7 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; h->root = root; - h->use_count = 1; + refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; h->type = type; @@ -837,8 +837,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int err = 0; int must_run_delayed_refs = 0; - if (trans->use_count > 1) { - trans->use_count--; + if (refcount_read(&trans->use_count) > 1) { + refcount_dec(&trans->use_count); trans->block_rsv = trans->orig_rsv; return 0; } @@ -1016,8 +1016,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * it's safe to do it (through clear_btree_io_tree()). */ err = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); + EXTENT_NEED_WAIT, 0, 0, &cached_state); if (err == -ENOMEM) err = 0; if (!err) @@ -1869,7 +1868,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; DEFINE_WAIT(wait); - WARN_ON(trans->use_count > 1); + WARN_ON(refcount_read(&trans->use_count) > 1); btrfs_abort_transaction(trans, err); @@ -2266,16 +2265,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } ret = write_all_supers(fs_info, 0); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - goto scrub_continue; - } - /* * the super is written, we can safely allow the tree-loggers * to go about their business */ mutex_unlock(&fs_info->tree_log_mutex); + if (ret) + goto scrub_continue; btrfs_finish_extent_commit(trans, fs_info); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c55e44560103..6beee072b1bd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -58,6 +58,7 @@ struct btrfs_transaction { /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; + int aborted; struct list_head list; struct extent_io_tree dirty_pages; unsigned long start_time; @@ -70,7 +71,6 @@ struct btrfs_transaction { struct list_head dirty_bgs; struct list_head io_bgs; struct list_head dropped_roots; - u64 num_dirty_bgs; /* * we need to make sure block group deletion doesn't race with @@ -79,11 +79,11 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; + unsigned int num_dirty_bgs; /* Protected by spin lock fs_info->unused_bgs_lock. */ struct list_head deleted_bgs; spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; - int aborted; struct btrfs_fs_info *fs_info; }; @@ -111,20 +111,19 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - unsigned long use_count; - unsigned long blocks_reserved; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + refcount_t use_count; + unsigned int type; short aborted; - short adding_csums; + bool adding_csums; bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; bool sync; bool dirty; - unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ce4ed6ec8f39..c3c8d48f6618 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -30,6 +30,7 @@ #include "tree-checker.h" #include "disk-io.h" #include "compression.h" +#include "hash.h" /* * Error message should follow the following format: @@ -223,6 +224,142 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, } /* + * Customized reported for dir_item, only important new info is key->objectid, + * which represents inode number + */ +__printf(4, 5) +static void dir_item_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, &vaf); + va_end(args); +} + +static int check_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_dir_item *di; + u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 cur = 0; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + while (cur < item_size) { + u32 name_len; + u32 data_len; + u32 max_name_len; + u32 total_size; + u32 name_hash; + u8 dir_type; + + /* header itself should not cross item boundary */ + if (cur + sizeof(*di) > item_size) { + dir_item_err(root, leaf, slot, + "dir item header crosses item boundary, have %zu boundary %u", + cur + sizeof(*di), item_size); + return -EUCLEAN; + } + + /* dir type check */ + dir_type = btrfs_dir_type(leaf, di); + if (dir_type >= BTRFS_FT_MAX) { + dir_item_err(root, leaf, slot, + "invalid dir item type, have %u expect [0, %u)", + dir_type, BTRFS_FT_MAX); + return -EUCLEAN; + } + + if (key->type == BTRFS_XATTR_ITEM_KEY && + dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "invalid dir item type for XATTR key, have %u expect %u", + dir_type, BTRFS_FT_XATTR); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR && + key->type != BTRFS_XATTR_ITEM_KEY) { + dir_item_err(root, leaf, slot, + "xattr dir type found for non-XATTR key"); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR) + max_name_len = XATTR_NAME_MAX; + else + max_name_len = BTRFS_NAME_LEN; + + /* Name/data length check */ + name_len = btrfs_dir_name_len(leaf, di); + data_len = btrfs_dir_data_len(leaf, di); + if (name_len > max_name_len) { + dir_item_err(root, leaf, slot, + "dir item name len too long, have %u max %u", + name_len, max_name_len); + return -EUCLEAN; + } + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) { + dir_item_err(root, leaf, slot, + "dir item name and data len too long, have %u max %u", + name_len + data_len, + BTRFS_MAX_XATTR_SIZE(root->fs_info)); + return -EUCLEAN; + } + + if (data_len && dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "dir item with invalid data len, have %u expect 0", + data_len); + return -EUCLEAN; + } + + total_size = sizeof(*di) + name_len + data_len; + + /* header and name/data should not cross item boundary */ + if (cur + total_size > item_size) { + dir_item_err(root, leaf, slot, + "dir item data crosses item boundary, have %u boundary %u", + cur + total_size, item_size); + return -EUCLEAN; + } + + /* + * Special check for XATTR/DIR_ITEM, as key->offset is name + * hash, should match its name + */ + if (key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_XATTR_ITEM_KEY) { + char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + + read_extent_buffer(leaf, namebuf, + (unsigned long)(di + 1), name_len); + name_hash = btrfs_name_hash(namebuf, name_len); + if (key->offset != name_hash) { + dir_item_err(root, leaf, slot, + "name hash mismatch with key, have 0x%016x expect 0x%016llx", + name_hash, key->offset); + return -EUCLEAN; + } + } + cur += total_size; + di = (struct btrfs_dir_item *)((void *)di + total_size); + } + return 0; +} + +/* * Common point to switch the item-specific validation. */ static int check_leaf_item(struct btrfs_root *root, @@ -238,6 +375,11 @@ static int check_leaf_item(struct btrfs_root *root, case BTRFS_EXTENT_CSUM_KEY: ret = check_csum_item(root, leaf, key, slot); break; + case BTRFS_DIR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + ret = check_dir_item(root, leaf, key, slot); + break; } return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7bf9b31561db..afadaadab18e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> +#include <linux/iversion.h> #include "tree-log.h" #include "disk-io.h" #include "locking.h" @@ -1173,19 +1174,15 @@ next: return 0; } -static int extref_get_fields(struct extent_buffer *eb, int slot, - unsigned long ref_ptr, u32 *namelen, char **name, - u64 *index, u64 *parent_objectid) +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); - if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name, - *namelen)) - return -EIO; - *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1200,19 +1197,14 @@ static int extref_get_fields(struct extent_buffer *eb, int slot, return 0; } -static int ref_get_fields(struct extent_buffer *eb, int slot, - unsigned long ref_ptr, u32 *namelen, char **name, - u64 *index) +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); - if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1), - *namelen)) - return -EIO; - *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1287,8 +1279,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, slot, ref_ptr, &namelen, - &name, &ref_index, &parent_objectid); + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); /* * parent object can change from one array * item to another. @@ -1300,8 +1292,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, slot, ref_ptr, &namelen, - &name, &ref_index); + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); } if (ret) goto out; @@ -1835,7 +1827,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; @@ -1848,8 +1839,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, slot, di)) - return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); if (ret < 0) @@ -2024,11 +2013,6 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, slot, di)) { - ret = -EIO; - goto out; - } - name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); if (!name) { @@ -2109,7 +2093,6 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_path *path, const u64 ino) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2151,11 +2134,6 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; - ret = verify_dir_item(fs_info, path->nodes[0], i, di); - if (ret) { - ret = -EIO; - goto out; - } name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; @@ -3609,7 +3587,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), &token); - btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_sequence(leaf, item, + inode_peek_iversion(inode), &token); btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); @@ -4572,12 +4551,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, this_len = sizeof(*extref) + this_name_len; } - ret = btrfs_is_name_len_valid(eb, slot, name_ptr, - this_name_len); - if (!ret) { - ret = -EIO; - goto out; - } if (this_name_len > name_len) { char *new_name; @@ -5432,11 +5405,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *parent, const loff_t start, const loff_t end, - int exists_only, + int inode_only, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; - int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; struct dentry *old_parent = NULL; int ret = 0; @@ -5602,7 +5574,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, int ret; ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), - parent, start, end, 0, ctx); + parent, start, end, LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -5865,6 +5837,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, return 0; return btrfs_log_inode_parent(trans, root, inode, parent, 0, - LLONG_MAX, 1, NULL); + LLONG_MAX, LOG_INODE_EXISTS, NULL); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49810b70afd3..b5036bd69e6a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -145,6 +145,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); +/* + * Device locking + * ============== + * + * There are several mutexes that protect manipulation of devices and low-level + * structures like chunks but not block groups, extents or files + * + * uuid_mutex (global lock) + * ------------------------ + * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from + * the SCAN_DEV ioctl registration or from mount either implicitly (the first + * device) or requested by the device= mount option + * + * the mutex can be very coarse and can cover long-running operations + * + * protects: updates to fs_devices counters like missing devices, rw devices, + * seeding, structure cloning, openning/closing devices at mount/umount time + * + * global::fs_devs - add, remove, updates to the global list + * + * does not protect: manipulation of the fs_devices::devices list! + * + * btrfs_device::name - renames (write side), read is RCU + * + * fs_devices::device_list_mutex (per-fs, with RCU) + * ------------------------------------------------ + * protects updates to fs_devices::devices, ie. adding and deleting + * + * simple list traversal with read-only actions can be done with RCU protection + * + * may be used to exclude some operations from running concurrently without any + * modifications to the list (see write_all_supers) + * + * volume_mutex + * ------------ + * coarse lock owned by a mounted filesystem; used to exclude some operations + * that cannot run in parallel and affect the higher-level properties of the + * filesystem like: device add/deleting/resize/replace, or balance + * + * balance_mutex + * ------------- + * protects balance structures (status, state) and context accessed from + * several places (internally, ioctl) + * + * chunk_mutex + * ----------- + * protects chunks, adding or removing during allocation, trim or when a new + * device is added/removed + * + * cleaner_mutex + * ------------- + * a big lock that is held by the cleaner thread and prevents running subvolume + * cleaning together with relocation or delayed iputs + * + * + * Lock nesting + * ============ + * + * uuid_mutex + * volume_mutex + * device_list_mutex + * chunk_mutex + * balance_mutex + */ + DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); struct list_head *btrfs_get_fs_uuids(void) @@ -180,6 +245,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) return fs_devs; } +static void free_device(struct btrfs_device *device) +{ + rcu_string_free(device->name); + bio_put(device->flush_bio); + kfree(device); +} + static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; @@ -188,9 +260,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); + free_device(device); } kfree(fs_devices); } @@ -220,6 +290,11 @@ void btrfs_cleanup_fs_uuids(void) } } +/* + * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. + * Returned struct is not linked onto any lists and must be destroyed using + * free_device. + */ static struct btrfs_device *__alloc_device(void) { struct btrfs_device *dev; @@ -237,7 +312,6 @@ static struct btrfs_device *__alloc_device(void) kfree(dev); return ERR_PTR(-ENOMEM); } - bio_get(dev->flush_bio); INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); @@ -245,7 +319,6 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->io_lock); - spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); @@ -531,45 +604,42 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } - -static void btrfs_free_stale_device(struct btrfs_device *cur_dev) +/* + * Search and remove all stale (devices which are not mounted) devices. + * When both inputs are NULL, it will search and release all stale devices. + * path: Optional. When provided will it release all unmounted devices + * matching this path only. + * skip_dev: Optional. Will skip this device when searching for the stale + * devices. + */ +static void btrfs_free_stale_devices(const char *path, + struct btrfs_device *skip_dev) { - struct btrfs_fs_devices *fs_devs; - struct btrfs_device *dev; - - if (!cur_dev->name) - return; + struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; + struct btrfs_device *dev, *tmp_dev; - list_for_each_entry(fs_devs, &fs_uuids, list) { - int del = 1; + list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { if (fs_devs->opened) continue; - if (fs_devs->seeding) - continue; - list_for_each_entry(dev, &fs_devs->devices, dev_list) { + list_for_each_entry_safe(dev, tmp_dev, + &fs_devs->devices, dev_list) { + int not_found = 0; - if (dev == cur_dev) + if (skip_dev && skip_dev == dev) continue; - if (!dev->name) + if (path && !dev->name) continue; - /* - * Todo: This won't be enough. What if the same device - * comes back (with new uuid and) with its mapper path? - * But for now, this does help as mostly an admin will - * either use mapper or non mapper path throughout. - */ rcu_read_lock(); - del = strcmp(rcu_str_deref(dev->name), - rcu_str_deref(cur_dev->name)); + if (path) + not_found = strcmp(rcu_str_deref(dev->name), + path); rcu_read_unlock(); - if (!del) - break; - } + if (not_found) + continue; - if (!del) { /* delete the stale device */ if (fs_devs->num_devices == 1) { btrfs_sysfs_remove_fsid(fs_devs); @@ -578,38 +648,99 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev) } else { fs_devs->num_devices--; list_del(&dev->dev_list); - rcu_string_free(dev->name); - bio_put(dev->flush_bio); - kfree(dev); + free_device(dev); } - break; } } } +static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device, fmode_t flags, + void *holder) +{ + struct request_queue *q; + struct block_device *bdev; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 devid; + int ret; + + if (device->bdev) + return -EINVAL; + if (!device->name) + return -EINVAL; + + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh); + if (ret) + return ret; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + fs_devices->seeding = 1; + } else { + if (bdev_read_only(bdev)) + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + else + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + } + + q = bdev_get_queue(bdev); + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; + + device->bdev = bdev; + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + device->mode = flags; + + fs_devices->open_devices++; + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, &fs_devices->alloc_list); + } + brelse(bh); + + return 0; + +error_brelse: + brelse(bh); + blkdev_put(bdev, flags); + + return -EINVAL; +} + /* * Add new device to list of registered devices * * Returns: - * 1 - first time device is seen - * 0 - device already known - * < 0 - error + * device pointer which was just added or updated when successful + * error pointer when failed */ -static noinline int device_list_add(const char *path, - struct btrfs_super_block *disk_super, - u64 devid, struct btrfs_fs_devices **fs_devices_ret) +static noinline struct btrfs_device *device_list_add(const char *path, + struct btrfs_super_block *disk_super) { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; struct rcu_string *name; - int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); + u64 devid = btrfs_stack_device_id(&disk_super->dev_item); fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) - return PTR_ERR(fs_devices); + return ERR_CAST(fs_devices); list_add(&fs_devices->list, &fs_uuids); @@ -621,20 +752,19 @@ static noinline int device_list_add(const char *path, if (!device) { if (fs_devices->opened) - return -EBUSY; + return ERR_PTR(-EBUSY); device = btrfs_alloc_device(NULL, &devid, disk_super->dev_item.uuid); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - return PTR_ERR(device); + return device; } name = rcu_string_strdup(path, GFP_NOFS); if (!name) { - bio_put(device->flush_bio); - kfree(device); - return -ENOMEM; + free_device(device); + return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); @@ -643,8 +773,16 @@ static noinline int device_list_add(const char *path, fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); - ret = 1; device->fs_devices = fs_devices; + btrfs_free_stale_devices(path, device); + + if (disk_super->label[0]) + pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", + disk_super->label, devid, found_transid, path); + else + pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", + disk_super->fsid, devid, found_transid, path); + } else if (!device->name || strcmp(device->name->str, path)) { /* * When FS is already mounted. @@ -680,17 +818,17 @@ static noinline int device_list_add(const char *path, * with larger generation number or the last-in if * generation are equal. */ - return -EEXIST; + return ERR_PTR(-EEXIST); } name = rcu_string_strdup(path, GFP_NOFS); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rcu_string_free(device->name); rcu_assign_pointer(device->name, name); - if (device->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; - device->missing = 0; + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } } @@ -703,16 +841,9 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; - /* - * if there is new btrfs on an already registered device, - * then remove the stale device entry. - */ - if (ret > 0) - btrfs_free_stale_device(device); - - *fs_devices_ret = fs_devices; + fs_devices->total_devices = btrfs_super_num_devices(disk_super); - return ret; + return device; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -745,8 +876,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); + free_device(device); goto error; } rcu_assign_pointer(device->name, name); @@ -773,10 +903,12 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) { - if (!device->is_tgtdev_for_dev_replace && - (!latest_dev || - device->generation > latest_dev->generation)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state)) { + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state) && + (!latest_dev || + device->generation > latest_dev->generation)) { latest_dev = device; } continue; @@ -793,7 +925,8 @@ again: * not, which means whether this device is * used or whether it should be removed. */ - if (step == 0 || device->is_tgtdev_for_dev_replace) { + if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state)) { continue; } } @@ -802,17 +935,16 @@ again: device->bdev = NULL; fs_devices->open_devices--; } - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); - device->writeable = 0; - if (!device->is_tgtdev_for_dev_replace) + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state)) fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); + free_device(device); } if (fs_devices->seed) { @@ -825,35 +957,25 @@ again: mutex_unlock(&uuid_mutex); } -static void __free_device(struct work_struct *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, rcu_work); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); -} - -static void free_device(struct rcu_head *head) +static void free_device_rcu(struct rcu_head *head) { struct btrfs_device *device; device = container_of(head, struct btrfs_device, rcu); - - INIT_WORK(&device->rcu_work, __free_device); - schedule_work(&device->rcu_work); + free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) { - if (device->bdev && device->writeable) { + if (!device->bdev) + return; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } - if (device->bdev) - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); } static void btrfs_prepare_close_one_device(struct btrfs_device *device) @@ -865,13 +987,13 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) if (device->bdev) fs_devices->open_devices--; - if (device->writeable && + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } - if (device->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; new_device = btrfs_alloc_device(NULL, &device->devid, @@ -917,7 +1039,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); } WARN_ON(fs_devices->open_devices); @@ -947,93 +1069,32 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } - /* - * Wait for rcu kworkers under __btrfs_close_devices - * to finish all blkdev_puts so device is really - * free when umount is done. - */ - rcu_barrier(); return ret; } static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { - struct request_queue *q; - struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - u64 devid; - int seeding = 1; int ret = 0; flags |= FMODE_EXCL; list_for_each_entry(device, head, dev_list) { - if (device->bdev) - continue; - if (!device->name) - continue; - /* Just open everything we can; ignore failures here */ - if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh)) + if (btrfs_open_one_device(fs_devices, device, flags, holder)) continue; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - if (devid != device->devid) - goto error_brelse; - - if (memcmp(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE)) - goto error_brelse; - - device->generation = btrfs_super_generation(disk_super); if (!latest_dev || device->generation > latest_dev->generation) latest_dev = device; - - if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { - device->writeable = 0; - } else { - device->writeable = !bdev_read_only(bdev); - seeding = 0; - } - - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; - - device->bdev = bdev; - device->in_fs_metadata = 0; - device->mode = flags; - - fs_devices->open_devices++; - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - fs_devices->rw_devices++; - list_add(&device->dev_alloc_list, - &fs_devices->alloc_list); - } - brelse(bh); - continue; - -error_brelse: - brelse(bh); - blkdev_put(bdev, flags); - continue; } if (fs_devices->open_devices == 0) { ret = -EINVAL; goto out; } - fs_devices->seeding = seeding; fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; @@ -1117,12 +1178,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_super_block *disk_super; + struct btrfs_device *device; struct block_device *bdev; struct page *page; - int ret = -EINVAL; - u64 devid; - u64 transid; - u64 total_devices; + int ret = 0; u64 bytenr; /* @@ -1141,26 +1200,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } - if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { + ret = -EINVAL; goto error_bdev_put; - - devid = btrfs_stack_device_id(&disk_super->dev_item); - transid = btrfs_super_generation(disk_super); - total_devices = btrfs_super_num_devices(disk_super); - - ret = device_list_add(path, disk_super, devid, fs_devices_ret); - if (ret > 0) { - if (disk_super->label[0]) { - pr_info("BTRFS: device label %s ", disk_super->label); - } else { - pr_info("BTRFS: device fsid %pU ", disk_super->fsid); - } - - pr_cont("devid %llu transid %llu %s\n", devid, transid, path); - ret = 0; } - if (!ret && fs_devices_ret) - (*fs_devices_ret)->total_devices = total_devices; + + device = device_list_add(path, disk_super); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + *fs_devices_ret = device->fs_devices; btrfs_release_disk_super(page); @@ -1186,7 +1235,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, *length = 0; - if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) + if (start >= device->total_bytes || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return 0; path = btrfs_alloc_path(); @@ -1364,7 +1414,8 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, max_hole_size = 0; again: - if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { + if (search_start >= search_end || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -ENOSPC; goto out; } @@ -1571,8 +1622,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - WARN_ON(!device->in_fs_metadata); - WARN_ON(device->is_tgtdev_for_dev_replace); + WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1662,7 +1713,7 @@ error: * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ -static int btrfs_add_device(struct btrfs_trans_handle *trans, +static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_device *device) { @@ -1818,7 +1869,8 @@ static struct btrfs_device * btrfs_find_next_active_device( list_for_each_entry(next_device, &fs_devs->devices, dev_list) { if (next_device != device && - !next_device->missing && next_device->bdev) + !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) + && next_device->bdev) return next_device; } @@ -1859,6 +1911,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; + mutex_lock(&fs_info->volume_mutex); mutex_lock(&uuid_mutex); num_devices = fs_info->fs_devices->num_devices; @@ -1878,17 +1931,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (ret) goto out; - if (device->is_tgtdev_for_dev_replace) { + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto out; } - if (device->writeable && fs_info->fs_devices->rw_devices == 1) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto out; } - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; @@ -1910,7 +1964,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (ret) goto error_undo; - device->in_fs_metadata = 0; + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(fs_info, device); /* @@ -1930,7 +1984,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, device->fs_devices->num_devices--; device->fs_devices->total_devices--; - if (device->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) device->fs_devices->missing_devices--; btrfs_assign_next_active_device(fs_info, device, NULL); @@ -1950,11 +2004,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * the devices list. All that's left is to zero out the old * supers and free the device. */ - if (device->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) btrfs_scratch_superblocks(device->bdev, device->name->str); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1973,10 +2027,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, out: mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_info->volume_mutex); return ret; error_undo: - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, &fs_info->fs_devices->alloc_list); @@ -2004,10 +2059,10 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, list_del_rcu(&srcdev->dev_list); list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; - if (srcdev->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) fs_devices->missing_devices--; - if (srcdev->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) fs_devices->rw_devices--; if (srcdev->bdev) @@ -2019,13 +2074,13 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (srcdev->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { /* zero out the old super if it is writable */ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device); + call_rcu(&srcdev->rcu, free_device_rcu); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { @@ -2084,7 +2139,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); btrfs_close_bdev(tgtdev); - call_rcu(&tgtdev->rcu, free_device); + call_rcu(&tgtdev->rcu, free_device_rcu); } static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, @@ -2129,7 +2184,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, * is held by the caller. */ list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &tmp->dev_state) && !tmp->bdev) { *device = tmp; break; } @@ -2358,26 +2414,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); ret = -ENOMEM; - goto error; + goto error_free_device; } rcu_assign_pointer(device->name, name); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); ret = PTR_ERR(trans); - goto error; + goto error_free_device; } q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - device->writeable = 1; + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; @@ -2388,8 +2437,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->commit_total_bytes = device->total_bytes; device->fs_info = fs_info; device->bdev = bdev; - device->in_fs_metadata = 1; - device->is_tgtdev_for_dev_replace = 0; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2450,7 +2499,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } } - ret = btrfs_add_device(trans, fs_info, device); + ret = btrfs_add_dev_item(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; @@ -2511,9 +2560,8 @@ error_trans: sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); +error_free_device: + free_device(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev && !unlocked) { @@ -2528,7 +2576,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { - struct request_queue *q; struct btrfs_device *device; struct block_device *bdev; struct list_head *devices; @@ -2579,18 +2626,14 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); + free_device(device); ret = -ENOMEM; goto error; } rcu_assign_pointer(device->name, name); - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; mutex_lock(&fs_info->fs_devices->device_list_mutex); - device->writeable = 1; + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; @@ -2603,8 +2646,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->in_fs_metadata = 1; - device->is_tgtdev_for_dev_replace = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2632,7 +2675,7 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, tgtdev->io_align = sectorsize; tgtdev->sector_size = sectorsize; tgtdev->fs_info = fs_info; - tgtdev->in_fs_metadata = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state); } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2690,7 +2733,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, u64 old_total; u64 diff; - if (!device->writeable) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; new_size = round_down(new_size, fs_info->sectorsize); @@ -2700,7 +2743,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || - device->is_tgtdev_for_dev_replace) { + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { mutex_unlock(&fs_info->chunk_mutex); return -EINVAL; } @@ -3044,6 +3087,48 @@ error: return ret; } +/* + * return 1 : allocate a data chunk successfully, + * return <0: errors during allocating a data chunk, + * return 0 : no need to allocate a data chunk. + */ +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, + u64 chunk_offset) +{ + struct btrfs_block_group_cache *cache; + u64 bytes_used; + u64 chunk_type; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + ASSERT(cache); + chunk_type = cache->flags; + btrfs_put_block_group(cache); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if (!bytes_used) { + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_force_chunk_alloc(trans, fs_info, + BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans); + if (ret < 0) + return ret; + + return 1; + } + } + return 0; +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3502,7 +3587,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; - u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3510,10 +3594,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!device->writeable || + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || - device->is_tgtdev_for_dev_replace) + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -3661,28 +3745,21 @@ again: goto loop; } - ASSERT(fs_info->data_sinfo); - spin_lock(&fs_info->data_sinfo->lock); - bytes_used = fs_info->data_sinfo->bytes_used; - spin_unlock(&fs_info->data_sinfo->lock); - - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && - !chunk_reserved && !bytes_used) { - trans = btrfs_start_transaction(chunk_root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - ret = PTR_ERR(trans); - goto error; - } - - ret = btrfs_force_chunk_alloc(trans, fs_info, - BTRFS_BLOCK_GROUP_DATA); - btrfs_end_transaction(trans); + if (!chunk_reserved) { + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, + found_key.offset); if (ret < 0) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } else if (ret == 1) { + chunk_reserved = 1; } - chunk_reserved = 1; } ret = btrfs_relocate_chunk(fs_info, found_key.offset); @@ -4381,7 +4458,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) new_size = round_down(new_size, fs_info->sectorsize); diff = round_down(old_size - new_size, fs_info->sectorsize); - if (device->is_tgtdev_for_dev_replace) + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return -EINVAL; path = btrfs_alloc_path(); @@ -4393,7 +4470,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; atomic64_sub(diff, &fs_info->free_chunk_space); } @@ -4445,6 +4522,18 @@ again: chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + goto done; + } + ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) @@ -4518,7 +4607,7 @@ done: if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) device->fs_devices->total_rw_bytes += diff; atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); @@ -4678,14 +4767,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_avail; u64 dev_offset; - if (!device->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); continue; } - if (!device->in_fs_metadata || - device->is_tgtdev_for_dev_replace) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (device->total_bytes > device->bytes_used) @@ -5033,12 +5123,13 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].dev->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, + &map->stripes[i].dev->dev_state)) { miss_ndevs++; continue; } - - if (!map->stripes[i].dev->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &map->stripes[i].dev->dev_state)) { readonly = 1; goto end; } @@ -5104,7 +5195,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - ret = 3; + /* + * There could be two corrupted data stripes, we need + * to loop retry in order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the + * stripe under reconstruction. + */ + ret = map->num_stripes; else ret = 1; free_extent_map(em); @@ -6004,15 +6102,14 @@ static void btrfs_end_bio(struct bio *bio) dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { if (bio_op(bio) == REQ_OP_WRITE) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); - btrfs_dev_stat_print_on_error(dev); } } } @@ -6062,16 +6159,15 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (device->missing || !device->bdev) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || + !device->bdev) { bio_io_error(bio); return; } /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { - bio_get(bio); btrfsic_submit_bio(bio); - bio_put(bio); return; } @@ -6208,7 +6304,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || - (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { + (bio_op(first_bio) == REQ_OP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; } @@ -6257,7 +6354,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, device->fs_devices = fs_devices; fs_devices->num_devices++; - device->missing = 1; + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices++; return device; @@ -6273,8 +6370,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * is generated. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() - * on error. Returned struct is not linked onto any lists and can be - * destroyed with kfree() right away. + * on error. Returned struct is not linked onto any lists and must be + * destroyed with free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, @@ -6297,8 +6394,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { - bio_put(dev->flush_bio); - kfree(dev); + free_device(dev); return ERR_PTR(ret); } } @@ -6477,7 +6573,9 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, } btrfs_report_missing_device(fs_info, devid, uuid, false); } - map->stripes[i].dev->in_fs_metadata = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &(map->stripes[i].dev->dev_state)); + } write_lock(&map_tree->map_tree.lock); @@ -6506,7 +6604,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); - device->is_tgtdev_for_dev_replace = 0; + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -6618,7 +6716,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, dev_uuid, false); } - if(!device->bdev && !device->missing) { + if (!device->bdev && + !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { /* * this happens when a device that was properly setup * in the device info lists suddenly goes bad. @@ -6626,12 +6725,13 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, * device->missing to one here */ device->fs_devices->missing_devices++; - device->missing = 1; + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } /* Move the device to its own fs_devices */ if (device->fs_devices != fs_devices) { - ASSERT(device->missing); + ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, + &device->dev_state)); list_move(&device->dev_list, &fs_devices->devices); device->fs_devices->num_devices--; @@ -6645,15 +6745,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, } if (device->fs_devices != fs_info->fs_devices) { - BUG_ON(device->writeable); + BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); if (device->generation != btrfs_device_generation(leaf, dev_item)) return -EINVAL; } fill_device_from_item(leaf, dev_item, device); - device->in_fs_metadata = 1; - if (device->writeable && !device->is_tgtdev_for_dev_replace) { + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { device->fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes - device->bytes_used, &fs_info->free_chunk_space); @@ -6785,10 +6886,13 @@ out_short_read: /* * Check if all chunks in the fs are OK for read-write degraded mount * + * If the @failing_dev is specified, it's accounted as missing. + * * Return true if all chunks meet the minimal RW mount requirements. * Return false if any chunk doesn't meet the minimal RW mount requirements. */ -bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, + struct btrfs_device *failing_dev) { struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; @@ -6816,12 +6920,16 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; - if (!dev || !dev->bdev || dev->missing || + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || dev->last_flush_error) missing++; + else if (failing_dev && failing_dev == dev) + missing++; } if (missing > max_tolerated) { - btrfs_warn(fs_info, + if (!failing_dev) + btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writeable mount", em->start, missing, max_tolerated); free_extent_map(em); @@ -7092,10 +7200,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) + stats_cnt = atomic_read(&device->dev_stats_ccnt); + if (!device->dev_stats_valid || stats_cnt == 0) continue; - stats_cnt = atomic_read(&device->dev_stats_ccnt); + + /* + * There is a LOAD-LOAD control dependency between the value of + * dev_stats_ccnt and updating the on-disk values which requires + * reading the in-memory counters. Such control dependencies + * require explicit read memory barriers. + * + * This memory barriers pairs with smp_mb__before_atomic in + * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full + * barrier implied by atomic_xchg in + * btrfs_dev_stats_read_and_reset + */ + smp_rmb(); + ret = update_dev_stat_item(trans, fs_info, device); if (!ret) atomic_sub(stats_cnt, &device->dev_stats_ccnt); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ff15208344a7..28c28eeadff3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -47,6 +47,12 @@ struct btrfs_pending_bios { #define btrfs_device_data_ordered_init(device) do { } while (0) #endif +#define BTRFS_DEV_STATE_WRITEABLE (0) +#define BTRFS_DEV_STATE_IN_FS_METADATA (1) +#define BTRFS_DEV_STATE_MISSING (2) +#define BTRFS_DEV_STATE_REPLACE_TGT (3) +#define BTRFS_DEV_STATE_FLUSH_SENT (4) + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; @@ -69,11 +75,7 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; - int writeable; - int in_fs_metadata; - int missing; - int can_discard; - int is_tgtdev_for_dev_replace; + unsigned long dev_state; blk_status_t last_flush_error; int flush_bio_sent; @@ -129,14 +131,12 @@ struct btrfs_device { struct completion flush_wait; /* per-device scrub information */ - struct scrub_ctx *scrub_device; + struct scrub_ctx *scrub_ctx; struct btrfs_work work; struct rcu_head rcu; - struct work_struct rcu_work; /* readahead state */ - spinlock_t reada_lock; atomic_t reada_in_flight; u64 reada_next; struct reada_zone *reada_curr_zone; @@ -489,15 +489,16 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset); -static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) -{ - return atomic_read(&dev->dev_stats_ccnt); -} - static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { atomic_inc(dev->dev_stat_values + index); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). + */ smp_mb__before_atomic(); atomic_inc(&dev->dev_stats_ccnt); } @@ -514,7 +515,13 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, int ret; ret = atomic_xchg(dev->dev_stat_values + index, 0); - smp_mb__before_atomic(); + /* + * atomic_xchg implies a full memory barriers as per atomic_t.txt: + * - RMW operations that have a return value are fully ordered; + * + * This implicit memory barriers is paired with the smp_rmb in + * btrfs_run_dev_stats + */ atomic_inc(&dev->dev_stats_ccnt); return ret; } @@ -523,6 +530,12 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, int index, unsigned long val) { atomic_set(dev->dev_stat_values + index, val); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). + */ smp_mb__before_atomic(); atomic_inc(&dev->dev_stats_ccnt); } @@ -540,7 +553,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); - -bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, + struct btrfs_device *failing_dev); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 2c7e53f9ff1b..de7d072c78ef 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -23,6 +23,7 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/posix_acl_xattr.h> +#include <linux/iversion.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -267,7 +268,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key; struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret = 0; @@ -336,11 +336,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(fs_info, leaf, slot, di)) { - ret = -EIO; - goto err; - } - total_size += name_len + 1; /* * We are just looking for how big our buffer needs to diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 17f2dd8fddb8..01a4eab602a3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -43,6 +43,8 @@ struct workspace { size_t size; char *buf; struct list_head list; + ZSTD_inBuffer in_buf; + ZSTD_outBuffer out_buf; }; static void zstd_free_workspace(struct list_head *ws) @@ -94,8 +96,6 @@ static int zstd_compress_pages(struct list_head *ws, int nr_pages = 0; struct page *in_page = NULL; /* The current page to read */ struct page *out_page = NULL; /* The current page to write to */ - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; @@ -118,9 +118,9 @@ static int zstd_compress_pages(struct list_head *ws, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - in_buf.src = kmap(in_page); - in_buf.pos = 0; - in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.src = kmap(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ @@ -130,14 +130,15 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); while (1) { size_t ret2; - ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_compressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_compressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -146,22 +147,22 @@ static int zstd_compress_pages(struct list_head *ws, } /* Check to see if we are making it bigger */ - if (tot_in + in_buf.pos > 8192 && - tot_in + in_buf.pos < - tot_out + out_buf.pos) { + if (tot_in + workspace->in_buf.pos > 8192 && + tot_in + workspace->in_buf.pos < + tot_out + workspace->out_buf.pos) { ret = -E2BIG; goto out; } /* We've reached the end of our output range */ - if (out_buf.pos >= max_out) { - tot_out += out_buf.pos; + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } /* Check if we need more output space */ - if (out_buf.pos == out_buf.size) { + if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; kunmap(out_page); @@ -176,19 +177,20 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, + PAGE_SIZE); } /* We've reached the end of the input */ - if (in_buf.pos >= len) { - tot_in += in_buf.pos; + if (workspace->in_buf.pos >= len) { + tot_in += workspace->in_buf.pos; break; } /* Check if we need more input */ - if (in_buf.pos == in_buf.size) { + if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; kunmap(in_page); put_page(in_page); @@ -196,15 +198,15 @@ static int zstd_compress_pages(struct list_head *ws, start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - in_buf.src = kmap(in_page); - in_buf.pos = 0; - in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.src = kmap(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } } while (1) { size_t ret2; - ret2 = ZSTD_endStream(stream, &out_buf); + ret2 = ZSTD_endStream(stream, &workspace->out_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_endStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -212,11 +214,11 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } if (ret2 == 0) { - tot_out += out_buf.pos; + tot_out += workspace->out_buf.pos; break; } - if (out_buf.pos >= max_out) { - tot_out += out_buf.pos; + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } @@ -235,9 +237,9 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } if (tot_out >= tot_in) { @@ -273,8 +275,6 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -284,18 +284,19 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - in_buf.src = kmap(pages_in[page_in_index]); - in_buf.pos = 0; - in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); - out_buf.dst = workspace->buf; - out_buf.pos = 0; - out_buf.size = PAGE_SIZE; + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; while (1) { size_t ret2; - ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -303,38 +304,38 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } buf_start = total_out; - total_out += out_buf.pos; - out_buf.pos = 0; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; - ret = btrfs_decompress_buf2page(out_buf.dst, buf_start, - total_out, disk_start, orig_bio); + ret = btrfs_decompress_buf2page(workspace->out_buf.dst, + buf_start, total_out, disk_start, orig_bio); if (ret == 0) break; - if (in_buf.pos >= srclen) + if (workspace->in_buf.pos >= srclen) break; /* Check if we've hit the end of a frame */ if (ret2 == 0) break; - if (in_buf.pos == in_buf.size) { + if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap(pages_in[page_in_index++]); if (page_in_index >= total_pages_in) { - in_buf.src = NULL; + workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - in_buf.src = kmap(pages_in[page_in_index]); - in_buf.pos = 0; - in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } } ret = 0; zero_fill_bio(orig_bio); done: - if (in_buf.src) + if (workspace->in_buf.src) kunmap(pages_in[page_in_index]); return ret; } @@ -348,8 +349,6 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, ZSTD_DStream *stream; int ret = 0; size_t ret2; - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; unsigned long total_out = 0; unsigned long pg_offset = 0; char *kaddr; @@ -364,16 +363,17 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(size_t, destlen, PAGE_SIZE); - in_buf.src = data_in; - in_buf.pos = 0; - in_buf.size = srclen; + workspace->in_buf.src = data_in; + workspace->in_buf.pos = 0; + workspace->in_buf.size = srclen; - out_buf.dst = workspace->buf; - out_buf.pos = 0; - out_buf.size = PAGE_SIZE; + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; ret2 = 1; - while (pg_offset < destlen && in_buf.pos < in_buf.size) { + while (pg_offset < destlen + && workspace->in_buf.pos < workspace->in_buf.size) { unsigned long buf_start; unsigned long buf_offset; unsigned long bytes; @@ -384,7 +384,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = -EIO; goto finish; } - ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -393,8 +394,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, } buf_start = total_out; - total_out += out_buf.pos; - out_buf.pos = 0; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; if (total_out <= start_byte) continue; @@ -405,10 +406,11 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, buf_offset = 0; bytes = min_t(unsigned long, destlen - pg_offset, - out_buf.size - buf_offset); + workspace->out_buf.size - buf_offset); kaddr = kmap_atomic(dest_page); - memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes); + memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset, + bytes); kunmap_atomic(kaddr); pg_offset += bytes; diff --git a/fs/buffer.c b/fs/buffer.c index 0736a6a2e2f0..9a73924db22f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -53,13 +53,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) -void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) -{ - bh->b_end_io = handler; - bh->b_private = private; -} -EXPORT_SYMBOL(init_buffer); - inline void touch_buffer(struct buffer_head *bh) { trace_block_touch_buffer(bh); @@ -922,7 +915,8 @@ init_page_buffers(struct page *page, struct block_device *bdev, do { if (!buffer_mapped(bh)) { - init_buffer(bh, NULL, NULL); + bh->b_end_io = NULL; + bh->b_private = NULL; bh->b_bdev = bdev; bh->b_blocknr = block; if (uptodate) @@ -3014,7 +3008,7 @@ static void end_bio_bh_io_sync(struct bio *bio) void guard_bio_eod(int op, struct bio *bio) { sector_t maxsector; - struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + struct bio_vec *bvec = bio_last_bvec_all(bio); unsigned truncated_bytes; struct hd_struct *part; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 1ee54ffd3a24..7edbd0679952 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -31,7 +31,7 @@ static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t, loff_t *); static ssize_t cachefiles_daemon_write(struct file *, const char __user *, size_t, loff_t *); -static unsigned int cachefiles_daemon_poll(struct file *, +static __poll_t cachefiles_daemon_poll(struct file *, struct poll_table_struct *); static int cachefiles_daemon_frun(struct cachefiles_cache *, char *); static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *); @@ -291,11 +291,11 @@ found_command: * poll for culling state * - use POLLOUT to indicate culling state */ -static unsigned int cachefiles_daemon_poll(struct file *file, +static __poll_t cachefiles_daemon_poll(struct file *file, struct poll_table_struct *poll) { struct cachefiles_cache *cache = file->private_data; - unsigned int mask; + __poll_t mask; poll_wait(file, &cache->daemon_pollwq, poll); mask = 0; diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 264e9bf83ff3..52095f473464 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -34,7 +34,4 @@ config CEPH_FS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index d5b2e12b5d02..687da62daf4e 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -108,14 +108,13 @@ config CIFS_XATTR depends on CIFS help Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). CIFS maps the name of - extended attributes beginning with the user namespace prefix - to SMB/CIFS EAs. EAs are stored on Windows servers without the - user namespace prefix, but their names are seen by Linux cifs clients - prefaced by the user namespace prefix. The system namespace - (used by some filesystems to store ACLs) is not supported at - this time. + the kernel or by users (see the attr(5) manual page for details). + CIFS maps the name of extended attributes beginning with the user + namespace prefix to SMB/CIFS EAs. EAs are stored on Windows + servers without the user namespace prefix, but their names are + seen by Linux cifs clients prefaced by the user namespace prefix. + The system namespace (used by some filesystems to store ACLs) is + not supported at this time. If unsure, say Y. @@ -196,6 +195,14 @@ config CIFS_SMB311 This dialect includes improved security negotiation features. If unsure, say N +config CIFS_SMB_DIRECT + bool "SMB Direct support (Experimental)" + depends on CIFS=m && INFINIBAND || CIFS=y && INFINIBAND=y + help + Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1. + SMB Direct allows transferring SMB packets over RDMA. If unsure, + say N. + config CIFS_FSCACHE bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 7134f182720b..7e4a1e2f0696 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -19,3 +19,5 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o + +cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index cbb9534b89b4..c7a863219fa3 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -30,6 +30,9 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#ifdef CONFIG_CIFS_SMB_DIRECT +#include "smbdirect.h" +#endif void cifs_dump_mem(char *label, void *data, int length) @@ -107,6 +110,32 @@ void cifs_dump_mids(struct TCP_Server_Info *server) } #ifdef CONFIG_PROC_FS +static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) +{ + __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + + seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count); + if (tcon->nativeFileSystem) + seq_printf(m, "Type: %s ", tcon->nativeFileSystem); + seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d", + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), + le32_to_cpu(tcon->fsAttrInfo.Attributes), + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), + tcon->tidStatus); + if (dev_type == FILE_DEVICE_DISK) + seq_puts(m, " type: DISK "); + else if (dev_type == FILE_DEVICE_CD_ROM) + seq_puts(m, " type: CDROM "); + else + seq_printf(m, " type: %d ", dev_type); + if (tcon->ses->server->ops->dump_share_caps) + tcon->ses->server->ops->dump_share_caps(m, tcon); + + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_putc(m, '\n'); +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp1, *tmp2, *tmp3; @@ -115,7 +144,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct cifs_ses *ses; struct cifs_tcon *tcon; int i, j; - __u32 dev_type; seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" @@ -152,6 +180,72 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + +#ifdef CONFIG_CIFS_SMB_DIRECT + if (!server->rdma) + goto skip_rdma; + + seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " + "transport status: %x", + server->smbd_conn->protocol, + server->smbd_conn->transport_status); + seq_printf(m, "\nConn receive_credit_max: %x " + "send_credit_target: %x max_send_size: %x", + server->smbd_conn->receive_credit_max, + server->smbd_conn->send_credit_target, + server->smbd_conn->max_send_size); + seq_printf(m, "\nConn max_fragmented_recv_size: %x " + "max_fragmented_send_size: %x max_receive_size:%x", + server->smbd_conn->max_fragmented_recv_size, + server->smbd_conn->max_fragmented_send_size, + server->smbd_conn->max_receive_size); + seq_printf(m, "\nConn keep_alive_interval: %x " + "max_readwrite_size: %x rdma_readwrite_threshold: %x", + server->smbd_conn->keep_alive_interval, + server->smbd_conn->max_readwrite_size, + server->smbd_conn->rdma_readwrite_threshold); + seq_printf(m, "\nDebug count_get_receive_buffer: %x " + "count_put_receive_buffer: %x count_send_empty: %x", + server->smbd_conn->count_get_receive_buffer, + server->smbd_conn->count_put_receive_buffer, + server->smbd_conn->count_send_empty); + seq_printf(m, "\nRead Queue count_reassembly_queue: %x " + "count_enqueue_reassembly_queue: %x " + "count_dequeue_reassembly_queue: %x " + "fragment_reassembly_remaining: %x " + "reassembly_data_length: %x " + "reassembly_queue_length: %x", + server->smbd_conn->count_reassembly_queue, + server->smbd_conn->count_enqueue_reassembly_queue, + server->smbd_conn->count_dequeue_reassembly_queue, + server->smbd_conn->fragment_reassembly_remaining, + server->smbd_conn->reassembly_data_length, + server->smbd_conn->reassembly_queue_length); + seq_printf(m, "\nCurrent Credits send_credits: %x " + "receive_credits: %x receive_credit_target: %x", + atomic_read(&server->smbd_conn->send_credits), + atomic_read(&server->smbd_conn->receive_credits), + server->smbd_conn->receive_credit_target); + seq_printf(m, "\nPending send_pending: %x send_payload_pending:" + " %x smbd_send_pending: %x smbd_recv_pending: %x", + atomic_read(&server->smbd_conn->send_pending), + atomic_read(&server->smbd_conn->send_payload_pending), + server->smbd_conn->smbd_send_pending, + server->smbd_conn->smbd_recv_pending); + seq_printf(m, "\nReceive buffers count_receive_queue: %x " + "count_empty_packet_queue: %x", + server->smbd_conn->count_receive_queue, + server->smbd_conn->count_empty_packet_queue); + seq_printf(m, "\nMR responder_resources: %x " + "max_frmr_depth: %x mr_type: %x", + server->smbd_conn->responder_resources, + server->smbd_conn->max_frmr_depth, + server->smbd_conn->mr_type); + seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x", + atomic_read(&server->smbd_conn->mr_ready_count), + atomic_read(&server->smbd_conn->mr_used_count)); +skip_rdma: +#endif seq_printf(m, "\nNumber of credits: %d", server->credits); i++; list_for_each(tmp2, &server->smb_ses_list) { @@ -176,6 +270,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } + if (server->rdma) + seq_printf(m, "RDMA\n\t"); seq_printf(m, "TCP status: %d\n\tLocal Users To " "Server: %d SecMode: 0x%x Req On Wire: %d", server->tcpStatus, server->srv_count, @@ -189,35 +285,19 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_puts(m, "\n\tShares:"); j = 0; + + seq_printf(m, "\n\t%d) IPC: ", j); + if (ses->tcon_ipc) + cifs_debug_tcon(m, ses->tcon_ipc); + else + seq_puts(m, "none\n"); + list_for_each(tmp3, &ses->tcon_list) { tcon = list_entry(tmp3, struct cifs_tcon, tcon_list); ++j; - dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - seq_printf(m, "\n\t%d) %s Mounts: %d ", j, - tcon->treeName, tcon->tc_count); - if (tcon->nativeFileSystem) { - seq_printf(m, "Type: %s ", - tcon->nativeFileSystem); - } - seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" - "\n\tPathComponentMax: %d Status: %d", - le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), - le32_to_cpu(tcon->fsAttrInfo.Attributes), - le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), - tcon->tidStatus); - if (dev_type == FILE_DEVICE_DISK) - seq_puts(m, " type: DISK "); - else if (dev_type == FILE_DEVICE_CD_ROM) - seq_puts(m, " type: CDROM "); - else - seq_printf(m, " type: %d ", dev_type); - if (server->ops->dump_share_caps) - server->ops->dump_share_caps(m, tcon); - - if (tcon->need_reconnect) - seq_puts(m, "\tDISCONNECTED "); - seq_putc(m, '\n'); + seq_printf(m, "\n\t%d) ", j); + cifs_debug_tcon(m, tcon); } seq_puts(m, "\n\tMIDs:\n"); @@ -374,6 +454,45 @@ static const struct file_operations cifs_stats_proc_fops = { }; #endif /* STATS */ +#ifdef CONFIG_CIFS_SMB_DIRECT +#define PROC_FILE_DEFINE(name) \ +static ssize_t name##_write(struct file *file, const char __user *buffer, \ + size_t count, loff_t *ppos) \ +{ \ + int rc; \ + rc = kstrtoint_from_user(buffer, count, 10, & name); \ + if (rc) \ + return rc; \ + return count; \ +} \ +static int name##_proc_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", name ); \ + return 0; \ +} \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_proc_show, NULL); \ +} \ +\ +static const struct file_operations cifs_##name##_proc_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + .write = name##_write, \ +} + +PROC_FILE_DEFINE(rdma_readwrite_threshold); +PROC_FILE_DEFINE(smbd_max_frmr_depth); +PROC_FILE_DEFINE(smbd_keep_alive_interval); +PROC_FILE_DEFINE(smbd_max_receive_size); +PROC_FILE_DEFINE(smbd_max_fragmented_recv_size); +PROC_FILE_DEFINE(smbd_max_send_size); +PROC_FILE_DEFINE(smbd_send_credit_target); +PROC_FILE_DEFINE(smbd_receive_credit_max); +#endif + static struct proc_dir_entry *proc_fs_cifs; static const struct file_operations cifsFYI_proc_fops; static const struct file_operations cifs_lookup_cache_proc_fops; @@ -401,6 +520,24 @@ cifs_proc_init(void) &cifs_security_flags_proc_fops); proc_create("LookupCacheEnabled", 0, proc_fs_cifs, &cifs_lookup_cache_proc_fops); +#ifdef CONFIG_CIFS_SMB_DIRECT + proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs, + &cifs_rdma_readwrite_threshold_proc_fops); + proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs, + &cifs_smbd_max_frmr_depth_proc_fops); + proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs, + &cifs_smbd_keep_alive_interval_proc_fops); + proc_create("smbd_max_receive_size", 0, proc_fs_cifs, + &cifs_smbd_max_receive_size_proc_fops); + proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs, + &cifs_smbd_max_fragmented_recv_size_proc_fops); + proc_create("smbd_max_send_size", 0, proc_fs_cifs, + &cifs_smbd_max_send_size_proc_fops); + proc_create("smbd_send_credit_target", 0, proc_fs_cifs, + &cifs_smbd_send_credit_target_proc_fops); + proc_create("smbd_receive_credit_max", 0, proc_fs_cifs, + &cifs_smbd_receive_credit_max_proc_fops); +#endif } void @@ -418,6 +555,16 @@ cifs_proc_clean(void) remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); +#ifdef CONFIG_CIFS_SMB_DIRECT + remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs); + remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs); + remove_proc_entry("smbd_keep_alive_interval", proc_fs_cifs); + remove_proc_entry("smbd_max_receive_size", proc_fs_cifs); + remove_proc_entry("smbd_max_fragmented_recv_size", proc_fs_cifs); + remove_proc_entry("smbd_max_send_size", proc_fs_cifs); + remove_proc_entry("smbd_send_credit_target", proc_fs_cifs); + remove_proc_entry("smbd_receive_credit_max", proc_fs_cifs); +#endif remove_proc_entry("fs/cifs", NULL); } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index b98436f5c7c7..13a8a77322c9 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1125,7 +1125,7 @@ out: return rc; } -/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ +/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 68abbb0db608..f2b0a7f124da 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -325,9 +325,8 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, { int i; int rc; - char password_with_pad[CIFS_ENCPWD_SIZE]; + char password_with_pad[CIFS_ENCPWD_SIZE] = {0}; - memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); if (password) strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 31b7565b1617..32cdea67bbfd 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -327,6 +327,8 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) default: seq_puts(s, "(unknown)"); } + if (server->rdma) + seq_puts(s, ",rdma"); } static void @@ -1068,6 +1070,7 @@ const struct file_operations cifs_file_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1086,6 +1089,7 @@ const struct file_operations cifs_file_strict_ops = { .flush = cifs_flush, .mmap = cifs_file_strict_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1105,6 +1109,7 @@ const struct file_operations cifs_file_direct_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, .clone_file_range = cifs_clone_file_range, @@ -1122,6 +1127,7 @@ const struct file_operations cifs_file_nobrl_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1139,6 +1145,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .flush = cifs_flush, .mmap = cifs_file_strict_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1157,6 +1164,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, .clone_file_range = cifs_clone_file_range, @@ -1231,9 +1239,11 @@ cifs_init_request_bufs(void) cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n", CIFSMaxBufSize, CIFSMaxBufSize); */ - cifs_req_cachep = kmem_cache_create("cifs_request", + cifs_req_cachep = kmem_cache_create_usercopy("cifs_request", CIFSMaxBufSize + max_hdr_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN, 0, + CIFSMaxBufSize + max_hdr_size, + NULL); if (cifs_req_cachep == NULL) return -ENOMEM; @@ -1259,9 +1269,9 @@ cifs_init_request_bufs(void) more SMBs to use small buffer alloc and is still much more efficient to alloc 1 per page off the slab compared to 17K (5page) alloc of large cifs buffers even when page debugging is on */ - cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", + cifs_sm_req_cachep = kmem_cache_create_usercopy("cifs_small_rq", MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN, - NULL); + 0, MAX_CIFS_SMALL_BUFFER_SIZE, NULL); if (cifs_sm_req_cachep == NULL) { mempool_destroy(cifs_req_poolp); kmem_cache_destroy(cifs_req_cachep); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 5a10e566f0e6..013ba2aed8d9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.10" +#define CIFS_VERSION "2.11" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b16583594d1a..48f7c197cd2d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -64,8 +64,8 @@ #define RFC1001_NAME_LEN 15 #define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1) -/* currently length of NIP6_FMT */ -#define SERVER_NAME_LENGTH 40 +/* maximum length of ip addr as a string (including ipv6 and sctp) */ +#define SERVER_NAME_LENGTH 80 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) /* echo interval in seconds */ @@ -230,8 +230,14 @@ struct smb_version_operations { __u64 (*get_next_mid)(struct TCP_Server_Info *); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); - /* data length from read response message */ - unsigned int (*read_data_length)(char *); + /* + * Data length from read response message + * When in_remaining is true, the returned data length is in + * message field DataRemaining for out-of-band data read (e.g through + * Memory Registration RDMA write in SMBD). + * Otherwise, the returned data length is in message field DataLength. + */ + unsigned int (*read_data_length)(char *, bool in_remaining); /* map smb to linux error */ int (*map_error)(char *, bool); /* find mid corresponding to the response message */ @@ -532,6 +538,7 @@ struct smb_vol { bool nopersistent:1; bool resilient:1; /* noresilient not required since not fored for CA */ bool domainauto:1; + bool rdma:1; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -648,6 +655,10 @@ struct TCP_Server_Info { bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ bool large_buf; /* is current buffer large? */ + /* use SMBD connection instead of socket */ + bool rdma; + /* point to the SMBD connection if RDMA is used instead of socket */ + struct smbd_connection *smbd_conn; struct delayed_work echo; /* echo ping workqueue job */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ @@ -822,12 +833,12 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) struct cifs_ses { struct list_head smb_ses_list; struct list_head tcon_list; + struct cifs_tcon *tcon_ipc; struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum statusEnum status; unsigned overrideSecFlg; /* if non-zero override global sec flags */ - __u32 ipc_tid; /* special tid for connection to IPC share */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ @@ -835,8 +846,7 @@ struct cifs_ses { kuid_t linux_uid; /* overriding owner of files on the mount */ kuid_t cred_uid; /* owner of credentials */ unsigned int capabilities; - char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for - TCP names - will ipv6 and sctp addresses fit? */ + char serverName[SERVER_NAME_LEN_WITH_NULL]; char *user_name; /* must not be null except during init of sess and after mount option parsing we fill it */ char *domainName; @@ -931,7 +941,9 @@ struct cifs_tcon { FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; - bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ + bool ipc:1; /* set if connection to IPC$ share (always also pipe) */ + bool pipe:1; /* set if connection to pipe share */ + bool print:1; /* set if connection to printer share */ bool retry:1; bool nocase:1; bool seal:1; /* transport encryption for this mounted share */ @@ -944,7 +956,6 @@ struct cifs_tcon { bool need_reopen_files:1; /* need to reopen tcon file handles */ bool use_resilient:1; /* use resilient instead of durable handles */ bool use_persistent:1; /* use persistent instead of durable handles */ - bool print:1; /* set if connection to printer share */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1147,6 +1158,9 @@ struct cifs_readdata { struct cifs_readdata *rdata, struct iov_iter *iter); struct kvec iov[2]; +#ifdef CONFIG_CIFS_SMB_DIRECT + struct smbd_mr *mr; +#endif unsigned int pagesz; unsigned int tailsz; unsigned int credits; @@ -1169,6 +1183,9 @@ struct cifs_writedata { pid_t pid; unsigned int bytes; int result; +#ifdef CONFIG_CIFS_SMB_DIRECT + struct smbd_mr *mr; +#endif unsigned int pagesz; unsigned int tailsz; unsigned int credits; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4143c9dec463..93d565186698 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -106,6 +106,10 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */, const int flags, struct kvec * /* resp vec */); +extern int smb2_send_recv(const unsigned int xid, struct cifs_ses *pses, + struct kvec *pkvec, int nvec_to_send, + int *pbuftype, const int flags, + struct kvec *presp); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, struct smb_hdr *in_buf , diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 35dc5bf01ee2..4e0922d24eb2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -43,6 +43,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "fscache.h" +#include "smbdirect.h" #ifdef CONFIG_CIFS_POSIX static struct { @@ -1454,6 +1455,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; char *buf = server->smallbuf; unsigned int buflen = get_rfc1002_length(buf) + 4; + bool use_rdma_mr = false; cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", __func__, mid->mid, rdata->offset, rdata->bytes); @@ -1542,8 +1544,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) rdata->iov[0].iov_base, server->total_read); /* how much data is in the response? */ - data_len = server->ops->read_data_length(buf); - if (data_offset + data_len > buflen) { +#ifdef CONFIG_CIFS_SMB_DIRECT + use_rdma_mr = rdata->mr; +#endif + data_len = server->ops->read_data_length(buf, use_rdma_mr); + if (!use_rdma_mr && (data_offset + data_len > buflen)) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; return cifs_readv_discard(server, mid); @@ -1923,6 +1928,12 @@ cifs_writedata_release(struct kref *refcount) { struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (wdata->mr) { + smbd_deregister_mr(wdata->mr); + wdata->mr = NULL; + } +#endif if (wdata->cfile) cifsFileInfo_put(wdata->cfile); @@ -4822,10 +4833,11 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, *target_nodes = NULL; cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name); - if (ses == NULL) + if (ses == NULL || ses->tcon_ipc == NULL) return -ENODEV; + getDFSRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, NULL, (void **) &pSMB, + rc = smb_init(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; @@ -4833,7 +4845,7 @@ getDFSRetry: /* server pointer checked in called function, but should never be null here anyway */ pSMB->hdr.Mid = get_next_mid(ses->server); - pSMB->hdr.Tid = ses->ipc_tid; + pSMB->hdr.Tid = ses->tcon_ipc->tid; pSMB->hdr.Uid = ses->Suid; if (ses->capabilities & CAP_STATUS32) pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0bfc2280436d..a726f524fb84 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -44,7 +44,6 @@ #include <net/ipv6.h> #include <linux/parser.h> #include <linux/bvec.h> - #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -56,6 +55,7 @@ #include "rfc1002pdu.h" #include "fscache.h" #include "smb2proto.h" +#include "smbdirect.h" #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -92,7 +92,7 @@ enum { Opt_multiuser, Opt_sloppy, Opt_nosharesock, Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, - Opt_domainauto, + Opt_domainauto, Opt_rdma, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -183,6 +183,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_resilient, "resilienthandles"}, { Opt_noresilient, "noresilienthandles"}, { Opt_domainauto, "domainauto"}, + { Opt_rdma, "rdma"}, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -353,11 +354,12 @@ cifs_reconnect(struct TCP_Server_Info *server) list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); ses->need_reconnect = true; - ses->ipc_tid = 0; list_for_each(tmp2, &ses->tcon_list) { tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); tcon->need_reconnect = true; } + if (ses->tcon_ipc) + ses->tcon_ipc->need_reconnect = true; } spin_unlock(&cifs_tcp_ses_lock); @@ -405,7 +407,10 @@ cifs_reconnect(struct TCP_Server_Info *server) /* we should try only the port we connected to before */ mutex_lock(&server->srv_mutex); - rc = generic_ip_connect(server); + if (cifs_rdma_enabled(server)) + rc = smbd_reconnect(server); + else + rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); mutex_unlock(&server->srv_mutex); @@ -538,8 +543,10 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) if (server_unresponsive(server)) return -ECONNABORTED; - - length = sock_recvmsg(server->ssocket, smb_msg, 0); + if (cifs_rdma_enabled(server) && server->smbd_conn) + length = smbd_recv(server->smbd_conn, smb_msg); + else + length = sock_recvmsg(server->ssocket, smb_msg, 0); if (server->tcpStatus == CifsExiting) return -ESHUTDOWN; @@ -700,7 +707,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) wake_up_all(&server->request_q); /* give those requests time to exit */ msleep(125); - + if (cifs_rdma_enabled(server) && server->smbd_conn) { + smbd_destroy(server->smbd_conn); + server->smbd_conn = NULL; + } if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; @@ -1550,6 +1560,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_domainauto: vol->domainauto = true; break; + case Opt_rdma: + vol->rdma = true; + break; /* Numeric Values */ case Opt_backupuid: @@ -1707,7 +1720,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, tmp_end++; if (!(tmp_end < end && tmp_end[1] == delim)) { /* No it is not. Set the password to NULL */ - kfree(vol->password); + kzfree(vol->password); vol->password = NULL; break; } @@ -1745,7 +1758,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, options = end; } - kfree(vol->password); + kzfree(vol->password); /* Now build new password string */ temp_len = strlen(value); vol->password = kzalloc(temp_len+1, GFP_KERNEL); @@ -1951,6 +1964,19 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } + if (vol->rdma && vol->vals->protocol_id < SMB30_PROT_ID) { + cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n"); + goto cifs_parse_mount_err; + } + +#ifdef CONFIG_CIFS_SMB_DIRECT + if (vol->rdma && vol->sign) { + cifs_dbg(VFS, "Currently SMB direct doesn't support signing." + " This is being fixed\n"); + goto cifs_parse_mount_err; + } +#endif + #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (vol->multiuser) { @@ -2162,6 +2188,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (server->echo_interval != vol->echo_interval * HZ) return 0; + if (server->rdma != vol->rdma) + return 0; + return 1; } @@ -2260,6 +2289,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->noblocksnd = volume_info->noblocksnd; tcp_ses->noautotune = volume_info->noautotune; tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; + tcp_ses->rdma = volume_info->rdma; tcp_ses->in_flight = 0; tcp_ses->credits = 1; init_waitqueue_head(&tcp_ses->response_q); @@ -2297,13 +2327,29 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->echo_interval = volume_info->echo_interval * HZ; else tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; - + if (tcp_ses->rdma) { +#ifndef CONFIG_CIFS_SMB_DIRECT + cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n"); + rc = -ENOENT; + goto out_err_crypto_release; +#endif + tcp_ses->smbd_conn = smbd_get_connection( + tcp_ses, (struct sockaddr *)&volume_info->dstaddr); + if (tcp_ses->smbd_conn) { + cifs_dbg(VFS, "RDMA transport established\n"); + rc = 0; + goto smbd_connected; + } else { + rc = -ENOENT; + goto out_err_crypto_release; + } + } rc = ip_connect(tcp_ses); if (rc < 0) { cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n"); goto out_err_crypto_release; } - +smbd_connected: /* * since we're in a cifs function already, we know that * this will succeed. No need for try_module_get(). @@ -2381,6 +2427,93 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) return 1; } +/** + * cifs_setup_ipc - helper to setup the IPC tcon for the session + * + * A new IPC connection is made and stored in the session + * tcon_ipc. The IPC tcon has the same lifetime as the session. + */ +static int +cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info) +{ + int rc = 0, xid; + struct cifs_tcon *tcon; + struct nls_table *nls_codepage; + char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; + bool seal = false; + + /* + * If the mount request that resulted in the creation of the + * session requires encryption, force IPC to be encrypted too. + */ + if (volume_info->seal) { + if (ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) + seal = true; + else { + cifs_dbg(VFS, + "IPC: server doesn't support encryption\n"); + return -EOPNOTSUPP; + } + } + + tcon = tconInfoAlloc(); + if (tcon == NULL) + return -ENOMEM; + + snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->serverName); + + /* cannot fail */ + nls_codepage = load_nls_default(); + + xid = get_xid(); + tcon->ses = ses; + tcon->ipc = true; + tcon->seal = seal; + rc = ses->server->ops->tree_connect(xid, ses, unc, tcon, nls_codepage); + free_xid(xid); + + if (rc) { + cifs_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc); + tconInfoFree(tcon); + goto out; + } + + cifs_dbg(FYI, "IPC tcon rc = %d ipc tid = %d\n", rc, tcon->tid); + + ses->tcon_ipc = tcon; +out: + unload_nls(nls_codepage); + return rc; +} + +/** + * cifs_free_ipc - helper to release the session IPC tcon + * + * Needs to be called everytime a session is destroyed + */ +static int +cifs_free_ipc(struct cifs_ses *ses) +{ + int rc = 0, xid; + struct cifs_tcon *tcon = ses->tcon_ipc; + + if (tcon == NULL) + return 0; + + if (ses->server->ops->tree_disconnect) { + xid = get_xid(); + rc = ses->server->ops->tree_disconnect(xid, tcon); + free_xid(xid); + } + + if (rc) + cifs_dbg(FYI, "failed to disconnect IPC tcon (rc=%d)\n", rc); + + tconInfoFree(tcon); + ses->tcon_ipc = NULL; + return rc; +} + static struct cifs_ses * cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) { @@ -2421,6 +2554,8 @@ cifs_put_smb_ses(struct cifs_ses *ses) ses->status = CifsExiting; spin_unlock(&cifs_tcp_ses_lock); + cifs_free_ipc(ses); + if (ses->status == CifsExiting && server->ops->logoff) { xid = get_xid(); rc = server->ops->logoff(xid, ses); @@ -2569,6 +2704,13 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), } #endif /* CONFIG_KEYS */ +/** + * cifs_get_smb_ses - get a session matching @volume_info data from @server + * + * This function assumes it is being called from cifs_mount() where we + * already got a server reference (server refcount +1). See + * cifs_get_tcon() for refcount explanations. + */ static struct cifs_ses * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) { @@ -2665,6 +2807,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) spin_unlock(&cifs_tcp_ses_lock); free_xid(xid); + + cifs_setup_ipc(ses, volume_info); + return ses; get_ses_fail: @@ -2709,8 +2854,16 @@ void cifs_put_tcon(struct cifs_tcon *tcon) { unsigned int xid; - struct cifs_ses *ses = tcon->ses; + struct cifs_ses *ses; + + /* + * IPC tcon share the lifetime of their session and are + * destroyed in the session put function + */ + if (tcon == NULL || tcon->ipc) + return; + ses = tcon->ses; cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); spin_lock(&cifs_tcp_ses_lock); if (--tcon->tc_count > 0) { @@ -2731,6 +2884,26 @@ cifs_put_tcon(struct cifs_tcon *tcon) cifs_put_smb_ses(ses); } +/** + * cifs_get_tcon - get a tcon matching @volume_info data from @ses + * + * - tcon refcount is the number of mount points using the tcon. + * - ses refcount is the number of tcon using the session. + * + * 1. This function assumes it is being called from cifs_mount() where + * we already got a session reference (ses refcount +1). + * + * 2. Since we're in the context of adding a mount point, the end + * result should be either: + * + * a) a new tcon already allocated with refcount=1 (1 mount point) and + * its session refcount incremented (1 new tcon). This +1 was + * already done in (1). + * + * b) an existing tcon with refcount+1 (add a mount point to it) and + * identical ses refcount (no new tcon). Because of (1) we need to + * decrement the ses refcount. + */ static struct cifs_tcon * cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) { @@ -2739,8 +2912,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon = cifs_find_tcon(ses, volume_info); if (tcon) { + /* + * tcon has refcount already incremented but we need to + * decrement extra ses reference gotten by caller (case b) + */ cifs_dbg(FYI, "Found match on UNC path\n"); - /* existing tcon already has a reference */ cifs_put_smb_ses(ses); return tcon; } @@ -2986,39 +3162,17 @@ get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path, const struct nls_table *nls_codepage, unsigned int *num_referrals, struct dfs_info3_param **referrals, int remap) { - char *temp_unc; int rc = 0; - if (!ses->server->ops->tree_connect || !ses->server->ops->get_dfs_refer) + if (!ses->server->ops->get_dfs_refer) return -ENOSYS; *num_referrals = 0; *referrals = NULL; - if (ses->ipc_tid == 0) { - temp_unc = kmalloc(2 /* for slashes */ + - strnlen(ses->serverName, SERVER_NAME_LEN_WITH_NULL * 2) - + 1 + 4 /* slash IPC$ */ + 2, GFP_KERNEL); - if (temp_unc == NULL) - return -ENOMEM; - temp_unc[0] = '\\'; - temp_unc[1] = '\\'; - strcpy(temp_unc + 2, ses->serverName); - strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$"); - rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL, - nls_codepage); - cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid); - kfree(temp_unc); - } - if (rc == 0) - rc = ses->server->ops->get_dfs_refer(xid, ses, old_path, - referrals, num_referrals, - nls_codepage, remap); - /* - * BB - map targetUNCs to dfs_info3 structures, here or in - * ses->server->ops->get_dfs_refer. - */ - + rc = ses->server->ops->get_dfs_refer(xid, ses, old_path, + referrals, num_referrals, + nls_codepage, remap); return rc; } @@ -3783,7 +3937,7 @@ try_mount_again: tcon->unix_ext = 0; /* server does not support them */ /* do not care if a following call succeed - informational */ - if (!tcon->ipc && server->ops->qfs_tcon) + if (!tcon->pipe && server->ops->qfs_tcon) server->ops->qfs_tcon(xid, tcon); cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); @@ -3913,8 +4067,7 @@ out: } /* - * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon - * pointer may be NULL. + * Issue a TREE_CONNECT request. */ int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, @@ -3950,7 +4103,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { + if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) { pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ *bcc_ptr = 0; /* password is null byte */ bcc_ptr++; /* skip password */ @@ -4022,7 +4175,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, 0); /* above now done in SendReceive */ - if ((rc == 0) && (tcon != NULL)) { + if (rc == 0) { bool is_unicode; tcon->tidStatus = CifsGood; @@ -4042,7 +4195,8 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && (bcc_ptr[2] == 'C')) { cifs_dbg(FYI, "IPC connection\n"); - tcon->ipc = 1; + tcon->ipc = true; + tcon->pipe = true; } } else if (length == 2) { if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { @@ -4069,9 +4223,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, else tcon->Flags = 0; cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); - } else if ((rc == 0) && tcon == NULL) { - /* all we need to save for IPC$ connection */ - ses->ipc_tid = smb_buffer_response->Tid; } cifs_buf_release(smb_buffer); @@ -4235,7 +4386,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) reset_cifs_unix_caps(0, tcon, NULL, vol_info); out: kfree(vol_info->username); - kfree(vol_info->password); + kzfree(vol_info->password); kfree(vol_info); return tcon; @@ -4387,7 +4538,7 @@ cifs_prune_tlinks(struct work_struct *work) struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, prune_tlinks.work); struct rb_root *root = &cifs_sb->tlink_tree; - struct rb_node *node = rb_first(root); + struct rb_node *node; struct rb_node *tmp; struct tcon_link *tlink; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index df9f682708c6..7cee97b93a61 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -42,7 +42,7 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "fscache.h" - +#include "smbdirect.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -2902,7 +2902,12 @@ cifs_readdata_release(struct kref *refcount) { struct cifs_readdata *rdata = container_of(refcount, struct cifs_readdata, refcount); - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) { + smbd_deregister_mr(rdata->mr); + rdata->mr = NULL; + } +#endif if (rdata->cfile) cifsFileInfo_put(rdata->cfile); @@ -3031,6 +3036,10 @@ uncached_fill_pages(struct TCP_Server_Info *server, } if (iter) result = copy_page_from_iter(page, 0, n, iter); +#ifdef CONFIG_CIFS_SMB_DIRECT + else if (rdata->mr) + result = n; +#endif else result = cifs_read_page_from_socket(server, page, n); if (result < 0) @@ -3471,20 +3480,18 @@ static const struct vm_operations_struct cifs_file_vm_ops = { int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) { - int rc, xid; + int xid, rc = 0; struct inode *inode = file_inode(file); xid = get_xid(); - if (!CIFS_CACHE_READ(CIFS_I(inode))) { + if (!CIFS_CACHE_READ(CIFS_I(inode))) rc = cifs_zap_mapping(inode); - if (rc) - return rc; - } - - rc = generic_file_mmap(file, vma); - if (rc == 0) + if (!rc) + rc = generic_file_mmap(file, vma); + if (!rc) vma->vm_ops = &cifs_file_vm_ops; + free_xid(xid); return rc; } @@ -3494,16 +3501,16 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) int rc, xid; xid = get_xid(); + rc = cifs_revalidate_file(file); - if (rc) { + if (rc) cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n", rc); - free_xid(xid); - return rc; - } - rc = generic_file_mmap(file, vma); - if (rc == 0) + if (!rc) + rc = generic_file_mmap(file, vma); + if (!rc) vma->vm_ops = &cifs_file_vm_ops; + free_xid(xid); return rc; } @@ -3600,6 +3607,10 @@ readpages_fill_pages(struct TCP_Server_Info *server, if (iter) result = copy_page_from_iter(page, 0, n, iter); +#ifdef CONFIG_CIFS_SMB_DIRECT + else if (rdata->mr) + result = n; +#endif else result = cifs_read_page_from_socket(server, page, n); if (result < 0) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ecb99079363a..8f9a8cc7cc62 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1049,7 +1049,7 @@ iget_no_retry: tcon->resource_id = CIFS_I(inode)->uniqueid; #endif - if (rc && tcon->ipc) { + if (rc && tcon->pipe) { cifs_dbg(FYI, "ipc connection - fake read inode\n"); spin_lock(&inode->i_lock); inode->i_mode |= S_IFDIR; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index eea93ac15ef0..a0dbced4a45c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -98,14 +98,11 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->serverOS); kfree(buf_to_free->serverDomain); kfree(buf_to_free->serverNOS); - if (buf_to_free->password) { - memset(buf_to_free->password, 0, strlen(buf_to_free->password)); - kfree(buf_to_free->password); - } + kzfree(buf_to_free->password); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); - kfree(buf_to_free->auth_key.response); - kfree(buf_to_free); + kzfree(buf_to_free->auth_key.response); + kzfree(buf_to_free); } struct cifs_tcon * @@ -136,10 +133,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free) } atomic_dec(&tconInfoAllocCount); kfree(buf_to_free->nativeFileSystem); - if (buf_to_free->password) { - memset(buf_to_free->password, 0, strlen(buf_to_free->password)); - kfree(buf_to_free->password); - } + kzfree(buf_to_free->password); kfree(buf_to_free); } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index a723df3e0197..3d495e440c87 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -87,9 +87,11 @@ cifs_read_data_offset(char *buf) } static unsigned int -cifs_read_data_length(char *buf) +cifs_read_data_length(char *buf, bool in_remaining) { READ_RSP *rsp = (READ_RSP *)buf; + /* It's a bug reading remaining data for SMB1 packets */ + WARN_ON(in_remaining); return (le16_to_cpu(rsp->DataLengthHigh) << 16) + le16_to_cpu(rsp->DataLength); } diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index b4b1f0305f29..12af5dba742b 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -74,7 +74,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, nr_ioctl_req.Reserved = 0; rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), NULL, NULL /* no return info */); if (rc == -EOPNOTSUPP) { diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7b08a1446a7f..76d03abaa38c 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -578,7 +578,7 @@ smb2_is_valid_lease_break(char *buffer) bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { - struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; + struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer; struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; struct cifs_tcon *tcon; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index ed88ab8a4774..eb68e2fcc500 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -32,6 +32,7 @@ #include "smb2status.h" #include "smb2glob.h" #include "cifs_ioctl.h" +#include "smbdirect.h" static int change_conf(struct TCP_Server_Info *server) @@ -250,7 +251,11 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_readwrite_size); +#endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); @@ -266,6 +271,11 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_readwrite_size); +#endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); @@ -283,7 +293,6 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, - false /* use_ipc */, NULL /* no data input */, 0 /* no data input */, (char **)&out_buf, &ret_data_len); if (rc != 0) @@ -782,7 +791,6 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, - false /* use_ipc */, NULL, 0 /* no input */, (char **)&res_key, &ret_data_len); @@ -848,8 +856,7 @@ smb2_copychunk_range(const unsigned int xid, /* Request server copy to target from src identified by key */ rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, - true /* is_fsctl */, false /* use_ipc */, - (char *)pcchunk, + true /* is_fsctl */, (char *)pcchunk, sizeof(struct copychunk_ioctl), (char **)&retbuf, &ret_data_len); if (rc == 0) { @@ -947,9 +954,13 @@ smb2_read_data_offset(char *buf) } static unsigned int -smb2_read_data_length(char *buf) +smb2_read_data_length(char *buf, bool in_remaining) { struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf; + + if (in_remaining) + return le32_to_cpu(rsp->DataRemaining); + return le32_to_cpu(rsp->DataLength); } @@ -1006,7 +1017,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_SPARSE, - true /* is_fctl */, false /* use_ipc */, + true /* is_fctl */, &setsparse, 1, NULL, NULL); if (rc) { tcon->broken_sparse_sup = true; @@ -1077,7 +1088,7 @@ smb2_duplicate_extents(const unsigned int xid, rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), NULL, @@ -1112,7 +1123,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_INTEGRITY_INFORMATION, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), NULL, @@ -1132,7 +1143,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SRV_ENUMERATE_SNAPSHOTS, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, NULL, 0 /* no input data */, (char **)&retbuf, &ret_data_len); @@ -1351,16 +1362,20 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "smb2_get_dfs_refer path <%s>\n", search_name); /* - * Use any tcon from the current session. Here, the first one. + * Try to use the IPC tcon, otherwise just use any */ - spin_lock(&cifs_tcp_ses_lock); - tcon = list_first_entry_or_null(&ses->tcon_list, struct cifs_tcon, - tcon_list); - if (tcon) - tcon->tc_count++; - spin_unlock(&cifs_tcp_ses_lock); + tcon = ses->tcon_ipc; + if (tcon == NULL) { + spin_lock(&cifs_tcp_ses_lock); + tcon = list_first_entry_or_null(&ses->tcon_list, + struct cifs_tcon, + tcon_list); + if (tcon) + tcon->tc_count++; + spin_unlock(&cifs_tcp_ses_lock); + } - if (!tcon) { + if (tcon == NULL) { cifs_dbg(VFS, "session %p has no tcon available for a dfs referral request\n", ses); rc = -ENOTCONN; @@ -1389,20 +1404,11 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, memcpy(dfs_req->RequestFileName, utf16_path, utf16_path_len); do { - /* try first with IPC */ rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_DFS_GET_REFERRALS, - true /* is_fsctl */, true /* use_ipc */, + true /* is_fsctl */, (char *)dfs_req, dfs_req_size, (char **)&dfs_rsp, &dfs_rsp_size); - if (rc == -ENOTCONN) { - /* try with normal tcon */ - rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, - FSCTL_DFS_GET_REFERRALS, - true /* is_fsctl */, false /*use_ipc*/, - (char *)dfs_req, dfs_req_size, - (char **)&dfs_rsp, &dfs_rsp_size); - } } while (rc == -EAGAIN); if (rc) { @@ -1421,7 +1427,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } out: - if (tcon) { + if (tcon && !tcon->ipc) { + /* ipc tcons are not refcounted */ spin_lock(&cifs_tcp_ses_lock); tcon->tc_count--; spin_unlock(&cifs_tcp_ses_lock); @@ -1713,8 +1720,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, false /* use_ipc */, - (char *)&fsctl_buf, + true /* is_fctl */, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), NULL, NULL); free_xid(xid); return rc; @@ -1748,8 +1754,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, false /* use_ipc */, - (char *)&fsctl_buf, + true /* is_fctl */, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), NULL, NULL); free_xid(xid); return rc; @@ -2411,6 +2416,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, struct iov_iter iter; struct kvec iov; int length; + bool use_rdma_mr = false; if (shdr->Command != SMB2_READ) { cifs_dbg(VFS, "only big read responses are supported\n"); @@ -2437,7 +2443,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } data_offset = server->ops->read_data_offset(buf) + 4; - data_len = server->ops->read_data_length(buf); +#ifdef CONFIG_CIFS_SMB_DIRECT + use_rdma_mr = rdata->mr; +#endif + data_len = server->ops->read_data_length(buf, use_rdma_mr); if (data_offset < server->vals->read_rsp_size) { /* diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 01346b8b6edb..63778ac22fd9 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -48,6 +48,7 @@ #include "smb2glob.h" #include "cifspdu.h" #include "cifs_spnego.h" +#include "smbdirect.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -319,54 +320,16 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, *total_len = parmsize + sizeof(struct smb2_sync_hdr); } -/* init request without RFC1001 length at the beginning */ -static int -smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, - void **request_buf, unsigned int *total_len) -{ - int rc; - struct smb2_sync_hdr *shdr; - - rc = smb2_reconnect(smb2_command, tcon); - if (rc) - return rc; - - /* BB eventually switch this to SMB2 specific small buf size */ - *request_buf = cifs_small_buf_get(); - if (*request_buf == NULL) { - /* BB should we add a retry in here if not a writepage? */ - return -ENOMEM; - } - - shdr = (struct smb2_sync_hdr *)(*request_buf); - - fill_small_buf(smb2_command, tcon, shdr, total_len); - - if (tcon != NULL) { -#ifdef CONFIG_CIFS_STATS2 - uint16_t com_code = le16_to_cpu(smb2_command); - - cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]); -#endif - cifs_stats_inc(&tcon->num_smbs_sent); - } - - return rc; -} - /* * Allocate and return pointer to an SMB request hdr, and set basic * SMB information in the SMB header. If the return code is zero, this - * function must have filled in request_buf pointer. The returned buffer - * has RFC1001 length at the beginning. + * function must have filled in request_buf pointer. */ static int -small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, - void **request_buf) +smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) { int rc; - unsigned int total_len; - struct smb2_pdu *pdu; rc = smb2_reconnect(smb2_command, tcon); if (rc) @@ -379,12 +342,9 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return -ENOMEM; } - pdu = (struct smb2_pdu *)(*request_buf); - - fill_small_buf(smb2_command, tcon, get_sync_hdr(pdu), &total_len); - - /* Note this is only network field converted to big endian */ - pdu->hdr.smb2_buf_length = cpu_to_be32(total_len); + fill_small_buf(smb2_command, tcon, + (struct smb2_sync_hdr *)(*request_buf), + total_len); if (tcon != NULL) { #ifdef CONFIG_CIFS_STATS2 @@ -398,8 +358,8 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, } #ifdef CONFIG_CIFS_SMB311 -/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */ -#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */ +/* offset is sizeof smb2_negotiate_req but rounded up to 8 bytes */ +#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) */ #define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) @@ -427,23 +387,25 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) } static void -assemble_neg_contexts(struct smb2_negotiate_req *req) +assemble_neg_contexts(struct smb2_negotiate_req *req, + unsigned int *total_len) { - - /* +4 is to account for the RFC1001 len field */ - char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4; + char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT; build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); /* Add 2 to size to round to 8 byte boundary */ + pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); req->NegotiateContextCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) - + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ + + *total_len += 4 + sizeof(struct smb2_preauth_neg_context) + + sizeof(struct smb2_encryption_neg_context); } #else -static void assemble_neg_contexts(struct smb2_negotiate_req *req) +static void assemble_neg_contexts(struct smb2_negotiate_req *req, + unsigned int *total_len) { return; } @@ -477,6 +439,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) int blob_offset, blob_length; char *security_blob; int flags = CIFS_NEG_OP; + unsigned int total_len; cifs_dbg(FYI, "Negotiate protocol\n"); @@ -485,30 +448,30 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) return -EIO; } - rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req); + rc = smb2_plain_req_init(SMB2_NEGOTIATE, NULL, (void **) &req, &total_len); if (rc) return rc; - req->hdr.sync_hdr.SessionId = 0; + req->sync_hdr.SessionId = 0; if (strcmp(ses->server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); req->DialectCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4); + total_len += 4; } else if (strcmp(ses->server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); req->DialectCount = cpu_to_le16(3); - inc_rfc1001_len(req, 6); + total_len += 6; } else { /* otherwise send specific dialect */ req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); req->DialectCount = cpu_to_le16(1); - inc_rfc1001_len(req, 2); + total_len += 2; } /* only one of SMB2 signing flags may be set in SMB2 request */ @@ -528,13 +491,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); if (ses->server->vals->protocol_id == SMB311_PROT_ID) - assemble_neg_contexts(req); + assemble_neg_contexts(req, &total_len); } iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base; /* @@ -654,6 +616,11 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) cifs_dbg(FYI, "validate negotiate\n"); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (tcon->ses->server->rdma) + return 0; +#endif + /* * validation ioctl must be signed, so no point sending this if we * can not sign it (ie are not known user). Even if signing is not @@ -713,7 +680,6 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - false /* use_ipc */, (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), (char **)&pneg_rsp, &rsplen); @@ -733,8 +699,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } /* check validate negotiate info response matches what we got earlier */ - if (pneg_rsp->Dialect != - cpu_to_le16(tcon->ses->server->vals->protocol_id)) + if (pneg_rsp->Dialect != cpu_to_le16(tcon->ses->server->dialect)) goto vneg_out; if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode)) @@ -806,20 +771,22 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; struct TCP_Server_Info *server = ses->server; + unsigned int total_len; - rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req); + rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req, + &total_len); if (rc) return rc; /* First session, not a reauthenticate */ - req->hdr.sync_hdr.SessionId = 0; + req->sync_hdr.SessionId = 0; /* if reconnect, we need to send previous sess id, otherwise it is 0 */ req->PreviousSessionId = sess_data->previous_session; req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(3); + req->sync_hdr.CreditRequest = cpu_to_le16(3); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -833,8 +800,8 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) req->Channel = 0; /* MBZ */ sess_data->iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and 1 for pad */ - sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for pad */ + sess_data->iov[0].iov_len = total_len - 1; /* * This variable will be used to clear the buffer * allocated above in case of any error in the calling function. @@ -860,18 +827,15 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - - 1 /* pad */ - 4 /* rfc1001 len */); + cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */); - /* BB add code to build os and lm fields */ - rc = SendReceive2(sess_data->xid, sess_data->ses, - sess_data->iov, 2, - &sess_data->buf0_type, - CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); + rc = smb2_send_recv(sess_data->xid, sess_data->ses, + sess_data->iov, 2, + &sess_data->buf0_type, + CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); @@ -1092,7 +1056,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) goto out; req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; - req->hdr.sync_hdr.SessionId = ses->Suid; + req->sync_hdr.SessionId = ses->Suid; rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, sess_data->nls_cp); @@ -1202,6 +1166,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) int rc = 0; struct TCP_Server_Info *server; int flags = 0; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "disconnect session %p\n", ses); @@ -1214,19 +1182,24 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) if (ses->need_reconnect) goto smb2_session_already_dead; - rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req); + rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, (void **) &req, &total_len); if (rc) return rc; /* since no tcon, smb2_init can not do this, so do here */ - req->hdr.sync_hdr.SessionId = ses->Suid; + req->sync_hdr.SessionId = ses->Suid; if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) flags |= CIFS_TRANSFORM_REQ; else if (server->sign) - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; - rc = SendReceiveNoRsp(xid, ses, (char *) req, flags); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); /* * No tcon so can't do @@ -1265,6 +1238,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, int unc_path_len; __le16 *unc_path = NULL; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "TCON\n"); @@ -1283,40 +1257,30 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, } /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ - if (tcon) - tcon->tid = 0; + tcon->tid = 0; - rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req, + &total_len); if (rc) { kfree(unc_path); return rc; } - if (tcon == NULL) { - if ((ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)) - flags |= CIFS_TRANSFORM_REQ; - - /* since no tcon, smb2_init can not do this, so do here */ - req->hdr.sync_hdr.SessionId = ses->Suid; - if (ses->server->sign) - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - } else if (encryption_required(tcon)) + if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and 1 for pad */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for pad */ + iov[0].iov_len = total_len - 1; /* Testing shows that buffer offset must be at location of Buffer[0] */ req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req) - - 1 /* pad */ - 4 /* do not count rfc1001 len field */); + - 1 /* pad */); req->PathLength = cpu_to_le16(unc_path_len - 2); iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; - inc_rfc1001_len(req, unc_path_len - 1 /* pad */); - - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1328,21 +1292,16 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, goto tcon_error_exit; } - if (tcon == NULL) { - ses->ipc_tid = rsp->hdr.sync_hdr.TreeId; - goto tcon_exit; - } - switch (rsp->ShareType) { case SMB2_SHARE_TYPE_DISK: cifs_dbg(FYI, "connection to disk share\n"); break; case SMB2_SHARE_TYPE_PIPE: - tcon->ipc = true; + tcon->pipe = true; cifs_dbg(FYI, "connection to pipe share\n"); break; case SMB2_SHARE_TYPE_PRINT: - tcon->ipc = true; + tcon->print = true; cifs_dbg(FYI, "connection to printer\n"); break; default: @@ -1389,6 +1348,10 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) int rc = 0; struct cifs_ses *ses = tcon->ses; int flags = 0; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "Tree Disconnect\n"); @@ -1398,14 +1361,20 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - rc = small_smb2_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, + &total_len); if (rc) return rc; if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = SendReceiveNoRsp(xid, ses, (char *)req, flags); + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; + + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); @@ -1505,11 +1474,10 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; if (!req->CreateContextsOffset) req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) - 4 + + sizeof(struct smb2_create_req) + iov[num - 1].iov_len); le32_add_cpu(&req->CreateContextsLength, server->vals->create_lease_size); - inc_rfc1001_len(&req->hdr, server->vals->create_lease_size); *num_iovec = num + 1; return 0; } @@ -1589,10 +1557,9 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, iov[num].iov_len = sizeof(struct create_durable_v2); if (!req->CreateContextsOffset) req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + cpu_to_le32(sizeof(struct smb2_create_req) + iov[1].iov_len); le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2)); - inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2)); *num_iovec = num + 1; return 0; } @@ -1613,12 +1580,10 @@ add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec, iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2); if (!req->CreateContextsOffset) req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + cpu_to_le32(sizeof(struct smb2_create_req) + iov[1].iov_len); le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_handle_reconnect_v2)); - inc_rfc1001_len(&req->hdr, - sizeof(struct create_durable_handle_reconnect_v2)); *num_iovec = num + 1; return 0; } @@ -1649,10 +1614,9 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, iov[num].iov_len = sizeof(struct create_durable); if (!req->CreateContextsOffset) req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + cpu_to_le32(sizeof(struct smb2_create_req) + iov[1].iov_len); le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable)); - inc_rfc1001_len(&req->hdr, sizeof(struct create_durable)); *num_iovec = num + 1; return 0; } @@ -1723,6 +1687,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u32 file_attributes = 0; char *dhc_buf = NULL, *lc_buf = NULL; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "create/open\n"); @@ -1731,7 +1696,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, else return -EIO; - rc = small_smb2_init(SMB2_CREATE, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len); + if (rc) return rc; @@ -1752,12 +1718,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; /* -1 since last byte is buf[0] which is sent below (path) */ - iov[0].iov_len--; + iov[0].iov_len = total_len - 1; - req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4); + req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); /* [MS-SMB2] 2.2.13 NameOffset: * If SMB2_FLAGS_DFS_OPERATIONS is set in the Flags field of @@ -1770,7 +1734,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, path); @@ -1797,8 +1761,6 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, iov[1].iov_len = uni_path_len; iov[1].iov_base = path; - /* -1 since last byte is buf[0] which was counted in smb2_buf_len */ - inc_rfc1001_len(req, uni_path_len - 1); if (!server->oplocks) *oplock = SMB2_OPLOCK_LEVEL_NONE; @@ -1836,7 +1798,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, dhc_buf = iov[n_iov-1].iov_base; } - rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -1877,7 +1840,7 @@ creat_exit: */ int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, bool use_ipc, + u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */) { @@ -1891,6 +1854,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int n_iov; int rc = 0; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "SMB2 IOCTL\n"); @@ -1909,20 +1873,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); if (rc) return rc; - if (use_ipc) { - if (ses->ipc_tid == 0) { - cifs_small_buf_release(req); - return -ENOTCONN; - } - - cifs_dbg(FYI, "replacing tid 0x%x with IPC tid 0x%x\n", - req->hdr.sync_hdr.TreeId, ses->ipc_tid); - req->hdr.sync_hdr.TreeId = ses->ipc_tid; - } if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1934,7 +1888,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, req->InputCount = cpu_to_le32(indatalen); /* do not set InputOffset if no input data */ req->InputOffset = - cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4); + cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer)); iov[1].iov_base = in_data; iov[1].iov_len = indatalen; n_iov = 2; @@ -1969,21 +1923,20 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, * but if input data passed to ioctl, we do not * want to double count this, so we do not send * the dummy one byte of data in iovec[0] if sending - * input data (in iovec[1]). We also must add 4 bytes - * in first iovec to allow for rfc1002 length field. + * input data (in iovec[1]). */ if (indatalen) { - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; - inc_rfc1001_len(req, indatalen - 1); + iov[0].iov_len = total_len - 1; } else - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; @@ -2052,7 +2005,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, - false /* use_ipc */, (char *)&fsctl_input /* data input */, 2 /* in data len */, &ret_data /* out data */, NULL); @@ -2073,13 +2025,14 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype; int rc = 0; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "Close\n"); if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_CLOSE, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_CLOSE, tcon, (void **) &req, &total_len); if (rc) return rc; @@ -2090,10 +2043,9 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFileId = volatile_fid; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -2180,13 +2132,15 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype; struct cifs_ses *ses = tcon->ses; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "Query Info\n"); if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -2203,15 +2157,14 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, * We do not use the input buffer (do not send extra byte) */ req->InputBufferOffset = 0; - inc_rfc1001_len(req, -1); req->OutputBufferLength = cpu_to_le32(output_len); iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -2338,6 +2291,10 @@ void smb2_reconnect_server(struct work_struct *work) tcon_exist = true; } } + if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { + list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); + tcon_exist = true; + } } /* * Get the reference to server struct to be sure that the last call of @@ -2376,6 +2333,8 @@ SMB2_echo(struct TCP_Server_Info *server) struct kvec iov[2]; struct smb_rqst rqst = { .rq_iov = iov, .rq_nvec = 2 }; + unsigned int total_len; + __be32 rfc1002_marker; cifs_dbg(FYI, "In echo request\n"); @@ -2385,17 +2344,17 @@ SMB2_echo(struct TCP_Server_Info *server) return rc; } - rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); + rc = smb2_plain_req_init(SMB2_ECHO, NULL, (void **)&req, &total_len); if (rc) return rc; - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); + req->sync_hdr.CreditRequest = cpu_to_le16(1); - /* 4 for rfc1002 length field */ iov[0].iov_len = 4; - iov[0].iov_base = (char *)req; - iov[1].iov_len = get_rfc1002_length(req); - iov[1].iov_base = (char *)req + 4; + rfc1002_marker = cpu_to_be32(total_len); + iov[0].iov_base = &rfc1002_marker; + iov[1].iov_len = total_len; + iov[1].iov_base = (char *)req; rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, server, CIFS_ECHO_OP); @@ -2417,13 +2376,14 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype; int rc = 0; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "Flush\n"); if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_FLUSH, tcon, (void **) &req, &total_len); if (rc) return rc; @@ -2434,10 +2394,9 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, req->VolatileFileId = volatile_fid; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); if (rc != 0) @@ -2453,18 +2412,21 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, */ static int smb2_new_read_req(void **buf, unsigned int *total_len, - struct cifs_io_parms *io_parms, unsigned int remaining_bytes, - int request_type) + struct cifs_io_parms *io_parms, struct cifs_readdata *rdata, + unsigned int remaining_bytes, int request_type) { int rc = -EACCES; struct smb2_read_plain_req *req = NULL; struct smb2_sync_hdr *shdr; + struct TCP_Server_Info *server; rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) &req, total_len); if (rc) return rc; - if (io_parms->tcon->ses->server == NULL) + + server = io_parms->tcon->ses->server; + if (server == NULL) return -ECONNABORTED; shdr = &req->sync_hdr; @@ -2478,7 +2440,40 @@ smb2_new_read_req(void **buf, unsigned int *total_len, req->MinimumCount = 0; req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If we want to do a RDMA write, fill in and append + * smbd_buffer_descriptor_v1 to the end of read request + */ + if (server->rdma && rdata && + rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { + + struct smbd_buffer_descriptor_v1 *v1; + bool need_invalidate = + io_parms->tcon->ses->server->dialect == SMB30_PROT_ID; + + rdata->mr = smbd_register_mr( + server->smbd_conn, rdata->pages, + rdata->nr_pages, rdata->tailsz, + true, need_invalidate); + if (!rdata->mr) + return -ENOBUFS; + + req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; + if (need_invalidate) + req->Channel = SMB2_CHANNEL_RDMA_V1; + req->ReadChannelInfoOffset = + cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer)); + req->ReadChannelInfoLength = + cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); + v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; + v1->offset = cpu_to_le64(rdata->mr->mr->iova); + v1->token = cpu_to_le32(rdata->mr->mr->rkey); + v1->length = cpu_to_le32(rdata->mr->mr->length); + + *total_len += sizeof(*v1) - 1; + } +#endif if (request_type & CHAINED_REQUEST) { if (!(request_type & END_OF_CHAIN)) { /* next 8-byte aligned request */ @@ -2557,7 +2552,17 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->result != -ENODATA) rdata->result = -EIO; } - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If this rdata has a memmory registered, the MR can be freed + * MR needs to be freed as soon as I/O finishes to prevent deadlock + * because they have limited number and are used for future I/Os + */ + if (rdata->mr) { + smbd_deregister_mr(rdata->mr); + rdata->mr = NULL; + } +#endif if (rdata->result) cifs_stats_fail_inc(tcon, SMB2_READ_HE); @@ -2592,7 +2597,8 @@ smb2_async_readv(struct cifs_readdata *rdata) server = io_parms.tcon->ses->server; - rc = smb2_new_read_req((void **) &buf, &total_len, &io_parms, 0, 0); + rc = smb2_new_read_req( + (void **) &buf, &total_len, &io_parms, rdata, 0, 0); if (rc) { if (rc == -EAGAIN && rdata->credits) { /* credits was reset by reconnect */ @@ -2650,31 +2656,24 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, struct smb2_read_plain_req *req = NULL; struct smb2_read_rsp *rsp = NULL; struct smb2_sync_hdr *shdr; - struct kvec iov[2]; + struct kvec iov[1]; struct kvec rsp_iov; unsigned int total_len; - __be32 req_len; - struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = 2 }; int flags = CIFS_LOG_ERROR; struct cifs_ses *ses = io_parms->tcon->ses; *nbytes = 0; - rc = smb2_new_read_req((void **)&req, &total_len, io_parms, 0, 0); + rc = smb2_new_read_req((void **)&req, &total_len, io_parms, NULL, 0, 0); if (rc) return rc; if (encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; - req_len = cpu_to_be32(total_len); - - iov[0].iov_base = &req_len; - iov[0].iov_len = sizeof(__be32); - iov[1].iov_base = req; - iov[1].iov_len = total_len; + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; @@ -2755,7 +2754,19 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -EIO; break; } - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If this wdata has a memory registered, the MR can be freed + * The number of MRs available is limited, it's important to recover + * used MR as soon as I/O is finished. Hold MR longer in the later + * I/O process can possibly result in I/O deadlock due to lack of MR + * to send request on I/O retry + */ + if (wdata->mr) { + smbd_deregister_mr(wdata->mr); + wdata->mr = NULL; + } +#endif if (wdata->result) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); @@ -2776,8 +2787,10 @@ smb2_async_writev(struct cifs_writedata *wdata, struct TCP_Server_Info *server = tcon->ses->server; struct kvec iov[2]; struct smb_rqst rqst = { }; + unsigned int total_len; + __be32 rfc1002_marker; - rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); if (rc) { if (rc == -EAGAIN && wdata->credits) { /* credits was reset by reconnect */ @@ -2793,7 +2806,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - shdr = get_sync_hdr(req); + shdr = (struct smb2_sync_hdr *)req; shdr->ProcessId = cpu_to_le32(wdata->cfile->pid); req->PersistentFileId = wdata->cfile->fid.persistent_fid; @@ -2802,16 +2815,51 @@ smb2_async_writev(struct cifs_writedata *wdata, req->WriteChannelInfoLength = 0; req->Channel = 0; req->Offset = cpu_to_le64(wdata->offset); - /* 4 for rfc1002 length field */ req->DataOffset = cpu_to_le16( - offsetof(struct smb2_write_req, Buffer) - 4); + offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If we want to do a server RDMA read, fill in and append + * smbd_buffer_descriptor_v1 to the end of write request + */ + if (server->rdma && wdata->bytes >= + server->smbd_conn->rdma_readwrite_threshold) { + + struct smbd_buffer_descriptor_v1 *v1; + bool need_invalidate = server->dialect == SMB30_PROT_ID; + + wdata->mr = smbd_register_mr( + server->smbd_conn, wdata->pages, + wdata->nr_pages, wdata->tailsz, + false, need_invalidate); + if (!wdata->mr) { + rc = -ENOBUFS; + goto async_writev_out; + } + req->Length = 0; + req->DataOffset = 0; + req->RemainingBytes = + cpu_to_le32((wdata->nr_pages-1)*PAGE_SIZE + wdata->tailsz); + req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; + if (need_invalidate) + req->Channel = SMB2_CHANNEL_RDMA_V1; + req->WriteChannelInfoOffset = + cpu_to_le16(offsetof(struct smb2_write_req, Buffer)); + req->WriteChannelInfoLength = + cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); + v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; + v1->offset = cpu_to_le64(wdata->mr->mr->iova); + v1->token = cpu_to_le32(wdata->mr->mr->rkey); + v1->length = cpu_to_le32(wdata->mr->mr->length); + } +#endif /* 4 for rfc1002 length field and 1 for Buffer */ iov[0].iov_len = 4; - iov[0].iov_base = req; - iov[1].iov_len = get_rfc1002_length(req) - 1; - iov[1].iov_base = (char *)req + 4; + rfc1002_marker = cpu_to_be32(total_len - 1 + wdata->bytes); + iov[0].iov_base = &rfc1002_marker; + iov[1].iov_len = total_len - 1; + iov[1].iov_base = (char *)req; rqst.rq_iov = iov; rqst.rq_nvec = 2; @@ -2819,13 +2867,22 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (wdata->mr) { + iov[1].iov_len += sizeof(struct smbd_buffer_descriptor_v1); + rqst.rq_npages = 0; + } +#endif cifs_dbg(FYI, "async write at %llu %u bytes\n", wdata->offset, wdata->bytes); +#ifdef CONFIG_CIFS_SMB_DIRECT + /* For RDMA read, I/O size is in RemainingBytes not in Length */ + if (!wdata->mr) + req->Length = cpu_to_le32(wdata->bytes); +#else req->Length = cpu_to_le32(wdata->bytes); - - inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); +#endif if (wdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, @@ -2869,13 +2926,15 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, int resp_buftype; struct kvec rsp_iov; int flags = 0; + unsigned int total_len; *nbytes = 0; if (n_vec < 1) return rc; - rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -2885,7 +2944,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); + req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); req->PersistentFileId = io_parms->persistent_fid; req->VolatileFileId = io_parms->volatile_fid; @@ -2894,20 +2953,16 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, req->Channel = 0; req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); - /* 4 for rfc1002 length field */ req->DataOffset = cpu_to_le16( - offsetof(struct smb2_write_req, Buffer) - 4); + offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and 1 for Buffer */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; - /* length of entire message including data to be written */ - inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */); - - rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1, - &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, io_parms->tcon->ses, iov, n_vec + 1, + &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; @@ -2984,13 +3039,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, unsigned int output_size = CIFSMaxBufSize; size_t info_buf_size; int flags = 0; + unsigned int total_len; if (ses && (ses->server)) server = ses->server; else return -EIO; - rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -3022,7 +3079,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, memcpy(bufptr, &asteriks, len); req->FileNameOffset = - cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4); + cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1); req->FileNameLength = cpu_to_le16(len); /* * BB could be 30 bytes or so longer if we used SMB2 specific @@ -3033,15 +3090,13 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, req->OutputBufferLength = cpu_to_le32(output_size); iov[0].iov_base = (char *)req; - /* 4 for RFC1001 length and 1 for Buffer */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; iov[1].iov_base = (char *)(req->Buffer); iov[1].iov_len = len; - inc_rfc1001_len(req, len - 1 /* Buffer */); - - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -3110,6 +3165,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, unsigned int i; struct cifs_ses *ses = tcon->ses; int flags = 0; + unsigned int total_len; if (!ses || !(ses->server)) return -EIO; @@ -3121,7 +3177,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, if (!iov) return -ENOMEM; - rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len); if (rc) { kfree(iov); return rc; @@ -3130,7 +3186,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid); + req->sync_hdr.ProcessId = cpu_to_le32(pid); req->InfoType = info_type; req->FileInfoClass = info_class; @@ -3138,27 +3194,25 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - /* 4 for RFC1001 length and 1 for Buffer */ req->BufferOffset = - cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4); + cpu_to_le16(sizeof(struct smb2_set_info_req) - 1); req->BufferLength = cpu_to_le32(*size); - inc_rfc1001_len(req, *size - 1 /* Buffer */); - memcpy(req->Buffer, *data, *size); + total_len += *size; iov[0].iov_base = (char *)req; - /* 4 for RFC1001 length */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; for (i = 1; i < num; i++) { - inc_rfc1001_len(req, size[i]); le32_add_cpu(&req->BufferLength, size[i]); iov[i].iov_base = (char *)data[i]; iov[i].iov_len = size[i]; } - rc = SendReceive2(xid, ses, iov, num, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, num, &resp_buftype, flags, + &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; @@ -3310,11 +3364,17 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 oplock_level) { int rc; - struct smb2_oplock_break *req = NULL; + struct smb2_oplock_break_req *req = NULL; + struct cifs_ses *ses = tcon->ses; int flags = CIFS_OBREAK_OP; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "SMB2_oplock_break\n"); - rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -3324,9 +3384,14 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFid = volatile_fid; req->PersistentFid = persistent_fid; req->OplockLevel = oplock_level; - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); + req->sync_hdr.CreditRequest = cpu_to_le16(1); - rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags); + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; + + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -3355,13 +3420,15 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, { int rc; struct smb2_query_info_req *req; + unsigned int total_len; cifs_dbg(FYI, "Query FSInfo level %d\n", level); if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) return -EIO; - rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -3369,15 +3436,14 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, req->FileInfoClass = level; req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; - /* 4 for rfc1002 length field and 1 for pad */ + /* 1 for pad */ req->InputBufferOffset = - cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); + cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); req->OutputBufferLength = cpu_to_le32( outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4); iov->iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov->iov_len = get_rfc1002_length(req) + 4; + iov->iov_len = total_len; return 0; } @@ -3403,7 +3469,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3459,7 +3525,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3505,34 +3571,33 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; unsigned int count; int flags = CIFS_NO_RESP; + unsigned int total_len; cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); - rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_LOCK, tcon, (void **) &req, &total_len); if (rc) return rc; if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid); + req->sync_hdr.ProcessId = cpu_to_le32(pid); req->LockCount = cpu_to_le16(num_lock); req->PersistentFileId = persist_fid; req->VolatileFileId = volatile_fid; count = num_lock * sizeof(struct smb2_lock_element); - inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element)); iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and count for all locks */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - count; + iov[0].iov_len = total_len - sizeof(struct smb2_lock_element); iov[1].iov_base = (char *)buf; iov[1].iov_len = count; cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); - rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, flags, - &rsp_iov); + rc = smb2_send_recv(xid, tcon->ses, iov, 2, &resp_buf_type, flags, + &rsp_iov); cifs_small_buf_release(req); if (rc) { cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); @@ -3565,24 +3630,35 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, { int rc; struct smb2_lease_ack *req = NULL; + struct cifs_ses *ses = tcon->ses; int flags = CIFS_OBREAK_OP; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "SMB2_lease_break\n"); - rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, + &total_len); if (rc) return rc; if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); + req->sync_hdr.CreditRequest = cpu_to_le16(1); req->StructureSize = cpu_to_le16(36); - inc_rfc1001_len(req, 12); + total_len += 12; memcpy(req->LeaseKey, lease_key, 16); req->LeaseState = lease_state; - rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags); + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; + + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index c2ec934be968..6eb9f9691ed4 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -195,7 +195,7 @@ struct smb2_symlink_err_rsp { #define SMB2_CLIENT_GUID_SIZE 16 struct smb2_negotiate_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 36 */ __le16 DialectCount; __le16 SecurityMode; @@ -282,7 +282,7 @@ struct smb2_negotiate_rsp { #define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 struct smb2_sess_setup_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 25 */ __u8 Flags; __u8 SecurityMode; @@ -308,7 +308,7 @@ struct smb2_sess_setup_rsp { } __packed; struct smb2_logoff_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -323,7 +323,7 @@ struct smb2_logoff_rsp { #define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001 struct smb2_tree_connect_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 Reserved; /* Flags in SMB3.1.1 */ __le16 PathOffset; @@ -375,7 +375,7 @@ struct smb2_tree_connect_rsp { #define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ struct smb2_tree_disconnect_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -496,7 +496,7 @@ struct smb2_tree_disconnect_rsp { #define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 struct smb2_create_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ __u8 SecurityFlags; __u8 RequestedOplockLevel; @@ -753,7 +753,7 @@ struct duplicate_extents_to_file { } __packed; struct smb2_ioctl_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -789,7 +789,7 @@ struct smb2_ioctl_rsp { /* Currently defined values for close flags */ #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) struct smb2_close_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 24 */ __le16 Flags; __le32 Reserved; @@ -812,7 +812,7 @@ struct smb2_close_rsp { } __packed; struct smb2_flush_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 24 */ __le16 Reserved1; __le32 Reserved2; @@ -830,9 +830,9 @@ struct smb2_flush_rsp { #define SMB2_READFLAG_READ_UNBUFFERED 0x01 /* Channel field for read and write: exactly one of following flags can be set*/ -#define SMB2_CHANNEL_NONE 0x00000000 -#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */ -#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000002 /* SMB3.02 or later */ +#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) +#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ /* SMB2 read request without RFC1001 length at the beginning */ struct smb2_read_plain_req { @@ -847,8 +847,8 @@ struct smb2_read_plain_req { __le32 MinimumCount; __le32 Channel; /* MBZ except for SMB3 or later */ __le32 RemainingBytes; - __le16 ReadChannelInfoOffset; /* Reserved MBZ */ - __le16 ReadChannelInfoLength; /* Reserved MBZ */ + __le16 ReadChannelInfoOffset; + __le16 ReadChannelInfoLength; __u8 Buffer[1]; } __packed; @@ -868,7 +868,7 @@ struct smb2_read_rsp { #define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ struct smb2_write_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 49 */ __le16 DataOffset; /* offset from start of SMB2 header to write data */ __le32 Length; @@ -877,8 +877,8 @@ struct smb2_write_req { __u64 VolatileFileId; /* opaque endianness */ __le32 Channel; /* Reserved MBZ */ __le32 RemainingBytes; - __le16 WriteChannelInfoOffset; /* Reserved MBZ */ - __le16 WriteChannelInfoLength; /* Reserved MBZ */ + __le16 WriteChannelInfoOffset; + __le16 WriteChannelInfoLength; __le32 Flags; __u8 Buffer[1]; } __packed; @@ -907,7 +907,7 @@ struct smb2_lock_element { } __packed; struct smb2_lock_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; __le32 Reserved; @@ -924,7 +924,7 @@ struct smb2_lock_rsp { } __packed; struct smb2_echo_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; @@ -942,7 +942,7 @@ struct smb2_echo_rsp { #define SMB2_REOPEN 0x10 struct smb2_query_directory_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ __u8 FileInformationClass; __u8 Flags; @@ -989,7 +989,7 @@ struct smb2_query_directory_rsp { #define SL_INDEX_SPECIFIED 0x00000004 struct smb2_query_info_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 41 */ __u8 InfoType; __u8 FileInfoClass; @@ -1013,7 +1013,7 @@ struct smb2_query_info_rsp { } __packed; struct smb2_set_info_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ __u8 InfoType; __u8 FileInfoClass; @@ -1031,7 +1031,19 @@ struct smb2_set_info_rsp { __le16 StructureSize; /* Must be 2 */ } __packed; -struct smb2_oplock_break { +/* oplock break without an rfc1002 header */ +struct smb2_oplock_break_req { + struct smb2_sync_hdr sync_hdr; + __le16 StructureSize; /* Must be 24 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 Reserved2; + __u64 PersistentFid; + __u64 VolatileFid; +} __packed; + +/* oplock break with an rfc1002 header */ +struct smb2_oplock_break_rsp { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 24 */ __u8 OplockLevel; @@ -1057,7 +1069,7 @@ struct smb2_lease_break { } __packed; struct smb2_lease_ack { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 36 */ __le16 Reserved; __le32 Flags; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index e9ab5227e7a8..05287b01f596 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -125,8 +125,7 @@ extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, struct smb2_err_rsp **err_buf); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, bool use_ipc, - char *in_data, u32 indatalen, + bool is_fsctl, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c new file mode 100644 index 000000000000..5130492847eb --- /dev/null +++ b/fs/cifs/smbdirect.c @@ -0,0 +1,2610 @@ +/* + * Copyright (C) 2017, Microsoft Corporation. + * + * Author(s): Long Li <longli@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ +#include <linux/module.h> +#include <linux/highmem.h> +#include "smbdirect.h" +#include "cifs_debug.h" + +static struct smbd_response *get_empty_queue_buffer( + struct smbd_connection *info); +static struct smbd_response *get_receive_buffer( + struct smbd_connection *info); +static void put_receive_buffer( + struct smbd_connection *info, + struct smbd_response *response); +static int allocate_receive_buffers(struct smbd_connection *info, int num_buf); +static void destroy_receive_buffers(struct smbd_connection *info); + +static void put_empty_packet( + struct smbd_connection *info, struct smbd_response *response); +static void enqueue_reassembly( + struct smbd_connection *info, + struct smbd_response *response, int data_length); +static struct smbd_response *_get_first_reassembly( + struct smbd_connection *info); + +static int smbd_post_recv( + struct smbd_connection *info, + struct smbd_response *response); + +static int smbd_post_send_empty(struct smbd_connection *info); +static int smbd_post_send_data( + struct smbd_connection *info, + struct kvec *iov, int n_vec, int remaining_data_length); +static int smbd_post_send_page(struct smbd_connection *info, + struct page *page, unsigned long offset, + size_t size, int remaining_data_length); + +static void destroy_mr_list(struct smbd_connection *info); +static int allocate_mr_list(struct smbd_connection *info); + +/* SMBD version number */ +#define SMBD_V1 0x0100 + +/* Port numbers for SMBD transport */ +#define SMB_PORT 445 +#define SMBD_PORT 5445 + +/* Address lookup and resolve timeout in ms */ +#define RDMA_RESOLVE_TIMEOUT 5000 + +/* SMBD negotiation timeout in seconds */ +#define SMBD_NEGOTIATE_TIMEOUT 120 + +/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */ +#define SMBD_MIN_RECEIVE_SIZE 128 +#define SMBD_MIN_FRAGMENTED_SIZE 131072 + +/* + * Default maximum number of RDMA read/write outstanding on this connection + * This value is possibly decreased during QP creation on hardware limit + */ +#define SMBD_CM_RESPONDER_RESOURCES 32 + +/* Maximum number of retries on data transfer operations */ +#define SMBD_CM_RETRY 6 +/* No need to retry on Receiver Not Ready since SMBD manages credits */ +#define SMBD_CM_RNR_RETRY 0 + +/* + * User configurable initial values per SMBD transport connection + * as defined in [MS-SMBD] 3.1.1.1 + * Those may change after a SMBD negotiation + */ +/* The local peer's maximum number of credits to grant to the peer */ +int smbd_receive_credit_max = 255; + +/* The remote peer's credit request of local peer */ +int smbd_send_credit_target = 255; + +/* The maximum single message size can be sent to remote peer */ +int smbd_max_send_size = 1364; + +/* The maximum fragmented upper-layer payload receive size supported */ +int smbd_max_fragmented_recv_size = 1024 * 1024; + +/* The maximum single-message size which can be received */ +int smbd_max_receive_size = 8192; + +/* The timeout to initiate send of a keepalive message on idle */ +int smbd_keep_alive_interval = 120; + +/* + * User configurable initial values for RDMA transport + * The actual values used may be lower and are limited to hardware capabilities + */ +/* Default maximum number of SGEs in a RDMA write/read */ +int smbd_max_frmr_depth = 2048; + +/* If payload is less than this byte, use RDMA send/recv not read/write */ +int rdma_readwrite_threshold = 4096; + +/* Transport logging functions + * Logging are defined as classes. They can be OR'ed to define the actual + * logging level via module parameter smbd_logging_class + * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and + * log_rdma_event() + */ +#define LOG_OUTGOING 0x1 +#define LOG_INCOMING 0x2 +#define LOG_READ 0x4 +#define LOG_WRITE 0x8 +#define LOG_RDMA_SEND 0x10 +#define LOG_RDMA_RECV 0x20 +#define LOG_KEEP_ALIVE 0x40 +#define LOG_RDMA_EVENT 0x80 +#define LOG_RDMA_MR 0x100 +static unsigned int smbd_logging_class; +module_param(smbd_logging_class, uint, 0644); +MODULE_PARM_DESC(smbd_logging_class, + "Logging class for SMBD transport 0x0 to 0x100"); + +#define ERR 0x0 +#define INFO 0x1 +static unsigned int smbd_logging_level = ERR; +module_param(smbd_logging_level, uint, 0644); +MODULE_PARM_DESC(smbd_logging_level, + "Logging level for SMBD transport, 0 (default): error, 1: info"); + +#define log_rdma(level, class, fmt, args...) \ +do { \ + if (level <= smbd_logging_level || class & smbd_logging_class) \ + cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\ +} while (0) + +#define log_outgoing(level, fmt, args...) \ + log_rdma(level, LOG_OUTGOING, fmt, ##args) +#define log_incoming(level, fmt, args...) \ + log_rdma(level, LOG_INCOMING, fmt, ##args) +#define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args) +#define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args) +#define log_rdma_send(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_SEND, fmt, ##args) +#define log_rdma_recv(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_RECV, fmt, ##args) +#define log_keep_alive(level, fmt, args...) \ + log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args) +#define log_rdma_event(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_EVENT, fmt, ##args) +#define log_rdma_mr(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_MR, fmt, ##args) + +/* + * Destroy the transport and related RDMA and memory resources + * Need to go through all the pending counters and make sure on one is using + * the transport while it is destroyed + */ +static void smbd_destroy_rdma_work(struct work_struct *work) +{ + struct smbd_response *response; + struct smbd_connection *info = + container_of(work, struct smbd_connection, destroy_work); + unsigned long flags; + + log_rdma_event(INFO, "destroying qp\n"); + ib_drain_qp(info->id->qp); + rdma_destroy_qp(info->id); + + /* Unblock all I/O waiting on the send queue */ + wake_up_interruptible_all(&info->wait_send_queue); + + log_rdma_event(INFO, "cancelling idle timer\n"); + cancel_delayed_work_sync(&info->idle_timer_work); + log_rdma_event(INFO, "cancelling send immediate work\n"); + cancel_delayed_work_sync(&info->send_immediate_work); + + log_rdma_event(INFO, "wait for all send to finish\n"); + wait_event(info->wait_smbd_send_pending, + info->smbd_send_pending == 0); + + log_rdma_event(INFO, "wait for all recv to finish\n"); + wake_up_interruptible(&info->wait_reassembly_queue); + wait_event(info->wait_smbd_recv_pending, + info->smbd_recv_pending == 0); + + log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); + wait_event(info->wait_send_pending, + atomic_read(&info->send_pending) == 0); + wait_event(info->wait_send_payload_pending, + atomic_read(&info->send_payload_pending) == 0); + + log_rdma_event(INFO, "freeing mr list\n"); + wake_up_interruptible_all(&info->wait_mr); + wait_event(info->wait_for_mr_cleanup, + atomic_read(&info->mr_used_count) == 0); + destroy_mr_list(info); + + /* It's not posssible for upper layer to get to reassembly */ + log_rdma_event(INFO, "drain the reassembly queue\n"); + do { + spin_lock_irqsave(&info->reassembly_queue_lock, flags); + response = _get_first_reassembly(info); + if (response) { + list_del(&response->list); + spin_unlock_irqrestore( + &info->reassembly_queue_lock, flags); + put_receive_buffer(info, response); + } + } while (response); + spin_unlock_irqrestore(&info->reassembly_queue_lock, flags); + info->reassembly_data_length = 0; + + log_rdma_event(INFO, "free receive buffers\n"); + wait_event(info->wait_receive_queues, + info->count_receive_queue + info->count_empty_packet_queue + == info->receive_credit_max); + destroy_receive_buffers(info); + + ib_free_cq(info->send_cq); + ib_free_cq(info->recv_cq); + ib_dealloc_pd(info->pd); + rdma_destroy_id(info->id); + + /* free mempools */ + mempool_destroy(info->request_mempool); + kmem_cache_destroy(info->request_cache); + + mempool_destroy(info->response_mempool); + kmem_cache_destroy(info->response_cache); + + info->transport_status = SMBD_DESTROYED; + wake_up_all(&info->wait_destroy); +} + +static int smbd_process_disconnected(struct smbd_connection *info) +{ + schedule_work(&info->destroy_work); + return 0; +} + +static void smbd_disconnect_rdma_work(struct work_struct *work) +{ + struct smbd_connection *info = + container_of(work, struct smbd_connection, disconnect_work); + + if (info->transport_status == SMBD_CONNECTED) { + info->transport_status = SMBD_DISCONNECTING; + rdma_disconnect(info->id); + } +} + +static void smbd_disconnect_rdma_connection(struct smbd_connection *info) +{ + queue_work(info->workqueue, &info->disconnect_work); +} + +/* Upcall from RDMA CM */ +static int smbd_conn_upcall( + struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct smbd_connection *info = id->context; + + log_rdma_event(INFO, "event=%d status=%d\n", + event->event, event->status); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + info->ri_rc = 0; + complete(&info->ri_done); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + info->ri_rc = -EHOSTUNREACH; + complete(&info->ri_done); + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + info->ri_rc = -ENETUNREACH; + complete(&info->ri_done); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + log_rdma_event(INFO, "connected event=%d\n", event->event); + info->transport_status = SMBD_CONNECTED; + wake_up_interruptible(&info->conn_wait); + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + log_rdma_event(INFO, "connecting failed event=%d\n", event->event); + info->transport_status = SMBD_DISCONNECTED; + wake_up_interruptible(&info->conn_wait); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_DISCONNECTED: + /* This happenes when we fail the negotiation */ + if (info->transport_status == SMBD_NEGOTIATE_FAILED) { + info->transport_status = SMBD_DISCONNECTED; + wake_up(&info->conn_wait); + break; + } + + info->transport_status = SMBD_DISCONNECTED; + smbd_process_disconnected(info); + break; + + default: + break; + } + + return 0; +} + +/* Upcall from RDMA QP */ +static void +smbd_qp_async_error_upcall(struct ib_event *event, void *context) +{ + struct smbd_connection *info = context; + + log_rdma_event(ERR, "%s on device %s info %p\n", + ib_event_msg(event->event), event->device->name, info); + + switch (event->event) { + case IB_EVENT_CQ_ERR: + case IB_EVENT_QP_FATAL: + smbd_disconnect_rdma_connection(info); + + default: + break; + } +} + +static inline void *smbd_request_payload(struct smbd_request *request) +{ + return (void *)request->packet; +} + +static inline void *smbd_response_payload(struct smbd_response *response) +{ + return (void *)response->packet; +} + +/* Called when a RDMA send is done */ +static void send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + int i; + struct smbd_request *request = + container_of(wc->wr_cqe, struct smbd_request, cqe); + + log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n", + request, wc->status); + + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { + log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n", + wc->status, wc->opcode); + smbd_disconnect_rdma_connection(request->info); + } + + for (i = 0; i < request->num_sge; i++) + ib_dma_unmap_single(request->info->id->device, + request->sge[i].addr, + request->sge[i].length, + DMA_TO_DEVICE); + + if (request->has_payload) { + if (atomic_dec_and_test(&request->info->send_payload_pending)) + wake_up(&request->info->wait_send_payload_pending); + } else { + if (atomic_dec_and_test(&request->info->send_pending)) + wake_up(&request->info->wait_send_pending); + } + + mempool_free(request, request->info->request_mempool); +} + +static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp) +{ + log_rdma_event(INFO, "resp message min_version %u max_version %u " + "negotiated_version %u credits_requested %u " + "credits_granted %u status %u max_readwrite_size %u " + "preferred_send_size %u max_receive_size %u " + "max_fragmented_size %u\n", + resp->min_version, resp->max_version, resp->negotiated_version, + resp->credits_requested, resp->credits_granted, resp->status, + resp->max_readwrite_size, resp->preferred_send_size, + resp->max_receive_size, resp->max_fragmented_size); +} + +/* + * Process a negotiation response message, according to [MS-SMBD]3.1.5.7 + * response, packet_length: the negotiation response message + * return value: true if negotiation is a success, false if failed + */ +static bool process_negotiation_response( + struct smbd_response *response, int packet_length) +{ + struct smbd_connection *info = response->info; + struct smbd_negotiate_resp *packet = smbd_response_payload(response); + + if (packet_length < sizeof(struct smbd_negotiate_resp)) { + log_rdma_event(ERR, + "error: packet_length=%d\n", packet_length); + return false; + } + + if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) { + log_rdma_event(ERR, "error: negotiated_version=%x\n", + le16_to_cpu(packet->negotiated_version)); + return false; + } + info->protocol = le16_to_cpu(packet->negotiated_version); + + if (packet->credits_requested == 0) { + log_rdma_event(ERR, "error: credits_requested==0\n"); + return false; + } + info->receive_credit_target = le16_to_cpu(packet->credits_requested); + + if (packet->credits_granted == 0) { + log_rdma_event(ERR, "error: credits_granted==0\n"); + return false; + } + atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted)); + + atomic_set(&info->receive_credits, 0); + + if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) { + log_rdma_event(ERR, "error: preferred_send_size=%d\n", + le32_to_cpu(packet->preferred_send_size)); + return false; + } + info->max_receive_size = le32_to_cpu(packet->preferred_send_size); + + if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) { + log_rdma_event(ERR, "error: max_receive_size=%d\n", + le32_to_cpu(packet->max_receive_size)); + return false; + } + info->max_send_size = min_t(int, info->max_send_size, + le32_to_cpu(packet->max_receive_size)); + + if (le32_to_cpu(packet->max_fragmented_size) < + SMBD_MIN_FRAGMENTED_SIZE) { + log_rdma_event(ERR, "error: max_fragmented_size=%d\n", + le32_to_cpu(packet->max_fragmented_size)); + return false; + } + info->max_fragmented_send_size = + le32_to_cpu(packet->max_fragmented_size); + info->rdma_readwrite_threshold = + rdma_readwrite_threshold > info->max_fragmented_send_size ? + info->max_fragmented_send_size : + rdma_readwrite_threshold; + + + info->max_readwrite_size = min_t(u32, + le32_to_cpu(packet->max_readwrite_size), + info->max_frmr_depth * PAGE_SIZE); + info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE; + + return true; +} + +/* + * Check and schedule to send an immediate packet + * This is used to extend credtis to remote peer to keep the transport busy + */ +static void check_and_send_immediate(struct smbd_connection *info) +{ + if (info->transport_status != SMBD_CONNECTED) + return; + + info->send_immediate = true; + + /* + * Promptly send a packet if our peer is running low on receive + * credits + */ + if (atomic_read(&info->receive_credits) < + info->receive_credit_target - 1) + queue_delayed_work( + info->workqueue, &info->send_immediate_work, 0); +} + +static void smbd_post_send_credits(struct work_struct *work) +{ + int ret = 0; + int use_receive_queue = 1; + int rc; + struct smbd_response *response; + struct smbd_connection *info = + container_of(work, struct smbd_connection, + post_send_credits_work); + + if (info->transport_status != SMBD_CONNECTED) { + wake_up(&info->wait_receive_queues); + return; + } + + if (info->receive_credit_target > + atomic_read(&info->receive_credits)) { + while (true) { + if (use_receive_queue) + response = get_receive_buffer(info); + else + response = get_empty_queue_buffer(info); + if (!response) { + /* now switch to emtpy packet queue */ + if (use_receive_queue) { + use_receive_queue = 0; + continue; + } else + break; + } + + response->type = SMBD_TRANSFER_DATA; + response->first_segment = false; + rc = smbd_post_recv(info, response); + if (rc) { + log_rdma_recv(ERR, + "post_recv failed rc=%d\n", rc); + put_receive_buffer(info, response); + break; + } + + ret++; + } + } + + spin_lock(&info->lock_new_credits_offered); + info->new_credits_offered += ret; + spin_unlock(&info->lock_new_credits_offered); + + atomic_add(ret, &info->receive_credits); + + /* Check if we can post new receive and grant credits to peer */ + check_and_send_immediate(info); +} + +static void smbd_recv_done_work(struct work_struct *work) +{ + struct smbd_connection *info = + container_of(work, struct smbd_connection, recv_done_work); + + /* + * We may have new send credits granted from remote peer + * If any sender is blcoked on lack of credets, unblock it + */ + if (atomic_read(&info->send_credits)) + wake_up_interruptible(&info->wait_send_queue); + + /* + * Check if we need to send something to remote peer to + * grant more credits or respond to KEEP_ALIVE packet + */ + check_and_send_immediate(info); +} + +/* Called from softirq, when recv is done */ +static void recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbd_data_transfer *data_transfer; + struct smbd_response *response = + container_of(wc->wr_cqe, struct smbd_response, cqe); + struct smbd_connection *info = response->info; + int data_length = 0; + + log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d " + "byte_len=%d pkey_index=%x\n", + response, response->type, wc->status, wc->opcode, + wc->byte_len, wc->pkey_index); + + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { + log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", + wc->status, wc->opcode); + smbd_disconnect_rdma_connection(info); + goto error; + } + + ib_dma_sync_single_for_cpu( + wc->qp->device, + response->sge.addr, + response->sge.length, + DMA_FROM_DEVICE); + + switch (response->type) { + /* SMBD negotiation response */ + case SMBD_NEGOTIATE_RESP: + dump_smbd_negotiate_resp(smbd_response_payload(response)); + info->full_packet_received = true; + info->negotiate_done = + process_negotiation_response(response, wc->byte_len); + complete(&info->negotiate_completion); + break; + + /* SMBD data transfer packet */ + case SMBD_TRANSFER_DATA: + data_transfer = smbd_response_payload(response); + data_length = le32_to_cpu(data_transfer->data_length); + + /* + * If this is a packet with data playload place the data in + * reassembly queue and wake up the reading thread + */ + if (data_length) { + if (info->full_packet_received) + response->first_segment = true; + + if (le32_to_cpu(data_transfer->remaining_data_length)) + info->full_packet_received = false; + else + info->full_packet_received = true; + + enqueue_reassembly( + info, + response, + data_length); + } else + put_empty_packet(info, response); + + if (data_length) + wake_up_interruptible(&info->wait_reassembly_queue); + + atomic_dec(&info->receive_credits); + info->receive_credit_target = + le16_to_cpu(data_transfer->credits_requested); + atomic_add(le16_to_cpu(data_transfer->credits_granted), + &info->send_credits); + + log_incoming(INFO, "data flags %d data_offset %d " + "data_length %d remaining_data_length %d\n", + le16_to_cpu(data_transfer->flags), + le32_to_cpu(data_transfer->data_offset), + le32_to_cpu(data_transfer->data_length), + le32_to_cpu(data_transfer->remaining_data_length)); + + /* Send a KEEP_ALIVE response right away if requested */ + info->keep_alive_requested = KEEP_ALIVE_NONE; + if (le16_to_cpu(data_transfer->flags) & + SMB_DIRECT_RESPONSE_REQUESTED) { + info->keep_alive_requested = KEEP_ALIVE_PENDING; + } + + queue_work(info->workqueue, &info->recv_done_work); + return; + + default: + log_rdma_recv(ERR, + "unexpected response type=%d\n", response->type); + } + +error: + put_receive_buffer(info, response); +} + +static struct rdma_cm_id *smbd_create_id( + struct smbd_connection *info, + struct sockaddr *dstaddr, int port) +{ + struct rdma_cm_id *id; + int rc; + __be16 *sport; + + id = rdma_create_id(&init_net, smbd_conn_upcall, info, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc); + return id; + } + + if (dstaddr->sa_family == AF_INET6) + sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port; + else + sport = &((struct sockaddr_in *)dstaddr)->sin_port; + + *sport = htons(port); + + init_completion(&info->ri_done); + info->ri_rc = -ETIMEDOUT; + + rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, + RDMA_RESOLVE_TIMEOUT); + if (rc) { + log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); + goto out; + } + wait_for_completion_interruptible_timeout( + &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = info->ri_rc; + if (rc) { + log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); + goto out; + } + + info->ri_rc = -ETIMEDOUT; + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + if (rc) { + log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); + goto out; + } + wait_for_completion_interruptible_timeout( + &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = info->ri_rc; + if (rc) { + log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); + goto out; + } + + return id; + +out: + rdma_destroy_id(id); + return ERR_PTR(rc); +} + +/* + * Test if FRWR (Fast Registration Work Requests) is supported on the device + * This implementation requries FRWR on RDMA read/write + * return value: true if it is supported + */ +static bool frwr_is_supported(struct ib_device_attr *attrs) +{ + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + return false; + if (attrs->max_fast_reg_page_list_len == 0) + return false; + return true; +} + +static int smbd_ia_open( + struct smbd_connection *info, + struct sockaddr *dstaddr, int port) +{ + int rc; + + info->id = smbd_create_id(info, dstaddr, port); + if (IS_ERR(info->id)) { + rc = PTR_ERR(info->id); + goto out1; + } + + if (!frwr_is_supported(&info->id->device->attrs)) { + log_rdma_event(ERR, + "Fast Registration Work Requests " + "(FRWR) is not supported\n"); + log_rdma_event(ERR, + "Device capability flags = %llx " + "max_fast_reg_page_list_len = %u\n", + info->id->device->attrs.device_cap_flags, + info->id->device->attrs.max_fast_reg_page_list_len); + rc = -EPROTONOSUPPORT; + goto out2; + } + info->max_frmr_depth = min_t(int, + smbd_max_frmr_depth, + info->id->device->attrs.max_fast_reg_page_list_len); + info->mr_type = IB_MR_TYPE_MEM_REG; + if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + info->mr_type = IB_MR_TYPE_SG_GAPS; + + info->pd = ib_alloc_pd(info->id->device, 0); + if (IS_ERR(info->pd)) { + rc = PTR_ERR(info->pd); + log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); + goto out2; + } + + return 0; + +out2: + rdma_destroy_id(info->id); + info->id = NULL; + +out1: + return rc; +} + +/* + * Send a negotiation request message to the peer + * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3 + * After negotiation, the transport is connected and ready for + * carrying upper layer SMB payload + */ +static int smbd_post_send_negotiate_req(struct smbd_connection *info) +{ + struct ib_send_wr send_wr, *send_wr_fail; + int rc = -ENOMEM; + struct smbd_request *request; + struct smbd_negotiate_req *packet; + + request = mempool_alloc(info->request_mempool, GFP_KERNEL); + if (!request) + return rc; + + request->info = info; + + packet = smbd_request_payload(request); + packet->min_version = cpu_to_le16(SMBD_V1); + packet->max_version = cpu_to_le16(SMBD_V1); + packet->reserved = 0; + packet->credits_requested = cpu_to_le16(info->send_credit_target); + packet->preferred_send_size = cpu_to_le32(info->max_send_size); + packet->max_receive_size = cpu_to_le32(info->max_receive_size); + packet->max_fragmented_size = + cpu_to_le32(info->max_fragmented_recv_size); + + request->num_sge = 1; + request->sge[0].addr = ib_dma_map_single( + info->id->device, (void *)packet, + sizeof(*packet), DMA_TO_DEVICE); + if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { + rc = -EIO; + goto dma_mapping_failed; + } + + request->sge[0].length = sizeof(*packet); + request->sge[0].lkey = info->pd->local_dma_lkey; + + ib_dma_sync_single_for_device( + info->id->device, request->sge[0].addr, + request->sge[0].length, DMA_TO_DEVICE); + + request->cqe.done = send_done; + + send_wr.next = NULL; + send_wr.wr_cqe = &request->cqe; + send_wr.sg_list = request->sge; + send_wr.num_sge = request->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n", + request->sge[0].addr, + request->sge[0].length, request->sge[0].lkey); + + request->has_payload = false; + atomic_inc(&info->send_pending); + rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail); + if (!rc) + return 0; + + /* if we reach here, post send failed */ + log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); + atomic_dec(&info->send_pending); + ib_dma_unmap_single(info->id->device, request->sge[0].addr, + request->sge[0].length, DMA_TO_DEVICE); + +dma_mapping_failed: + mempool_free(request, info->request_mempool); + return rc; +} + +/* + * Extend the credits to remote peer + * This implements [MS-SMBD] 3.1.5.9 + * The idea is that we should extend credits to remote peer as quickly as + * it's allowed, to maintain data flow. We allocate as much receive + * buffer as possible, and extend the receive credits to remote peer + * return value: the new credtis being granted. + */ +static int manage_credits_prior_sending(struct smbd_connection *info) +{ + int new_credits; + + spin_lock(&info->lock_new_credits_offered); + new_credits = info->new_credits_offered; + info->new_credits_offered = 0; + spin_unlock(&info->lock_new_credits_offered); + + return new_credits; +} + +/* + * Check if we need to send a KEEP_ALIVE message + * The idle connection timer triggers a KEEP_ALIVE message when expires + * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send + * back a response. + * return value: + * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set + * 0: otherwise + */ +static int manage_keep_alive_before_sending(struct smbd_connection *info) +{ + if (info->keep_alive_requested == KEEP_ALIVE_PENDING) { + info->keep_alive_requested = KEEP_ALIVE_SENT; + return 1; + } + return 0; +} + +/* + * Build and prepare the SMBD packet header + * This function waits for avaialbe send credits and build a SMBD packet + * header. The caller then optional append payload to the packet after + * the header + * intput values + * size: the size of the payload + * remaining_data_length: remaining data to send if this is part of a + * fragmented packet + * output values + * request_out: the request allocated from this function + * return values: 0 on success, otherwise actual error code returned + */ +static int smbd_create_header(struct smbd_connection *info, + int size, int remaining_data_length, + struct smbd_request **request_out) +{ + struct smbd_request *request; + struct smbd_data_transfer *packet; + int header_length; + int rc; + + /* Wait for send credits. A SMBD packet needs one credit */ + rc = wait_event_interruptible(info->wait_send_queue, + atomic_read(&info->send_credits) > 0 || + info->transport_status != SMBD_CONNECTED); + if (rc) + return rc; + + if (info->transport_status != SMBD_CONNECTED) { + log_outgoing(ERR, "disconnected not sending\n"); + return -ENOENT; + } + atomic_dec(&info->send_credits); + + request = mempool_alloc(info->request_mempool, GFP_KERNEL); + if (!request) { + rc = -ENOMEM; + goto err; + } + + request->info = info; + + /* Fill in the packet header */ + packet = smbd_request_payload(request); + packet->credits_requested = cpu_to_le16(info->send_credit_target); + packet->credits_granted = + cpu_to_le16(manage_credits_prior_sending(info)); + info->send_immediate = false; + + packet->flags = 0; + if (manage_keep_alive_before_sending(info)) + packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED); + + packet->reserved = 0; + if (!size) + packet->data_offset = 0; + else + packet->data_offset = cpu_to_le32(24); + packet->data_length = cpu_to_le32(size); + packet->remaining_data_length = cpu_to_le32(remaining_data_length); + packet->padding = 0; + + log_outgoing(INFO, "credits_requested=%d credits_granted=%d " + "data_offset=%d data_length=%d remaining_data_length=%d\n", + le16_to_cpu(packet->credits_requested), + le16_to_cpu(packet->credits_granted), + le32_to_cpu(packet->data_offset), + le32_to_cpu(packet->data_length), + le32_to_cpu(packet->remaining_data_length)); + + /* Map the packet to DMA */ + header_length = sizeof(struct smbd_data_transfer); + /* If this is a packet without payload, don't send padding */ + if (!size) + header_length = offsetof(struct smbd_data_transfer, padding); + + request->num_sge = 1; + request->sge[0].addr = ib_dma_map_single(info->id->device, + (void *)packet, + header_length, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { + mempool_free(request, info->request_mempool); + rc = -EIO; + goto err; + } + + request->sge[0].length = header_length; + request->sge[0].lkey = info->pd->local_dma_lkey; + + *request_out = request; + return 0; + +err: + atomic_inc(&info->send_credits); + return rc; +} + +static void smbd_destroy_header(struct smbd_connection *info, + struct smbd_request *request) +{ + + ib_dma_unmap_single(info->id->device, + request->sge[0].addr, + request->sge[0].length, + DMA_TO_DEVICE); + mempool_free(request, info->request_mempool); + atomic_inc(&info->send_credits); +} + +/* Post the send request */ +static int smbd_post_send(struct smbd_connection *info, + struct smbd_request *request, bool has_payload) +{ + struct ib_send_wr send_wr, *send_wr_fail; + int rc, i; + + for (i = 0; i < request->num_sge; i++) { + log_rdma_send(INFO, + "rdma_request sge[%d] addr=%llu legnth=%u\n", + i, request->sge[0].addr, request->sge[0].length); + ib_dma_sync_single_for_device( + info->id->device, + request->sge[i].addr, + request->sge[i].length, + DMA_TO_DEVICE); + } + + request->cqe.done = send_done; + + send_wr.next = NULL; + send_wr.wr_cqe = &request->cqe; + send_wr.sg_list = request->sge; + send_wr.num_sge = request->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + if (has_payload) { + request->has_payload = true; + atomic_inc(&info->send_payload_pending); + } else { + request->has_payload = false; + atomic_inc(&info->send_pending); + } + + rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail); + if (rc) { + log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); + if (has_payload) { + if (atomic_dec_and_test(&info->send_payload_pending)) + wake_up(&info->wait_send_payload_pending); + } else { + if (atomic_dec_and_test(&info->send_pending)) + wake_up(&info->wait_send_pending); + } + } else + /* Reset timer for idle connection after packet is sent */ + mod_delayed_work(info->workqueue, &info->idle_timer_work, + info->keep_alive_interval*HZ); + + return rc; +} + +static int smbd_post_send_sgl(struct smbd_connection *info, + struct scatterlist *sgl, int data_length, int remaining_data_length) +{ + int num_sgs; + int i, rc; + struct smbd_request *request; + struct scatterlist *sg; + + rc = smbd_create_header( + info, data_length, remaining_data_length, &request); + if (rc) + return rc; + + num_sgs = sgl ? sg_nents(sgl) : 0; + for_each_sg(sgl, sg, num_sgs, i) { + request->sge[i+1].addr = + ib_dma_map_page(info->id->device, sg_page(sg), + sg->offset, sg->length, DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error( + info->id->device, request->sge[i+1].addr)) { + rc = -EIO; + request->sge[i+1].addr = 0; + goto dma_mapping_failure; + } + request->sge[i+1].length = sg->length; + request->sge[i+1].lkey = info->pd->local_dma_lkey; + request->num_sge++; + } + + rc = smbd_post_send(info, request, data_length); + if (!rc) + return 0; + +dma_mapping_failure: + for (i = 1; i < request->num_sge; i++) + if (request->sge[i].addr) + ib_dma_unmap_single(info->id->device, + request->sge[i].addr, + request->sge[i].length, + DMA_TO_DEVICE); + smbd_destroy_header(info, request); + return rc; +} + +/* + * Send a page + * page: the page to send + * offset: offset in the page to send + * size: length in the page to send + * remaining_data_length: remaining data to send in this payload + */ +static int smbd_post_send_page(struct smbd_connection *info, struct page *page, + unsigned long offset, size_t size, int remaining_data_length) +{ + struct scatterlist sgl; + + sg_init_table(&sgl, 1); + sg_set_page(&sgl, page, size, offset); + + return smbd_post_send_sgl(info, &sgl, size, remaining_data_length); +} + +/* + * Send an empty message + * Empty message is used to extend credits to peer to for keep live + * while there is no upper layer payload to send at the time + */ +static int smbd_post_send_empty(struct smbd_connection *info) +{ + info->count_send_empty++; + return smbd_post_send_sgl(info, NULL, 0, 0); +} + +/* + * Send a data buffer + * iov: the iov array describing the data buffers + * n_vec: number of iov array + * remaining_data_length: remaining data to send following this packet + * in segmented SMBD packet + */ +static int smbd_post_send_data( + struct smbd_connection *info, struct kvec *iov, int n_vec, + int remaining_data_length) +{ + int i; + u32 data_length = 0; + struct scatterlist sgl[SMBDIRECT_MAX_SGE]; + + if (n_vec > SMBDIRECT_MAX_SGE) { + cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); + return -ENOMEM; + } + + sg_init_table(sgl, n_vec); + for (i = 0; i < n_vec; i++) { + data_length += iov[i].iov_len; + sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len); + } + + return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length); +} + +/* + * Post a receive request to the transport + * The remote peer can only send data when a receive request is posted + * The interaction is controlled by send/receive credit system + */ +static int smbd_post_recv( + struct smbd_connection *info, struct smbd_response *response) +{ + struct ib_recv_wr recv_wr, *recv_wr_fail = NULL; + int rc = -EIO; + + response->sge.addr = ib_dma_map_single( + info->id->device, response->packet, + info->max_receive_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(info->id->device, response->sge.addr)) + return rc; + + response->sge.length = info->max_receive_size; + response->sge.lkey = info->pd->local_dma_lkey; + + response->cqe.done = recv_done; + + recv_wr.wr_cqe = &response->cqe; + recv_wr.next = NULL; + recv_wr.sg_list = &response->sge; + recv_wr.num_sge = 1; + + rc = ib_post_recv(info->id->qp, &recv_wr, &recv_wr_fail); + if (rc) { + ib_dma_unmap_single(info->id->device, response->sge.addr, + response->sge.length, DMA_FROM_DEVICE); + + log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); + } + + return rc; +} + +/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */ +static int smbd_negotiate(struct smbd_connection *info) +{ + int rc; + struct smbd_response *response = get_receive_buffer(info); + + response->type = SMBD_NEGOTIATE_RESP; + rc = smbd_post_recv(info, response); + log_rdma_event(INFO, + "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x " + "iov.lkey=%x\n", + rc, response->sge.addr, + response->sge.length, response->sge.lkey); + if (rc) + return rc; + + init_completion(&info->negotiate_completion); + info->negotiate_done = false; + rc = smbd_post_send_negotiate_req(info); + if (rc) + return rc; + + rc = wait_for_completion_interruptible_timeout( + &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ); + log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc); + + if (info->negotiate_done) + return 0; + + if (rc == 0) + rc = -ETIMEDOUT; + else if (rc == -ERESTARTSYS) + rc = -EINTR; + else + rc = -ENOTCONN; + + return rc; +} + +static void put_empty_packet( + struct smbd_connection *info, struct smbd_response *response) +{ + spin_lock(&info->empty_packet_queue_lock); + list_add_tail(&response->list, &info->empty_packet_queue); + info->count_empty_packet_queue++; + spin_unlock(&info->empty_packet_queue_lock); + + queue_work(info->workqueue, &info->post_send_credits_work); +} + +/* + * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1 + * This is a queue for reassembling upper layer payload and present to upper + * layer. All the inncoming payload go to the reassembly queue, regardless of + * if reassembly is required. The uuper layer code reads from the queue for all + * incoming payloads. + * Put a received packet to the reassembly queue + * response: the packet received + * data_length: the size of payload in this packet + */ +static void enqueue_reassembly( + struct smbd_connection *info, + struct smbd_response *response, + int data_length) +{ + spin_lock(&info->reassembly_queue_lock); + list_add_tail(&response->list, &info->reassembly_queue); + info->reassembly_queue_length++; + /* + * Make sure reassembly_data_length is updated after list and + * reassembly_queue_length are updated. On the dequeue side + * reassembly_data_length is checked without a lock to determine + * if reassembly_queue_length and list is up to date + */ + virt_wmb(); + info->reassembly_data_length += data_length; + spin_unlock(&info->reassembly_queue_lock); + info->count_reassembly_queue++; + info->count_enqueue_reassembly_queue++; +} + +/* + * Get the first entry at the front of reassembly queue + * Caller is responsible for locking + * return value: the first entry if any, NULL if queue is empty + */ +static struct smbd_response *_get_first_reassembly(struct smbd_connection *info) +{ + struct smbd_response *ret = NULL; + + if (!list_empty(&info->reassembly_queue)) { + ret = list_first_entry( + &info->reassembly_queue, + struct smbd_response, list); + } + return ret; +} + +static struct smbd_response *get_empty_queue_buffer( + struct smbd_connection *info) +{ + struct smbd_response *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&info->empty_packet_queue_lock, flags); + if (!list_empty(&info->empty_packet_queue)) { + ret = list_first_entry( + &info->empty_packet_queue, + struct smbd_response, list); + list_del(&ret->list); + info->count_empty_packet_queue--; + } + spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags); + + return ret; +} + +/* + * Get a receive buffer + * For each remote send, we need to post a receive. The receive buffers are + * pre-allocated in advance. + * return value: the receive buffer, NULL if none is available + */ +static struct smbd_response *get_receive_buffer(struct smbd_connection *info) +{ + struct smbd_response *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&info->receive_queue_lock, flags); + if (!list_empty(&info->receive_queue)) { + ret = list_first_entry( + &info->receive_queue, + struct smbd_response, list); + list_del(&ret->list); + info->count_receive_queue--; + info->count_get_receive_buffer++; + } + spin_unlock_irqrestore(&info->receive_queue_lock, flags); + + return ret; +} + +/* + * Return a receive buffer + * Upon returning of a receive buffer, we can post new receive and extend + * more receive credits to remote peer. This is done immediately after a + * receive buffer is returned. + */ +static void put_receive_buffer( + struct smbd_connection *info, struct smbd_response *response) +{ + unsigned long flags; + + ib_dma_unmap_single(info->id->device, response->sge.addr, + response->sge.length, DMA_FROM_DEVICE); + + spin_lock_irqsave(&info->receive_queue_lock, flags); + list_add_tail(&response->list, &info->receive_queue); + info->count_receive_queue++; + info->count_put_receive_buffer++; + spin_unlock_irqrestore(&info->receive_queue_lock, flags); + + queue_work(info->workqueue, &info->post_send_credits_work); +} + +/* Preallocate all receive buffer on transport establishment */ +static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) +{ + int i; + struct smbd_response *response; + + INIT_LIST_HEAD(&info->reassembly_queue); + spin_lock_init(&info->reassembly_queue_lock); + info->reassembly_data_length = 0; + info->reassembly_queue_length = 0; + + INIT_LIST_HEAD(&info->receive_queue); + spin_lock_init(&info->receive_queue_lock); + info->count_receive_queue = 0; + + INIT_LIST_HEAD(&info->empty_packet_queue); + spin_lock_init(&info->empty_packet_queue_lock); + info->count_empty_packet_queue = 0; + + init_waitqueue_head(&info->wait_receive_queues); + + for (i = 0; i < num_buf; i++) { + response = mempool_alloc(info->response_mempool, GFP_KERNEL); + if (!response) + goto allocate_failed; + + response->info = info; + list_add_tail(&response->list, &info->receive_queue); + info->count_receive_queue++; + } + + return 0; + +allocate_failed: + while (!list_empty(&info->receive_queue)) { + response = list_first_entry( + &info->receive_queue, + struct smbd_response, list); + list_del(&response->list); + info->count_receive_queue--; + + mempool_free(response, info->response_mempool); + } + return -ENOMEM; +} + +static void destroy_receive_buffers(struct smbd_connection *info) +{ + struct smbd_response *response; + + while ((response = get_receive_buffer(info))) + mempool_free(response, info->response_mempool); + + while ((response = get_empty_queue_buffer(info))) + mempool_free(response, info->response_mempool); +} + +/* + * Check and send an immediate or keep alive packet + * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1 + * Connection.KeepaliveRequested and Connection.SendImmediate + * The idea is to extend credits to server as soon as it becomes available + */ +static void send_immediate_work(struct work_struct *work) +{ + struct smbd_connection *info = container_of( + work, struct smbd_connection, + send_immediate_work.work); + + if (info->keep_alive_requested == KEEP_ALIVE_PENDING || + info->send_immediate) { + log_keep_alive(INFO, "send an empty message\n"); + smbd_post_send_empty(info); + } +} + +/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ +static void idle_connection_timer(struct work_struct *work) +{ + struct smbd_connection *info = container_of( + work, struct smbd_connection, + idle_timer_work.work); + + if (info->keep_alive_requested != KEEP_ALIVE_NONE) { + log_keep_alive(ERR, + "error status info->keep_alive_requested=%d\n", + info->keep_alive_requested); + smbd_disconnect_rdma_connection(info); + return; + } + + log_keep_alive(INFO, "about to send an empty idle message\n"); + smbd_post_send_empty(info); + + /* Setup the next idle timeout work */ + queue_delayed_work(info->workqueue, &info->idle_timer_work, + info->keep_alive_interval*HZ); +} + +/* Destroy this SMBD connection, called from upper layer */ +void smbd_destroy(struct smbd_connection *info) +{ + log_rdma_event(INFO, "destroying rdma session\n"); + + /* Kick off the disconnection process */ + smbd_disconnect_rdma_connection(info); + + log_rdma_event(INFO, "wait for transport being destroyed\n"); + wait_event(info->wait_destroy, + info->transport_status == SMBD_DESTROYED); + + destroy_workqueue(info->workqueue); + kfree(info); +} + +/* + * Reconnect this SMBD connection, called from upper layer + * return value: 0 on success, or actual error code + */ +int smbd_reconnect(struct TCP_Server_Info *server) +{ + log_rdma_event(INFO, "reconnecting rdma session\n"); + + if (!server->smbd_conn) { + log_rdma_event(ERR, "rdma session already destroyed\n"); + return -EINVAL; + } + + /* + * This is possible if transport is disconnected and we haven't received + * notification from RDMA, but upper layer has detected timeout + */ + if (server->smbd_conn->transport_status == SMBD_CONNECTED) { + log_rdma_event(INFO, "disconnecting transport\n"); + smbd_disconnect_rdma_connection(server->smbd_conn); + } + + /* wait until the transport is destroyed */ + wait_event(server->smbd_conn->wait_destroy, + server->smbd_conn->transport_status == SMBD_DESTROYED); + + destroy_workqueue(server->smbd_conn->workqueue); + kfree(server->smbd_conn); + + log_rdma_event(INFO, "creating rdma session\n"); + server->smbd_conn = smbd_get_connection( + server, (struct sockaddr *) &server->dstaddr); + + return server->smbd_conn ? 0 : -ENOENT; +} + +static void destroy_caches_and_workqueue(struct smbd_connection *info) +{ + destroy_receive_buffers(info); + destroy_workqueue(info->workqueue); + mempool_destroy(info->response_mempool); + kmem_cache_destroy(info->response_cache); + mempool_destroy(info->request_mempool); + kmem_cache_destroy(info->request_cache); +} + +#define MAX_NAME_LEN 80 +static int allocate_caches_and_workqueue(struct smbd_connection *info) +{ + char name[MAX_NAME_LEN]; + int rc; + + snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); + info->request_cache = + kmem_cache_create( + name, + sizeof(struct smbd_request) + + sizeof(struct smbd_data_transfer), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!info->request_cache) + return -ENOMEM; + + info->request_mempool = + mempool_create(info->send_credit_target, mempool_alloc_slab, + mempool_free_slab, info->request_cache); + if (!info->request_mempool) + goto out1; + + snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + info->response_cache = + kmem_cache_create( + name, + sizeof(struct smbd_response) + + info->max_receive_size, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!info->response_cache) + goto out2; + + info->response_mempool = + mempool_create(info->receive_credit_max, mempool_alloc_slab, + mempool_free_slab, info->response_cache); + if (!info->response_mempool) + goto out3; + + snprintf(name, MAX_NAME_LEN, "smbd_%p", info); + info->workqueue = create_workqueue(name); + if (!info->workqueue) + goto out4; + + rc = allocate_receive_buffers(info, info->receive_credit_max); + if (rc) { + log_rdma_event(ERR, "failed to allocate receive buffers\n"); + goto out5; + } + + return 0; + +out5: + destroy_workqueue(info->workqueue); +out4: + mempool_destroy(info->response_mempool); +out3: + kmem_cache_destroy(info->response_cache); +out2: + mempool_destroy(info->request_mempool); +out1: + kmem_cache_destroy(info->request_cache); + return -ENOMEM; +} + +/* Create a SMBD connection, called by upper layer */ +static struct smbd_connection *_smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port) +{ + int rc; + struct smbd_connection *info; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; + struct ib_port_immutable port_immutable; + u32 ird_ord_hdr[2]; + + info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); + if (!info) + return NULL; + + info->transport_status = SMBD_CONNECTING; + rc = smbd_ia_open(info, dstaddr, port); + if (rc) { + log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); + goto create_id_failed; + } + + if (smbd_send_credit_target > info->id->device->attrs.max_cqe || + smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { + log_rdma_event(ERR, + "consider lowering send_credit_target = %d. " + "Possible CQE overrun, device " + "reporting max_cpe %d max_qp_wr %d\n", + smbd_send_credit_target, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); + goto config_failed; + } + + if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || + smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { + log_rdma_event(ERR, + "consider lowering receive_credit_max = %d. " + "Possible CQE overrun, device " + "reporting max_cpe %d max_qp_wr %d\n", + smbd_receive_credit_max, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); + goto config_failed; + } + + info->receive_credit_max = smbd_receive_credit_max; + info->send_credit_target = smbd_send_credit_target; + info->max_send_size = smbd_max_send_size; + info->max_fragmented_recv_size = smbd_max_fragmented_recv_size; + info->max_receive_size = smbd_max_receive_size; + info->keep_alive_interval = smbd_keep_alive_interval; + + if (info->id->device->attrs.max_sge < SMBDIRECT_MAX_SGE) { + log_rdma_event(ERR, "warning: device max_sge = %d too small\n", + info->id->device->attrs.max_sge); + log_rdma_event(ERR, "Queue Pair creation may fail\n"); + } + + info->send_cq = NULL; + info->recv_cq = NULL; + info->send_cq = ib_alloc_cq(info->id->device, info, + info->send_credit_target, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(info->send_cq)) { + info->send_cq = NULL; + goto alloc_cq_failed; + } + + info->recv_cq = ib_alloc_cq(info->id->device, info, + info->receive_credit_max, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(info->recv_cq)) { + info->recv_cq = NULL; + goto alloc_cq_failed; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.event_handler = smbd_qp_async_error_upcall; + qp_attr.qp_context = info; + qp_attr.cap.max_send_wr = info->send_credit_target; + qp_attr.cap.max_recv_wr = info->receive_credit_max; + qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_inline_data = 0; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = info->send_cq; + qp_attr.recv_cq = info->recv_cq; + qp_attr.port_num = ~0; + + rc = rdma_create_qp(info->id, info->pd, &qp_attr); + if (rc) { + log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc); + goto create_qp_failed; + } + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = 0; + + conn_param.responder_resources = + info->id->device->attrs.max_qp_rd_atom + < SMBD_CM_RESPONDER_RESOURCES ? + info->id->device->attrs.max_qp_rd_atom : + SMBD_CM_RESPONDER_RESOURCES; + info->responder_resources = conn_param.responder_resources; + log_rdma_mr(INFO, "responder_resources=%d\n", + info->responder_resources); + + /* Need to send IRD/ORD in private data for iWARP */ + info->id->device->get_port_immutable( + info->id->device, info->id->port_num, &port_immutable); + if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { + ird_ord_hdr[0] = info->responder_resources; + ird_ord_hdr[1] = 1; + conn_param.private_data = ird_ord_hdr; + conn_param.private_data_len = sizeof(ird_ord_hdr); + } else { + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + } + + conn_param.retry_count = SMBD_CM_RETRY; + conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY; + conn_param.flow_control = 0; + init_waitqueue_head(&info->wait_destroy); + + log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", + &addr_in->sin_addr, port); + + init_waitqueue_head(&info->conn_wait); + rc = rdma_connect(info->id, &conn_param); + if (rc) { + log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); + goto rdma_connect_failed; + } + + wait_event_interruptible( + info->conn_wait, info->transport_status != SMBD_CONNECTING); + + if (info->transport_status != SMBD_CONNECTED) { + log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); + goto rdma_connect_failed; + } + + log_rdma_event(INFO, "rdma_connect connected\n"); + + rc = allocate_caches_and_workqueue(info); + if (rc) { + log_rdma_event(ERR, "cache allocation failed\n"); + goto allocate_cache_failed; + } + + init_waitqueue_head(&info->wait_send_queue); + init_waitqueue_head(&info->wait_reassembly_queue); + + INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); + INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work); + queue_delayed_work(info->workqueue, &info->idle_timer_work, + info->keep_alive_interval*HZ); + + init_waitqueue_head(&info->wait_smbd_send_pending); + info->smbd_send_pending = 0; + + init_waitqueue_head(&info->wait_smbd_recv_pending); + info->smbd_recv_pending = 0; + + init_waitqueue_head(&info->wait_send_pending); + atomic_set(&info->send_pending, 0); + + init_waitqueue_head(&info->wait_send_payload_pending); + atomic_set(&info->send_payload_pending, 0); + + INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); + INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work); + INIT_WORK(&info->recv_done_work, smbd_recv_done_work); + INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); + info->new_credits_offered = 0; + spin_lock_init(&info->lock_new_credits_offered); + + rc = smbd_negotiate(info); + if (rc) { + log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc); + goto negotiation_failed; + } + + rc = allocate_mr_list(info); + if (rc) { + log_rdma_mr(ERR, "memory registration allocation failed\n"); + goto allocate_mr_failed; + } + + return info; + +allocate_mr_failed: + /* At this point, need to a full transport shutdown */ + smbd_destroy(info); + return NULL; + +negotiation_failed: + cancel_delayed_work_sync(&info->idle_timer_work); + destroy_caches_and_workqueue(info); + info->transport_status = SMBD_NEGOTIATE_FAILED; + init_waitqueue_head(&info->conn_wait); + rdma_disconnect(info->id); + wait_event(info->conn_wait, + info->transport_status == SMBD_DISCONNECTED); + +allocate_cache_failed: +rdma_connect_failed: + rdma_destroy_qp(info->id); + +create_qp_failed: +alloc_cq_failed: + if (info->send_cq) + ib_free_cq(info->send_cq); + if (info->recv_cq) + ib_free_cq(info->recv_cq); + +config_failed: + ib_dealloc_pd(info->pd); + rdma_destroy_id(info->id); + +create_id_failed: + kfree(info); + return NULL; +} + +struct smbd_connection *smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr) +{ + struct smbd_connection *ret; + int port = SMBD_PORT; + +try_again: + ret = _smbd_get_connection(server, dstaddr, port); + + /* Try SMB_PORT if SMBD_PORT doesn't work */ + if (!ret && port == SMBD_PORT) { + port = SMB_PORT; + goto try_again; + } + return ret; +} + +/* + * Receive data from receive reassembly queue + * All the incoming data packets are placed in reassembly queue + * buf: the buffer to read data into + * size: the length of data to read + * return value: actual data read + * Note: this implementation copies the data from reassebmly queue to receive + * buffers used by upper layer. This is not the optimal code path. A better way + * to do it is to not have upper layer allocate its receive buffers but rather + * borrow the buffer from reassembly queue, and return it after data is + * consumed. But this will require more changes to upper layer code, and also + * need to consider packet boundaries while they still being reassembled. + */ +static int smbd_recv_buf(struct smbd_connection *info, char *buf, + unsigned int size) +{ + struct smbd_response *response; + struct smbd_data_transfer *data_transfer; + int to_copy, to_read, data_read, offset; + u32 data_length, remaining_data_length, data_offset; + int rc; + +again: + if (info->transport_status != SMBD_CONNECTED) { + log_read(ERR, "disconnected\n"); + return -ENODEV; + } + + /* + * No need to hold the reassembly queue lock all the time as we are + * the only one reading from the front of the queue. The transport + * may add more entries to the back of the queue at the same time + */ + log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size, + info->reassembly_data_length); + if (info->reassembly_data_length >= size) { + int queue_length; + int queue_removed = 0; + + /* + * Need to make sure reassembly_data_length is read before + * reading reassembly_queue_length and calling + * _get_first_reassembly. This call is lock free + * as we never read at the end of the queue which are being + * updated in SOFTIRQ as more data is received + */ + virt_rmb(); + queue_length = info->reassembly_queue_length; + data_read = 0; + to_read = size; + offset = info->first_entry_offset; + while (data_read < size) { + response = _get_first_reassembly(info); + data_transfer = smbd_response_payload(response); + data_length = le32_to_cpu(data_transfer->data_length); + remaining_data_length = + le32_to_cpu( + data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); + + /* + * The upper layer expects RFC1002 length at the + * beginning of the payload. Return it to indicate + * the total length of the packet. This minimize the + * change to upper layer packet processing logic. This + * will be eventually remove when an intermediate + * transport layer is added + */ + if (response->first_segment && size == 4) { + unsigned int rfc1002_len = + data_length + remaining_data_length; + *((__be32 *)buf) = cpu_to_be32(rfc1002_len); + data_read = 4; + response->first_segment = false; + log_read(INFO, "returning rfc1002 length %d\n", + rfc1002_len); + goto read_rfc1002_done; + } + + to_copy = min_t(int, data_length - offset, to_read); + memcpy( + buf + data_read, + (char *)data_transfer + data_offset + offset, + to_copy); + + /* move on to the next buffer? */ + if (to_copy == data_length - offset) { + queue_length--; + /* + * No need to lock if we are not at the + * end of the queue + */ + if (!queue_length) + spin_lock_irq( + &info->reassembly_queue_lock); + list_del(&response->list); + queue_removed++; + if (!queue_length) + spin_unlock_irq( + &info->reassembly_queue_lock); + + info->count_reassembly_queue--; + info->count_dequeue_reassembly_queue++; + put_receive_buffer(info, response); + offset = 0; + log_read(INFO, "put_receive_buffer offset=0\n"); + } else + offset += to_copy; + + to_read -= to_copy; + data_read += to_copy; + + log_read(INFO, "_get_first_reassembly memcpy %d bytes " + "data_transfer_length-offset=%d after that " + "to_read=%d data_read=%d offset=%d\n", + to_copy, data_length - offset, + to_read, data_read, offset); + } + + spin_lock_irq(&info->reassembly_queue_lock); + info->reassembly_data_length -= data_read; + info->reassembly_queue_length -= queue_removed; + spin_unlock_irq(&info->reassembly_queue_lock); + + info->first_entry_offset = offset; + log_read(INFO, "returning to thread data_read=%d " + "reassembly_data_length=%d first_entry_offset=%d\n", + data_read, info->reassembly_data_length, + info->first_entry_offset); +read_rfc1002_done: + return data_read; + } + + log_read(INFO, "wait_event on more data\n"); + rc = wait_event_interruptible( + info->wait_reassembly_queue, + info->reassembly_data_length >= size || + info->transport_status != SMBD_CONNECTED); + /* Don't return any data if interrupted */ + if (rc) + return -ENODEV; + + goto again; +} + +/* + * Receive a page from receive reassembly queue + * page: the page to read data into + * to_read: the length of data to read + * return value: actual data read + */ +static int smbd_recv_page(struct smbd_connection *info, + struct page *page, unsigned int to_read) +{ + int ret; + char *to_address; + + /* make sure we have the page ready for read */ + ret = wait_event_interruptible( + info->wait_reassembly_queue, + info->reassembly_data_length >= to_read || + info->transport_status != SMBD_CONNECTED); + if (ret) + return 0; + + /* now we can read from reassembly queue and not sleep */ + to_address = kmap_atomic(page); + + log_read(INFO, "reading from page=%p address=%p to_read=%d\n", + page, to_address, to_read); + + ret = smbd_recv_buf(info, to_address, to_read); + kunmap_atomic(to_address); + + return ret; +} + +/* + * Receive data from transport + * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC + * return: total bytes read, or 0. SMB Direct will not do partial read. + */ +int smbd_recv(struct smbd_connection *info, struct msghdr *msg) +{ + char *buf; + struct page *page; + unsigned int to_read; + int rc; + + info->smbd_recv_pending++; + + switch (msg->msg_iter.type) { + case READ | ITER_KVEC: + buf = msg->msg_iter.kvec->iov_base; + to_read = msg->msg_iter.kvec->iov_len; + rc = smbd_recv_buf(info, buf, to_read); + break; + + case READ | ITER_BVEC: + page = msg->msg_iter.bvec->bv_page; + to_read = msg->msg_iter.bvec->bv_len; + rc = smbd_recv_page(info, page, to_read); + break; + + default: + /* It's a bug in upper layer to get there */ + cifs_dbg(VFS, "CIFS: invalid msg type %d\n", + msg->msg_iter.type); + rc = -EIO; + } + + info->smbd_recv_pending--; + wake_up(&info->wait_smbd_recv_pending); + + /* SMBDirect will read it all or nothing */ + if (rc > 0) + msg->msg_iter.count = 0; + return rc; +} + +/* + * Send data to transport + * Each rqst is transported as a SMBDirect payload + * rqst: the data to write + * return value: 0 if successfully write, otherwise error code + */ +int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) +{ + struct kvec vec; + int nvecs; + int size; + int buflen = 0, remaining_data_length; + int start, i, j; + int max_iov_size = + info->max_send_size - sizeof(struct smbd_data_transfer); + struct kvec iov[SMBDIRECT_MAX_SGE]; + int rc; + + info->smbd_send_pending++; + if (info->transport_status != SMBD_CONNECTED) { + rc = -ENODEV; + goto done; + } + + /* + * This usually means a configuration error + * We use RDMA read/write for packet size > rdma_readwrite_threshold + * as long as it's properly configured we should never get into this + * situation + */ + if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) { + log_write(ERR, "maximum send segment %x exceeding %x\n", + rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE); + rc = -EINVAL; + goto done; + } + + /* + * Remove the RFC1002 length defined in MS-SMB2 section 2.1 + * It is used only for TCP transport + * In future we may want to add a transport layer under protocol + * layer so this will only be issued to TCP transport + */ + iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4; + iov[0].iov_len = rqst->rq_iov[0].iov_len - 4; + buflen += iov[0].iov_len; + + /* total up iov array first */ + for (i = 1; i < rqst->rq_nvec; i++) { + iov[i].iov_base = rqst->rq_iov[i].iov_base; + iov[i].iov_len = rqst->rq_iov[i].iov_len; + buflen += iov[i].iov_len; + } + + /* add in the page array if there is one */ + if (rqst->rq_npages) { + buflen += rqst->rq_pagesz * (rqst->rq_npages - 1); + buflen += rqst->rq_tailsz; + } + + if (buflen + sizeof(struct smbd_data_transfer) > + info->max_fragmented_send_size) { + log_write(ERR, "payload size %d > max size %d\n", + buflen, info->max_fragmented_send_size); + rc = -EINVAL; + goto done; + } + + remaining_data_length = buflen; + + log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d " + "rq_tailsz=%d buflen=%d\n", + rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, buflen); + + start = i = iov[0].iov_len ? 0 : 1; + buflen = 0; + while (true) { + buflen += iov[i].iov_len; + if (buflen > max_iov_size) { + if (i > start) { + remaining_data_length -= + (buflen-iov[i].iov_len); + log_write(INFO, "sending iov[] from start=%d " + "i=%d nvecs=%d " + "remaining_data_length=%d\n", + start, i, i-start, + remaining_data_length); + rc = smbd_post_send_data( + info, &iov[start], i-start, + remaining_data_length); + if (rc) + goto done; + } else { + /* iov[start] is too big, break it */ + nvecs = (buflen+max_iov_size-1)/max_iov_size; + log_write(INFO, "iov[%d] iov_base=%p buflen=%d" + " break to %d vectors\n", + start, iov[start].iov_base, + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + vec.iov_base = + (char *)iov[start].iov_base + + j*max_iov_size; + vec.iov_len = max_iov_size; + if (j == nvecs-1) + vec.iov_len = + buflen - + max_iov_size*(nvecs-1); + remaining_data_length -= vec.iov_len; + log_write(INFO, + "sending vec j=%d iov_base=%p" + " iov_len=%zu " + "remaining_data_length=%d\n", + j, vec.iov_base, vec.iov_len, + remaining_data_length); + rc = smbd_post_send_data( + info, &vec, 1, + remaining_data_length); + if (rc) + goto done; + } + i++; + } + start = i; + buflen = 0; + } else { + i++; + if (i == rqst->rq_nvec) { + /* send out all remaining vecs */ + remaining_data_length -= buflen; + log_write(INFO, + "sending iov[] from start=%d i=%d " + "nvecs=%d remaining_data_length=%d\n", + start, i, i-start, + remaining_data_length); + rc = smbd_post_send_data(info, &iov[start], + i-start, remaining_data_length); + if (rc) + goto done; + break; + } + } + log_write(INFO, "looping i=%d buflen=%d\n", i, buflen); + } + + /* now sending pages if there are any */ + for (i = 0; i < rqst->rq_npages; i++) { + buflen = (i == rqst->rq_npages-1) ? + rqst->rq_tailsz : rqst->rq_pagesz; + nvecs = (buflen + max_iov_size - 1) / max_iov_size; + log_write(INFO, "sending pages buflen=%d nvecs=%d\n", + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + size = max_iov_size; + if (j == nvecs-1) + size = buflen - j*max_iov_size; + remaining_data_length -= size; + log_write(INFO, "sending pages i=%d offset=%d size=%d" + " remaining_data_length=%d\n", + i, j*max_iov_size, size, remaining_data_length); + rc = smbd_post_send_page( + info, rqst->rq_pages[i], j*max_iov_size, + size, remaining_data_length); + if (rc) + goto done; + } + } + +done: + /* + * As an optimization, we don't wait for individual I/O to finish + * before sending the next one. + * Send them all and wait for pending send count to get to 0 + * that means all the I/Os have been out and we are good to return + */ + + wait_event(info->wait_send_payload_pending, + atomic_read(&info->send_payload_pending) == 0); + + info->smbd_send_pending--; + wake_up(&info->wait_smbd_send_pending); + + return rc; +} + +static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbd_mr *mr; + struct ib_cqe *cqe; + + if (wc->status) { + log_rdma_mr(ERR, "status=%d\n", wc->status); + cqe = wc->wr_cqe; + mr = container_of(cqe, struct smbd_mr, cqe); + smbd_disconnect_rdma_connection(mr->conn); + } +} + +/* + * The work queue function that recovers MRs + * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used + * again. Both calls are slow, so finish them in a workqueue. This will not + * block I/O path. + * There is one workqueue that recovers MRs, there is no need to lock as the + * I/O requests calling smbd_register_mr will never update the links in the + * mr_list. + */ +static void smbd_mr_recovery_work(struct work_struct *work) +{ + struct smbd_connection *info = + container_of(work, struct smbd_connection, mr_recovery_work); + struct smbd_mr *smbdirect_mr; + int rc; + + list_for_each_entry(smbdirect_mr, &info->mr_list, list) { + if (smbdirect_mr->state == MR_INVALIDATED || + smbdirect_mr->state == MR_ERROR) { + + if (smbdirect_mr->state == MR_INVALIDATED) { + ib_dma_unmap_sg( + info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, + smbdirect_mr->dir); + smbdirect_mr->state = MR_READY; + } else if (smbdirect_mr->state == MR_ERROR) { + + /* recover this MR entry */ + rc = ib_dereg_mr(smbdirect_mr->mr); + if (rc) { + log_rdma_mr(ERR, + "ib_dereg_mr faield rc=%x\n", + rc); + smbd_disconnect_rdma_connection(info); + } + + smbdirect_mr->mr = ib_alloc_mr( + info->pd, info->mr_type, + info->max_frmr_depth); + if (IS_ERR(smbdirect_mr->mr)) { + log_rdma_mr(ERR, + "ib_alloc_mr failed mr_type=%x " + "max_frmr_depth=%x\n", + info->mr_type, + info->max_frmr_depth); + smbd_disconnect_rdma_connection(info); + } + + smbdirect_mr->state = MR_READY; + } + /* smbdirect_mr->state is updated by this function + * and is read and updated by I/O issuing CPUs trying + * to get a MR, the call to atomic_inc_return + * implicates a memory barrier and guarantees this + * value is updated before waking up any calls to + * get_mr() from the I/O issuing CPUs + */ + if (atomic_inc_return(&info->mr_ready_count) == 1) + wake_up_interruptible(&info->wait_mr); + } + } +} + +static void destroy_mr_list(struct smbd_connection *info) +{ + struct smbd_mr *mr, *tmp; + + cancel_work_sync(&info->mr_recovery_work); + list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { + if (mr->state == MR_INVALIDATED) + ib_dma_unmap_sg(info->id->device, mr->sgl, + mr->sgl_count, mr->dir); + ib_dereg_mr(mr->mr); + kfree(mr->sgl); + kfree(mr); + } +} + +/* + * Allocate MRs used for RDMA read/write + * The number of MRs will not exceed hardware capability in responder_resources + * All MRs are kept in mr_list. The MR can be recovered after it's used + * Recovery is done in smbd_mr_recovery_work. The content of list entry changes + * as MRs are used and recovered for I/O, but the list links will not change + */ +static int allocate_mr_list(struct smbd_connection *info) +{ + int i; + struct smbd_mr *smbdirect_mr, *tmp; + + INIT_LIST_HEAD(&info->mr_list); + init_waitqueue_head(&info->wait_mr); + spin_lock_init(&info->mr_list_lock); + atomic_set(&info->mr_ready_count, 0); + atomic_set(&info->mr_used_count, 0); + init_waitqueue_head(&info->wait_for_mr_cleanup); + /* Allocate more MRs (2x) than hardware responder_resources */ + for (i = 0; i < info->responder_resources * 2; i++) { + smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); + if (!smbdirect_mr) + goto out; + smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, + info->max_frmr_depth); + if (IS_ERR(smbdirect_mr->mr)) { + log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x " + "max_frmr_depth=%x\n", + info->mr_type, info->max_frmr_depth); + goto out; + } + smbdirect_mr->sgl = kcalloc( + info->max_frmr_depth, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!smbdirect_mr->sgl) { + log_rdma_mr(ERR, "failed to allocate sgl\n"); + ib_dereg_mr(smbdirect_mr->mr); + goto out; + } + smbdirect_mr->state = MR_READY; + smbdirect_mr->conn = info; + + list_add_tail(&smbdirect_mr->list, &info->mr_list); + atomic_inc(&info->mr_ready_count); + } + INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); + return 0; + +out: + kfree(smbdirect_mr); + + list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + ib_dereg_mr(smbdirect_mr->mr); + kfree(smbdirect_mr->sgl); + kfree(smbdirect_mr); + } + return -ENOMEM; +} + +/* + * Get a MR from mr_list. This function waits until there is at least one + * MR available in the list. It may access the list while the + * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock + * as they never modify the same places. However, there may be several CPUs + * issueing I/O trying to get MR at the same time, mr_list_lock is used to + * protect this situation. + */ +static struct smbd_mr *get_mr(struct smbd_connection *info) +{ + struct smbd_mr *ret; + int rc; +again: + rc = wait_event_interruptible(info->wait_mr, + atomic_read(&info->mr_ready_count) || + info->transport_status != SMBD_CONNECTED); + if (rc) { + log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); + return NULL; + } + + if (info->transport_status != SMBD_CONNECTED) { + log_rdma_mr(ERR, "info->transport_status=%x\n", + info->transport_status); + return NULL; + } + + spin_lock(&info->mr_list_lock); + list_for_each_entry(ret, &info->mr_list, list) { + if (ret->state == MR_READY) { + ret->state = MR_REGISTERED; + spin_unlock(&info->mr_list_lock); + atomic_dec(&info->mr_ready_count); + atomic_inc(&info->mr_used_count); + return ret; + } + } + + spin_unlock(&info->mr_list_lock); + /* + * It is possible that we could fail to get MR because other processes may + * try to acquire a MR at the same time. If this is the case, retry it. + */ + goto again; +} + +/* + * Register memory for RDMA read/write + * pages[]: the list of pages to register memory with + * num_pages: the number of pages to register + * tailsz: if non-zero, the bytes to register in the last page + * writing: true if this is a RDMA write (SMB read), false for RDMA read + * need_invalidate: true if this MR needs to be locally invalidated after I/O + * return value: the MR registered, NULL if failed. + */ +struct smbd_mr *smbd_register_mr( + struct smbd_connection *info, struct page *pages[], int num_pages, + int tailsz, bool writing, bool need_invalidate) +{ + struct smbd_mr *smbdirect_mr; + int rc, i; + enum dma_data_direction dir; + struct ib_reg_wr *reg_wr; + struct ib_send_wr *bad_wr; + + if (num_pages > info->max_frmr_depth) { + log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", + num_pages, info->max_frmr_depth); + return NULL; + } + + smbdirect_mr = get_mr(info); + if (!smbdirect_mr) { + log_rdma_mr(ERR, "get_mr returning NULL\n"); + return NULL; + } + smbdirect_mr->need_invalidate = need_invalidate; + smbdirect_mr->sgl_count = num_pages; + sg_init_table(smbdirect_mr->sgl, num_pages); + + for (i = 0; i < num_pages - 1; i++) + sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); + + sg_set_page(&smbdirect_mr->sgl[i], pages[i], + tailsz ? tailsz : PAGE_SIZE, 0); + + dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + smbdirect_mr->dir = dir; + rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir); + if (!rc) { + log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", + num_pages, dir, rc); + goto dma_map_error; + } + + rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages, + NULL, PAGE_SIZE); + if (rc != num_pages) { + log_rdma_mr(INFO, + "ib_map_mr_sg failed rc = %x num_pages = %x\n", + rc, num_pages); + goto map_mr_error; + } + + ib_update_fast_reg_key(smbdirect_mr->mr, + ib_inc_rkey(smbdirect_mr->mr->rkey)); + reg_wr = &smbdirect_mr->wr; + reg_wr->wr.opcode = IB_WR_REG_MR; + smbdirect_mr->cqe.done = register_mr_done; + reg_wr->wr.wr_cqe = &smbdirect_mr->cqe; + reg_wr->wr.num_sge = 0; + reg_wr->wr.send_flags = IB_SEND_SIGNALED; + reg_wr->mr = smbdirect_mr->mr; + reg_wr->key = smbdirect_mr->mr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + + /* + * There is no need for waiting for complemtion on ib_post_send + * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution + * on the next ib_post_send when we actaully send I/O to remote peer + */ + rc = ib_post_send(info->id->qp, ®_wr->wr, &bad_wr); + if (!rc) + return smbdirect_mr; + + log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", + rc, reg_wr->key); + + /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ +map_mr_error: + ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, smbdirect_mr->dir); + +dma_map_error: + smbdirect_mr->state = MR_ERROR; + if (atomic_dec_and_test(&info->mr_used_count)) + wake_up(&info->wait_for_mr_cleanup); + + return NULL; +} + +static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbd_mr *smbdirect_mr; + struct ib_cqe *cqe; + + cqe = wc->wr_cqe; + smbdirect_mr = container_of(cqe, struct smbd_mr, cqe); + smbdirect_mr->state = MR_INVALIDATED; + if (wc->status != IB_WC_SUCCESS) { + log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status); + smbdirect_mr->state = MR_ERROR; + } + complete(&smbdirect_mr->invalidate_done); +} + +/* + * Deregister a MR after I/O is done + * This function may wait if remote invalidation is not used + * and we have to locally invalidate the buffer to prevent data is being + * modified by remote peer after upper layer consumes it + */ +int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) +{ + struct ib_send_wr *wr, *bad_wr; + struct smbd_connection *info = smbdirect_mr->conn; + int rc = 0; + + if (smbdirect_mr->need_invalidate) { + /* Need to finish local invalidation before returning */ + wr = &smbdirect_mr->inv_wr; + wr->opcode = IB_WR_LOCAL_INV; + smbdirect_mr->cqe.done = local_inv_done; + wr->wr_cqe = &smbdirect_mr->cqe; + wr->num_sge = 0; + wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey; + wr->send_flags = IB_SEND_SIGNALED; + + init_completion(&smbdirect_mr->invalidate_done); + rc = ib_post_send(info->id->qp, wr, &bad_wr); + if (rc) { + log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); + smbd_disconnect_rdma_connection(info); + goto done; + } + wait_for_completion(&smbdirect_mr->invalidate_done); + smbdirect_mr->need_invalidate = false; + } else + /* + * For remote invalidation, just set it to MR_INVALIDATED + * and defer to mr_recovery_work to recover the MR for next use + */ + smbdirect_mr->state = MR_INVALIDATED; + + /* + * Schedule the work to do MR recovery for future I/Os + * MR recovery is slow and we don't want it to block the current I/O + */ + queue_work(info->workqueue, &info->mr_recovery_work); + +done: + if (atomic_dec_and_test(&info->mr_used_count)) + wake_up(&info->wait_for_mr_cleanup); + + return rc; +} diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h new file mode 100644 index 000000000000..f9038daea194 --- /dev/null +++ b/fs/cifs/smbdirect.h @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2017, Microsoft Corporation. + * + * Author(s): Long Li <longli@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ +#ifndef _SMBDIRECT_H +#define _SMBDIRECT_H + +#ifdef CONFIG_CIFS_SMB_DIRECT +#define cifs_rdma_enabled(server) ((server)->rdma) + +#include "cifsglob.h" +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <linux/mempool.h> + +extern int rdma_readwrite_threshold; +extern int smbd_max_frmr_depth; +extern int smbd_keep_alive_interval; +extern int smbd_max_receive_size; +extern int smbd_max_fragmented_recv_size; +extern int smbd_max_send_size; +extern int smbd_send_credit_target; +extern int smbd_receive_credit_max; + +enum keep_alive_status { + KEEP_ALIVE_NONE, + KEEP_ALIVE_PENDING, + KEEP_ALIVE_SENT, +}; + +enum smbd_connection_status { + SMBD_CREATED, + SMBD_CONNECTING, + SMBD_CONNECTED, + SMBD_NEGOTIATE_FAILED, + SMBD_DISCONNECTING, + SMBD_DISCONNECTED, + SMBD_DESTROYED +}; + +/* + * The context for the SMBDirect transport + * Everything related to the transport is here. It has several logical parts + * 1. RDMA related structures + * 2. SMBDirect connection parameters + * 3. Memory registrations + * 4. Receive and reassembly queues for data receive path + * 5. mempools for allocating packets + */ +struct smbd_connection { + enum smbd_connection_status transport_status; + + /* RDMA related */ + struct rdma_cm_id *id; + struct ib_qp_init_attr qp_attr; + struct ib_pd *pd; + struct ib_cq *send_cq, *recv_cq; + struct ib_device_attr dev_attr; + int ri_rc; + struct completion ri_done; + wait_queue_head_t conn_wait; + wait_queue_head_t wait_destroy; + + struct completion negotiate_completion; + bool negotiate_done; + + struct work_struct destroy_work; + struct work_struct disconnect_work; + struct work_struct recv_done_work; + struct work_struct post_send_credits_work; + + spinlock_t lock_new_credits_offered; + int new_credits_offered; + + /* Connection parameters defined in [MS-SMBD] 3.1.1.1 */ + int receive_credit_max; + int send_credit_target; + int max_send_size; + int max_fragmented_recv_size; + int max_fragmented_send_size; + int max_receive_size; + int keep_alive_interval; + int max_readwrite_size; + enum keep_alive_status keep_alive_requested; + int protocol; + atomic_t send_credits; + atomic_t receive_credits; + int receive_credit_target; + int fragment_reassembly_remaining; + + /* Memory registrations */ + /* Maximum number of RDMA read/write outstanding on this connection */ + int responder_resources; + /* Maximum number of SGEs in a RDMA write/read */ + int max_frmr_depth; + /* + * If payload is less than or equal to the threshold, + * use RDMA send/recv to send upper layer I/O. + * If payload is more than the threshold, + * use RDMA read/write through memory registration for I/O. + */ + int rdma_readwrite_threshold; + enum ib_mr_type mr_type; + struct list_head mr_list; + spinlock_t mr_list_lock; + /* The number of available MRs ready for memory registration */ + atomic_t mr_ready_count; + atomic_t mr_used_count; + wait_queue_head_t wait_mr; + struct work_struct mr_recovery_work; + /* Used by transport to wait until all MRs are returned */ + wait_queue_head_t wait_for_mr_cleanup; + + /* Activity accoutning */ + /* Pending reqeusts issued from upper layer */ + int smbd_send_pending; + wait_queue_head_t wait_smbd_send_pending; + + int smbd_recv_pending; + wait_queue_head_t wait_smbd_recv_pending; + + atomic_t send_pending; + wait_queue_head_t wait_send_pending; + atomic_t send_payload_pending; + wait_queue_head_t wait_send_payload_pending; + + /* Receive queue */ + struct list_head receive_queue; + int count_receive_queue; + spinlock_t receive_queue_lock; + + struct list_head empty_packet_queue; + int count_empty_packet_queue; + spinlock_t empty_packet_queue_lock; + + wait_queue_head_t wait_receive_queues; + + /* Reassembly queue */ + struct list_head reassembly_queue; + spinlock_t reassembly_queue_lock; + wait_queue_head_t wait_reassembly_queue; + + /* total data length of reassembly queue */ + int reassembly_data_length; + int reassembly_queue_length; + /* the offset to first buffer in reassembly queue */ + int first_entry_offset; + + bool send_immediate; + + wait_queue_head_t wait_send_queue; + + /* + * Indicate if we have received a full packet on the connection + * This is used to identify the first SMBD packet of a assembled + * payload (SMB packet) in reassembly queue so we can return a + * RFC1002 length to upper layer to indicate the length of the SMB + * packet received + */ + bool full_packet_received; + + struct workqueue_struct *workqueue; + struct delayed_work idle_timer_work; + struct delayed_work send_immediate_work; + + /* Memory pool for preallocating buffers */ + /* request pool for RDMA send */ + struct kmem_cache *request_cache; + mempool_t *request_mempool; + + /* response pool for RDMA receive */ + struct kmem_cache *response_cache; + mempool_t *response_mempool; + + /* for debug purposes */ + unsigned int count_get_receive_buffer; + unsigned int count_put_receive_buffer; + unsigned int count_reassembly_queue; + unsigned int count_enqueue_reassembly_queue; + unsigned int count_dequeue_reassembly_queue; + unsigned int count_send_empty; +}; + +enum smbd_message_type { + SMBD_NEGOTIATE_RESP, + SMBD_TRANSFER_DATA, +}; + +#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001 + +/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */ +struct smbd_negotiate_req { + __le16 min_version; + __le16 max_version; + __le16 reserved; + __le16 credits_requested; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */ +struct smbd_negotiate_resp { + __le16 min_version; + __le16 max_version; + __le16 negotiated_version; + __le16 reserved; + __le16 credits_requested; + __le16 credits_granted; + __le32 status; + __le32 max_readwrite_size; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */ +struct smbd_data_transfer { + __le16 credits_requested; + __le16 credits_granted; + __le16 flags; + __le16 reserved; + __le32 remaining_data_length; + __le32 data_offset; + __le32 data_length; + __le32 padding; + __u8 buffer[]; +} __packed; + +/* The packet fields for a registered RDMA buffer */ +struct smbd_buffer_descriptor_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +/* Default maximum number of SGEs in a RDMA send/recv */ +#define SMBDIRECT_MAX_SGE 16 +/* The context for a SMBD request */ +struct smbd_request { + struct smbd_connection *info; + struct ib_cqe cqe; + + /* true if this request carries upper layer payload */ + bool has_payload; + + /* the SGE entries for this packet */ + struct ib_sge sge[SMBDIRECT_MAX_SGE]; + int num_sge; + + /* SMBD packet header follows this structure */ + u8 packet[]; +}; + +/* The context for a SMBD response */ +struct smbd_response { + struct smbd_connection *info; + struct ib_cqe cqe; + struct ib_sge sge; + + enum smbd_message_type type; + + /* Link to receive queue or reassembly queue */ + struct list_head list; + + /* Indicate if this is the 1st packet of a payload */ + bool first_segment; + + /* SMBD packet header and payload follows this structure */ + u8 packet[]; +}; + +/* Create a SMBDirect session */ +struct smbd_connection *smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr); + +/* Reconnect SMBDirect session */ +int smbd_reconnect(struct TCP_Server_Info *server); +/* Destroy SMBDirect session */ +void smbd_destroy(struct smbd_connection *info); + +/* Interface for carrying upper layer I/O through send/recv */ +int smbd_recv(struct smbd_connection *info, struct msghdr *msg); +int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst); + +enum mr_state { + MR_READY, + MR_REGISTERED, + MR_INVALIDATED, + MR_ERROR +}; + +struct smbd_mr { + struct smbd_connection *conn; + struct list_head list; + enum mr_state state; + struct ib_mr *mr; + struct scatterlist *sgl; + int sgl_count; + enum dma_data_direction dir; + union { + struct ib_reg_wr wr; + struct ib_send_wr inv_wr; + }; + struct ib_cqe cqe; + bool need_invalidate; + struct completion invalidate_done; +}; + +/* Interfaces to register and deregister MR for RDMA read/write */ +struct smbd_mr *smbd_register_mr( + struct smbd_connection *info, struct page *pages[], int num_pages, + int tailsz, bool writing, bool need_invalidate); +int smbd_deregister_mr(struct smbd_mr *mr); + +#else +#define cifs_rdma_enabled(server) 0 +struct smbd_connection {}; +static inline void *smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr) {return NULL;} +static inline int smbd_reconnect(struct TCP_Server_Info *server) {return -1; } +static inline void smbd_destroy(struct smbd_connection *info) {} +static inline int smbd_recv(struct smbd_connection *info, struct msghdr *msg) {return -1; } +static inline int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) {return -1; } +#endif + +#endif diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7efbab013957..9779b3292d8e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -37,6 +37,10 @@ #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "smbdirect.h" + +/* Max number of iovectors we can use off the stack when sending requests. */ +#define CIFS_MAX_IOV_SIZE 8 void cifs_wake_up_task(struct mid_q_entry *mid) @@ -229,7 +233,10 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct socket *ssocket = server->ssocket; struct msghdr smb_msg; int val = 1; - + if (cifs_rdma_enabled(server) && server->smbd_conn) { + rc = smbd_send(server->smbd_conn, rqst); + goto smbd_done; + } if (ssocket == NULL) return -ENOTSOCK; @@ -298,7 +305,7 @@ uncork: */ server->tcpStatus = CifsNeedReconnect; } - +smbd_done: if (rc < 0 && rc != -EINTR) cifs_dbg(VFS, "Error %d sending data on socket to server\n", rc); @@ -803,12 +810,16 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, const int flags, struct kvec *resp_iov) { struct smb_rqst rqst; - struct kvec *new_iov; + struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov; int rc; - new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL); - if (!new_iov) - return -ENOMEM; + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { + new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), + GFP_KERNEL); + if (!new_iov) + return -ENOMEM; + } else + new_iov = s_iov; /* 1st iov is a RFC1001 length followed by the rest of the packet */ memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); @@ -823,7 +834,51 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, rqst.rq_nvec = n_vec + 1; rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); - kfree(new_iov); + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) + kfree(new_iov); + return rc; +} + +/* Like SendReceive2 but iov[0] does not contain an rfc1002 header */ +int +smb2_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, + const int flags, struct kvec *resp_iov) +{ + struct smb_rqst rqst; + struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov; + int rc; + int i; + __u32 count; + __be32 rfc1002_marker; + + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { + new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), + GFP_KERNEL); + if (!new_iov) + return -ENOMEM; + } else + new_iov = s_iov; + + /* 1st iov is an RFC1002 Session Message length */ + memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); + + count = 0; + for (i = 1; i < n_vec + 1; i++) + count += new_iov[i].iov_len; + + rfc1002_marker = cpu_to_be32(count); + + new_iov[0].iov_base = &rfc1002_marker; + new_iov[0].iov_len = 4; + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = new_iov; + rqst.rq_nvec = n_vec + 1; + + rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) + kfree(new_iov); return rc; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index f40e3953e7fe..49d3c6fda89a 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -61,10 +61,10 @@ static struct class *coda_psdev_class; * Device operations */ -static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) +static __poll_t coda_psdev_poll(struct file *file, poll_table * wait) { struct venus_comm *vcp = (struct venus_comm *) file->private_data; - unsigned int mask = POLLOUT | POLLWRNORM; + __poll_t mask = POLLOUT | POLLWRNORM; poll_wait(file, &vcp->vc_waitq, wait); mutex_lock(&vcp->vc_mutex); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 5fc5dc660600..ef80085ed564 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1218,23 +1218,11 @@ COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) COMPATIBLE_IOCTL(DMX_GET_STC) -COMPATIBLE_IOCTL(FE_GET_INFO) -COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) -COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD) -COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY) -COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST) -COMPATIBLE_IOCTL(FE_SET_TONE) -COMPATIBLE_IOCTL(FE_SET_VOLTAGE) -COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE) -COMPATIBLE_IOCTL(FE_READ_STATUS) -COMPATIBLE_IOCTL(FE_READ_BER) -COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH) -COMPATIBLE_IOCTL(FE_READ_SNR) -COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS) -COMPATIBLE_IOCTL(FE_SET_FRONTEND) -COMPATIBLE_IOCTL(FE_GET_FRONTEND) -COMPATIBLE_IOCTL(FE_GET_EVENT) -COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD) +COMPATIBLE_IOCTL(DMX_REQBUFS) +COMPATIBLE_IOCTL(DMX_QUERYBUF) +COMPATIBLE_IOCTL(DMX_EXPBUF) +COMPATIBLE_IOCTL(DMX_QBUF) +COMPATIBLE_IOCTL(DMX_DQBUF) COMPATIBLE_IOCTL(VIDEO_STOP) COMPATIBLE_IOCTL(VIDEO_PLAY) COMPATIBLE_IOCTL(VIDEO_FREEZE) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 732a786cce9d..ce654526c0fb 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,6 +27,7 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <crypto/aes.h> +#include <crypto/skcipher.h> #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 305541bcd108..e33f3d3c5ade 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -13,42 +13,46 @@ #include <linux/scatterlist.h> #include <linux/ratelimit.h> +#include <crypto/skcipher.h> #include "fscrypt_private.h" +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + /** * fname_encrypt() - encrypt a filename * - * The caller must have allocated sufficient memory for the @oname string. + * The output buffer must be at least as large as the input buffer. + * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ -static int fname_encrypt(struct inode *inode, - const struct qstr *iname, struct fscrypt_str *oname) +int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist sg; - int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - unsigned int lim; - unsigned int cryptlen; - - lim = inode->i_sb->s_cop->max_namelen(inode); - if (iname->len <= 0 || iname->len > lim) - return -EIO; /* * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. */ - cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); - cryptlen = round_up(cryptlen, padding); - cryptlen = min(cryptlen, lim); - memcpy(oname->name, iname->name, iname->len); - memset(oname->name + iname->len, 0, cryptlen - iname->len); + if (WARN_ON(olen < iname->len)) + return -ENOBUFS; + memcpy(out, iname->name, iname->len); + memset(out + iname->len, 0, olen - iname->len); /* Initialize the IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -63,8 +67,8 @@ static int fname_encrypt(struct inode *inode, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - sg_init_one(&sg, oname->name, cryptlen); - skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); + sg_init_one(&sg, out, olen); + skcipher_request_set_crypt(req, &sg, &sg, olen, iv); /* Do the encryption */ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); @@ -75,7 +79,6 @@ static int fname_encrypt(struct inode *inode, return res; } - oname->len = cryptlen; return 0; } @@ -188,50 +191,52 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) { - int padding = 32; - struct fscrypt_info *ci = inode->i_crypt_info; - - if (ci) - padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); - return round_up(ilen, padding); + int padding = 4 << (inode->i_crypt_info->ci_flags & + FS_POLICY_FLAGS_PAD_MASK); + u32 encrypted_len; + + if (orig_len > max_len) + return false; + encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE); + encrypted_len = round_up(encrypted_len, padding); + *encrypted_len_ret = min(encrypted_len, max_len); + return true; } -EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * fscrypt_fname_crypto_alloc_obuff() - + * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames + * + * Allocate a buffer that is large enough to hold any decrypted or encoded + * filename (null-terminated), for the given maximum encrypted filename length. * - * Allocates an output buffer that is sufficient for the crypto operation - * specified by the context and the direction. + * Return: 0 on success, -errno on failure */ int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 ilen, struct fscrypt_str *crypto_str) + u32 max_encrypted_len, + struct fscrypt_str *crypto_str) { - u32 olen = fscrypt_fname_encrypted_size(inode, ilen); const u32 max_encoded_len = max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); + u32 max_presented_len; - crypto_str->len = olen; - olen = max(olen, max_encoded_len); + max_presented_len = max(max_encoded_len, max_encrypted_len); - /* - * Allocated buffer can hold one more character to null-terminate the - * string - */ - crypto_str->name = kmalloc(olen + 1, GFP_NOFS); - if (!(crypto_str->name)) + crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); + if (!crypto_str->name) return -ENOMEM; + crypto_str->len = max_presented_len; return 0; } EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * fscrypt_fname_crypto_free_buffer() - + * fscrypt_fname_free_buffer - free the buffer for presented filenames * - * Frees the buffer allocated for crypto operation. + * Free the buffer allocated by fscrypt_fname_alloc_buffer(). */ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { @@ -298,35 +303,6 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** - * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk - * space - * - * The caller must have allocated sufficient memory for the @oname string. - * - * Return: 0 on success, -errno on failure - */ -int fscrypt_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct fscrypt_str *oname) -{ - if (fscrypt_is_dot_dotdot(iname)) { - oname->name[0] = '.'; - oname->name[iname->len - 1] = '.'; - oname->len = iname->len; - return 0; - } - if (inode->i_crypt_info) - return fname_encrypt(inode, iname, oname); - /* - * Without a proper key, a user is not allowed to modify the filenames - * in a directory. Consequently, a user space name cannot be mapped to - * a disk-space name - */ - return -ENOKEY; -} -EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); - -/** * fscrypt_setup_filename() - prepare to search a possibly encrypted directory * @dir: the directory that will be searched * @iname: the user-provided filename being searched for @@ -369,11 +345,17 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (dir->i_crypt_info) { - ret = fscrypt_fname_alloc_buffer(dir, iname->len, - &fname->crypto_buf); - if (ret) - return ret; - ret = fname_encrypt(dir, iname, &fname->crypto_buf); + if (!fscrypt_fname_encrypted_size(dir, iname->len, + dir->i_sb->s_cop->max_namelen(dir), + &fname->crypto_buf.len)) + return -ENAMETOOLONG; + fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, + GFP_NOFS); + if (!fname->crypto_buf.name) + return -ENOMEM; + + ret = fname_encrypt(dir, iname, fname->crypto_buf.name, + fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -425,7 +407,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; errout: - fscrypt_fname_free_buffer(&fname->crypto_buf); + kfree(fname->crypto_buf.name); return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index c0b4f5597e1a..ad6722bae8b7 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -50,6 +50,15 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. @@ -71,7 +80,22 @@ typedef enum { #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 +static inline bool fscrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; + + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; +} + /* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); extern struct workqueue_struct *fscrypt_read_workqueue; extern int fscrypt_do_page_crypto(const struct inode *inode, @@ -83,6 +107,13 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +/* fname.c */ +extern int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +extern bool fscrypt_fname_encrypted_size(const struct inode *inode, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); + /* keyinfo.c */ extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 9f5fb2eb9cf7..bec06490fb13 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -110,3 +110,161 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); + +int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link) +{ + int err; + + /* + * To calculate the size of the encrypted symlink target we need to know + * the amount of NUL padding, which is determined by the flags set in + * the encryption policy which will be inherited from the directory. + * The easiest way to get access to this is to just load the directory's + * fscrypt_info, since we'll need it to create the dir_entry anyway. + * + * Note: in test_dummy_encryption mode, @dir may be unencrypted. + */ + err = fscrypt_get_encryption_info(dir); + if (err) + return err; + if (!fscrypt_has_encryption_key(dir)) + return -ENOKEY; + + /* + * Calculate the size of the encrypted symlink and verify it won't + * exceed max_len. Note that for historical reasons, encrypted symlink + * targets are prefixed with the ciphertext length, despite this + * actually being redundant with i_size. This decreases by 2 bytes the + * longest symlink target we can accept. + * + * We could recover 1 byte by not counting a null terminator, but + * counting it (even though it is meaningless for ciphertext) is simpler + * for now since filesystems will assume it is there and subtract it. + */ + if (!fscrypt_fname_encrypted_size(dir, len, + max_len - sizeof(struct fscrypt_symlink_data), + &disk_link->len)) + return -ENAMETOOLONG; + disk_link->len += sizeof(struct fscrypt_symlink_data); + + disk_link->name = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink); + +int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, struct fscrypt_str *disk_link) +{ + int err; + struct qstr iname = QSTR_INIT(target, len); + struct fscrypt_symlink_data *sd; + unsigned int ciphertext_len; + + err = fscrypt_require_key(inode); + if (err) + return err; + + if (disk_link->name) { + /* filesystem-provided buffer */ + sd = (struct fscrypt_symlink_data *)disk_link->name; + } else { + sd = kmalloc(disk_link->len, GFP_NOFS); + if (!sd) + return -ENOMEM; + } + ciphertext_len = disk_link->len - sizeof(*sd); + sd->len = cpu_to_le16(ciphertext_len); + + err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); + if (err) { + if (!disk_link->name) + kfree(sd); + return err; + } + /* + * Null-terminating the ciphertext doesn't make sense, but we still + * count the null terminator in the length, so we might as well + * initialize it just in case the filesystem writes it out. + */ + sd->encrypted_path[ciphertext_len] = '\0'; + + if (!disk_link->name) + disk_link->name = (unsigned char *)sd; + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); + +/** + * fscrypt_get_symlink - get the target of an encrypted symlink + * @inode: the symlink inode + * @caddr: the on-disk contents of the symlink + * @max_size: size of @caddr buffer + * @done: if successful, will be set up to free the returned target + * + * If the symlink's encryption key is available, we decrypt its target. + * Otherwise, we encode its target for presentation. + * + * This may sleep, so the filesystem must have dropped out of RCU mode already. + * + * Return: the presentable symlink target or an ERR_PTR() + */ +const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size, + struct delayed_call *done) +{ + const struct fscrypt_symlink_data *sd; + struct fscrypt_str cstr, pstr; + int err; + + /* This is for encrypted symlinks only */ + if (WARN_ON(!IS_ENCRYPTED(inode))) + return ERR_PTR(-EINVAL); + + /* + * Try to set up the symlink's encryption key, but we can continue + * regardless of whether the key is available or not. + */ + err = fscrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + + /* + * For historical reasons, encrypted symlink targets are prefixed with + * the ciphertext length, even though this is redundant with i_size. + */ + + if (max_size < sizeof(*sd)) + return ERR_PTR(-EUCLEAN); + sd = caddr; + cstr.name = (unsigned char *)sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-EUCLEAN); + + if (cstr.len + sizeof(*sd) - 1 > max_size) + return ERR_PTR(-EUCLEAN); + + err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) + goto err_kfree; + + err = -EUCLEAN; + if (pstr.name[0] == '\0') + goto err_kfree; + + pstr.name[pstr.len] = '\0'; + set_delayed_call(done, kfree_link, pstr.name); + return pstr.name; + +err_kfree: + kfree(pstr.name); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(fscrypt_get_symlink); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 5e6e846f5a24..05f5ee1f0705 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -14,6 +14,7 @@ #include <linux/ratelimit.h> #include <crypto/aes.h> #include <crypto/sha.h> +#include <crypto/skcipher.h> #include "fscrypt_private.h" static struct crypto_shash *essiv_hash_tfm; @@ -354,19 +355,9 @@ out: } EXPORT_SYMBOL(fscrypt_get_encryption_info); -void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) +void fscrypt_put_encryption_info(struct inode *inode) { - struct fscrypt_info *prev; - - if (ci == NULL) - ci = READ_ONCE(inode->i_crypt_info); - if (ci == NULL) - return; - - prev = cmpxchg(&inode->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - put_crypt_info(ci); + put_crypt_info(inode->i_crypt_info); + inode->i_crypt_info = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); @@ -44,6 +44,7 @@ /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; @@ -375,8 +376,8 @@ restart: * unmapped. */ if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_range(mapping, - (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); @@ -538,12 +539,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, - PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } spin_lock_irq(&mapping->tree_lock); @@ -636,8 +635,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); unlock_pmd: - spin_unlock(ptl); #endif + spin_unlock(ptl); } else { if (pfn != pte_pfn(*ptep)) goto unlock_pte; @@ -1096,7 +1095,7 @@ static bool dax_fault_is_synchronous(unsigned long flags, } static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, - const struct iomap_ops *ops) + int *iomap_errp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; @@ -1149,6 +1148,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); + if (iomap_errp) + *iomap_errp = error; if (error) { vmf_ret = dax_fault_return(error); goto unlock_entry; @@ -1269,12 +1270,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. - */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { @@ -1488,6 +1483,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * @vmf: The description of the fault * @pe_size: Size of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required + * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in @@ -1496,11 +1492,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * successfully. */ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t *pfnp, const struct iomap_ops *ops) + pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, pfnp, ops); + return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); case PE_SIZE_PMD: return dax_iomap_pmd_fault(vmf, pfnp, ops); default: diff --git a/fs/dcache.c b/fs/dcache.c index 5c7df1df81ff..7c38f39958bc 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,14 +32,11 @@ #include <linux/swap.h> #include <linux/bootmem.h> #include <linux/fs_struct.h> -#include <linux/hardirq.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/prefetch.h> #include <linux/ratelimit.h> #include <linux/list_lru.h> -#include <linux/kasan.h> - #include "internal.h" #include "mount.h" @@ -49,8 +46,8 @@ * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table - * s_anon bl list spinlock protects: - * - the s_anon list (see __d_drop) + * s_roots bl list spinlock protects: + * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: @@ -68,7 +65,7 @@ * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock - * s_anon lock + * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock @@ -104,14 +101,13 @@ EXPORT_SYMBOL(slash_name); * information, yet avoid using a prime hash-size or similar. */ -static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; static inline struct hlist_bl_head *d_hash(unsigned int hash) { - return dentry_hashtable + (hash >> (32 - d_hash_shift)); + return dentry_hashtable + (hash >> d_hash_shift); } #define IN_LOOKUP_SHIFT 10 @@ -195,7 +191,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char unsigned long a,b,mask; for (;;) { - a = *(unsigned long *)cs; + a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; @@ -468,30 +464,37 @@ static void dentry_lru_add(struct dentry *dentry) * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * - * __d_drop requires dentry->d_lock. + * __d_drop requires dentry->d_lock + * ___d_drop doesn't mark dentry as "unhashed" + * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ -void __d_drop(struct dentry *dentry) +static void ___d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by - * d_obtain_alias, which are always IS_ROOT: + * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) - b = &dentry->d_sb->s_anon; + b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); - dentry->d_hash.pprev = NULL; hlist_bl_unlock(b); /* After this call, in-progress rcu-walk path lookup will fail. */ write_seqcount_invalidate(&dentry->d_seq); } } + +void __d_drop(struct dentry *dentry) +{ + ___d_drop(dentry); + dentry->d_hash.pprev = NULL; +} EXPORT_SYMBOL(__d_drop); void d_drop(struct dentry *dentry) @@ -1500,8 +1503,8 @@ void shrink_dcache_for_umount(struct super_block *sb) sb->s_root = NULL; do_one_tree(dentry); - while (!hlist_bl_empty(&sb->s_anon)) { - dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + while (!hlist_bl_empty(&sb->s_roots)) { + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } @@ -1623,9 +1626,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } atomic_set(&p->u.count, 1); dname = p->name; - if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS)) - kasan_unpoison_shadow(dname, - round_up(name->len + 1, sizeof(unsigned long))); } else { dname = dentry->d_iname; } @@ -1636,8 +1636,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ - smp_wmb(); - dentry->d_name.name = dname; + smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_lockref.count = 1; dentry->d_flags = 0; @@ -1699,9 +1698,15 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) } EXPORT_SYMBOL(d_alloc); +struct dentry *d_alloc_anon(struct super_block *sb) +{ + return __d_alloc(sb, NULL); +} +EXPORT_SYMBOL(d_alloc_anon); + struct dentry *d_alloc_cursor(struct dentry * parent) { - struct dentry *dentry = __d_alloc(parent->d_sb, NULL); + struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); @@ -1887,7 +1892,7 @@ struct dentry *d_make_root(struct inode *root_inode) struct dentry *res = NULL; if (root_inode) { - res = __d_alloc(root_inode->i_sb, NULL); + res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else @@ -1926,33 +1931,19 @@ struct dentry *d_find_any_alias(struct inode *inode) } EXPORT_SYMBOL(d_find_any_alias); -static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) +static struct dentry *__d_instantiate_anon(struct dentry *dentry, + struct inode *inode, + bool disconnected) { - struct dentry *tmp; struct dentry *res; unsigned add_flags; - if (!inode) - return ERR_PTR(-ESTALE); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - res = d_find_any_alias(inode); - if (res) - goto out_iput; - - tmp = __d_alloc(inode->i_sb, NULL); - if (!tmp) { - res = ERR_PTR(-ENOMEM); - goto out_iput; - } - - security_d_instantiate(tmp, inode); + security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); - dput(tmp); + dput(dentry); goto out_iput; } @@ -1962,22 +1953,57 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) if (disconnected) add_flags |= DCACHE_DISCONNECTED; - spin_lock(&tmp->d_lock); - __d_set_inode_and_type(tmp, inode, add_flags); - hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry); - hlist_bl_lock(&tmp->d_sb->s_anon); - hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); - hlist_bl_unlock(&tmp->d_sb->s_anon); - spin_unlock(&tmp->d_lock); + spin_lock(&dentry->d_lock); + __d_set_inode_and_type(dentry, inode, add_flags); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + if (!disconnected) { + hlist_bl_lock(&dentry->d_sb->s_roots); + hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots); + hlist_bl_unlock(&dentry->d_sb->s_roots); + } + spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); - return tmp; + return dentry; out_iput: iput(inode); return res; } +struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode) +{ + return __d_instantiate_anon(dentry, inode, true); +} +EXPORT_SYMBOL(d_instantiate_anon); + +static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) +{ + struct dentry *tmp; + struct dentry *res; + + if (!inode) + return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + res = d_find_any_alias(inode); + if (res) + goto out_iput; + + tmp = d_alloc_anon(inode->i_sb); + if (!tmp) { + res = ERR_PTR(-ENOMEM); + goto out_iput; + } + + return __d_instantiate_anon(tmp, inode, disconnected); + +out_iput: + iput(inode); + return res; +} + /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for @@ -1998,7 +2024,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) */ struct dentry *d_obtain_alias(struct inode *inode) { - return __d_obtain_alias(inode, 1); + return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); @@ -2019,7 +2045,7 @@ EXPORT_SYMBOL(d_obtain_alias); */ struct dentry *d_obtain_root(struct inode *inode) { - return __d_obtain_alias(inode, 0); + return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); @@ -2381,7 +2407,7 @@ EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); - BUG_ON(!d_unhashed(entry)); + hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); @@ -2816,9 +2842,9 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ - /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ - __d_drop(dentry); - __d_drop(target); + /* ___d_drop does write_seqcount_barrier, but they're OK to nest. */ + ___d_drop(dentry); + ___d_drop(target); /* Switch the names.. */ if (exchange) @@ -2830,6 +2856,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target, __d_rehash(dentry); if (exchange) __d_rehash(target); + else + target->d_hash.pprev = NULL; /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { @@ -3047,17 +3075,14 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * - * Data dependency barrier is needed to make sure that we see that terminating - * NUL. Alpha strikes again, film at 11... + * Load acquire is needed to make sure that we see that terminating NUL. */ static int prepend_name(char **buffer, int *buflen, const struct qstr *name) { - const char *dname = READ_ONCE(name->name); + const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); char *p; - smp_read_barrier_depends(); - *buflen -= dlen + 1; if (*buflen < 0) return -ENAMETOOLONG; @@ -3527,6 +3552,7 @@ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } +EXPORT_SYMBOL(is_subdir); static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { @@ -3589,9 +3615,10 @@ static void __init dcache_init_early(void) 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, - &d_hash_mask, + NULL, 0, 0); + d_hash_shift = 32 - d_hash_shift; } static void __init dcache_init(void) @@ -3601,8 +3628,9 @@ static void __init dcache_init(void) * but it is probably not worth it because of the cache nature * of the dcache. */ - dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); + dentry_cache = KMEM_CACHE_USERCOPY(dentry, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + d_iname); /* Hash may have been set up in dcache_init_early */ if (!hashdist) @@ -3615,9 +3643,10 @@ static void __init dcache_init(void) 13, HASH_ZERO, &d_hash_shift, - &d_hash_mask, + NULL, 0, 0); + d_hash_shift = 32 - d_hash_shift; } /* SLAB cache for __getname() consumers */ @@ -3639,8 +3668,8 @@ void __init vfs_caches_init_early(void) void __init vfs_caches_init(void) { - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index cd12e6576b48..6fdbf21be318 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -206,11 +206,11 @@ FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), ARGS(filp, cmd, arg)); -static unsigned int full_proxy_poll(struct file *filp, +static __poll_t full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { struct dentry *dentry = F_DENTRY(filp); - unsigned int r = 0; + __poll_t r = 0; const struct file_operations *real_fops; if (debugfs_file_get(dentry)) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 7eae33ffa3fc..e31d6ed3ec32 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -168,11 +168,11 @@ struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) dput(path.dentry); if (err) { mntput(path.mnt); - path.mnt = ERR_PTR(err); + return ERR_PTR(err); } if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) { mntput(path.mnt); - path.mnt = ERR_PTR(-ENODEV); + return ERR_PTR(-ENODEV); } return path.mnt; } diff --git a/fs/direct-io.c b/fs/direct-io.c index 3aafb3343a65..a0ca9e48e993 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -219,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head]; } +/* + * Warn about a page cache invalidation failure during a direct io write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + /** * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation @@ -290,7 +311,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, (offset + ret - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); + if (err) + dio_warn_stale_pagecache(dio->iocb->ki_filp); } if (!(dio->flags & DIO_SKIP_DIO_COUNT)) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 05707850f93a..cff79ea0c01d 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -675,9 +675,9 @@ static int receive_from_sock(struct connection *con) nvec = 2; } len = iov[0].iov_len + iov[1].iov_len; + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nvec, len); - r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, - MSG_DONTWAIT | MSG_NOSIGNAL); + r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); if (ret <= 0) goto out_close; else if (ret == len) diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index e631b1689228..a4c63e9e6385 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -463,9 +463,9 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, return count; } -static unsigned int dev_poll(struct file *file, poll_table *wait) +static __poll_t dev_poll(struct file *file, poll_table *wait) { - unsigned int mask = 0; + __poll_t mask = 0; poll_wait(file, &send_wq, wait); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d18e7a539f11..662432af8ce8 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -887,7 +887,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, return rv; } -static unsigned int device_poll(struct file *file, poll_table *wait) +static __poll_t device_poll(struct file *file, poll_table *wait) { struct dlm_user_proc *proc = file->private_data; diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index f09cacaf8c80..7423e792a092 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -38,11 +38,11 @@ static atomic_t ecryptfs_num_miscdev_opens; * * Returns the poll mask */ -static unsigned int +static __poll_t ecryptfs_miscdev_poll(struct file *file, poll_table *pt) { struct ecryptfs_daemon *daemon = file->private_data; - unsigned int mask = 0; + __poll_t mask = 0; mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { diff --git a/fs/eventfd.c b/fs/eventfd.c index 2fb4eadaa118..04fd824142a1 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -80,24 +80,11 @@ static void eventfd_free(struct kref *kref) } /** - * eventfd_ctx_get - Acquires a reference to the internal eventfd context. - * @ctx: [in] Pointer to the eventfd context. - * - * Returns: In case of success, returns a pointer to the eventfd context. - */ -struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx) -{ - kref_get(&ctx->kref); - return ctx; -} -EXPORT_SYMBOL_GPL(eventfd_ctx_get); - -/** * eventfd_ctx_put - Releases a reference to the internal eventfd context. * @ctx: [in] Pointer to eventfd context. * * The eventfd context reference must have been previously acquired either - * with eventfd_ctx_get() or eventfd_ctx_fdget(). + * with eventfd_ctx_fdget() or eventfd_ctx_fileget(). */ void eventfd_ctx_put(struct eventfd_ctx *ctx) { @@ -114,10 +101,10 @@ static int eventfd_release(struct inode *inode, struct file *file) return 0; } -static unsigned int eventfd_poll(struct file *file, poll_table *wait) +static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; - unsigned int events = 0; + __poll_t events = 0; u64 count; poll_wait(file, &ctx->wqh, wait); @@ -207,36 +194,27 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w } EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); -/** - * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. - * @ctx: [in] Pointer to eventfd context. - * @no_wait: [in] Different from zero if the operation should not block. - * @cnt: [out] Pointer to the 64-bit counter value. - * - * Returns %0 if successful, or the following error codes: - * - * - -EAGAIN : The operation would have blocked but @no_wait was non-zero. - * - -ERESTARTSYS : A signal interrupted the wait operation. - * - * If @no_wait is zero, the function might sleep until the eventfd internal - * counter becomes greater than zero. - */ -ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) { + struct eventfd_ctx *ctx = file->private_data; ssize_t res; + __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); + if (count < sizeof(ucnt)) + return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); - *cnt = 0; res = -EAGAIN; if (ctx->count > 0) - res = 0; - else if (!no_wait) { + res = sizeof(ucnt); + else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - res = 0; + res = sizeof(ucnt); break; } if (signal_pending(current)) { @@ -250,31 +228,17 @@ ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (likely(res == 0)) { - eventfd_ctx_do_read(ctx, cnt); + if (likely(res > 0)) { + eventfd_ctx_do_read(ctx, &ucnt); if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); - return res; -} -EXPORT_SYMBOL_GPL(eventfd_ctx_read); - -static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - struct eventfd_ctx *ctx = file->private_data; - ssize_t res; - __u64 cnt; - - if (count < sizeof(cnt)) - return -EINVAL; - res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); - if (res < 0) - return res; + if (res > 0 && put_user(ucnt, (__u64 __user *)buf)) + return -EFAULT; - return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); + return res; } static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, @@ -405,79 +369,44 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); */ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) { + struct eventfd_ctx *ctx; + if (file->f_op != &eventfd_fops) return ERR_PTR(-EINVAL); - return eventfd_ctx_get(file->private_data); + ctx = file->private_data; + kref_get(&ctx->kref); + return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); -/** - * eventfd_file_create - Creates an eventfd file pointer. - * @count: Initial eventfd counter value. - * @flags: Flags for the eventfd file. - * - * This function creates an eventfd file pointer, w/out installing it into - * the fd table. This is useful when the eventfd file is used during the - * initialization of data structures that require extra setup after the eventfd - * creation. So the eventfd creation is split into the file pointer creation - * phase, and the file descriptor installation phase. - * In this way races with userspace closing the newly installed file descriptor - * can be avoided. - * Returns an eventfd file pointer, or a proper error pointer. - */ -struct file *eventfd_file_create(unsigned int count, int flags) +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { - struct file *file; struct eventfd_ctx *ctx; + int fd; /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); if (flags & ~EFD_FLAGS_SET) - return ERR_PTR(-EINVAL); + return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return ERR_PTR(-ENOMEM); + return -ENOMEM; kref_init(&ctx->kref); init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, - O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); - if (IS_ERR(file)) + fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); + if (fd < 0) eventfd_free_ctx(ctx); - return file; -} - -SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) -{ - int fd, error; - struct file *file; - - error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS); - if (error < 0) - return error; - fd = error; - - file = eventfd_file_create(count, flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_put_unused_fd; - } - fd_install(fd, file); - return fd; - -err_put_unused_fd: - put_unused_fd(fd); - - return error; } SYSCALL_DEFINE1(eventfd, unsigned int, count) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index afd548ebc328..42e35a6977c9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -874,7 +874,8 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ -static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) +static unsigned int ep_item_poll(const struct epitem *epi, poll_table *pt, + int depth) { struct eventpoll *ep; bool locked; @@ -920,7 +921,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) +static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { struct eventpoll *ep = file->private_data; int depth = 0; @@ -1117,6 +1118,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + __poll_t pollflags = key_to_poll(key); int ewake = 0; spin_lock_irqsave(&ep->lock, flags); @@ -1138,7 +1140,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ - if (key && !((unsigned long) key & epi->event.events)) + if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* @@ -1175,8 +1177,8 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && - !((unsigned long)key & POLLFREE)) { - switch ((unsigned long)key & EPOLLINOUT_BITS) { + !(pollflags & POLLFREE)) { + switch (pollflags & EPOLLINOUT_BITS) { case POLLIN: if (epi->event.events & POLLIN) ewake = 1; @@ -1205,7 +1207,7 @@ out_unlock: if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; - if ((unsigned long)key & POLLFREE) { + if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after @@ -1409,7 +1411,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) /* * Must be called with "mtx" held. */ -static int ep_insert(struct eventpoll *ep, struct epoll_event *event, +static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, revents, pwake = 0; @@ -1486,7 +1488,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ - if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { + if (revents && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1540,10 +1542,10 @@ error_create_wakeup_source: * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ -static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event) +static int ep_modify(struct eventpoll *ep, struct epitem *epi, + const struct epoll_event *event) { int pwake = 0; - unsigned int revents; poll_table pt; init_poll_funcptr(&pt, NULL); @@ -1585,14 +1587,10 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. - */ - revents = ep_item_poll(epi, &pt, 1); - - /* * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ - if (revents & event->events) { + if (ep_item_poll(epi, &pt, 1)) { spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); diff --git a/fs/exec.c b/fs/exec.c index 5688b5e1b937..7eb8d21bcab9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1349,9 +1349,14 @@ void setup_new_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; - /* Figure out dumpability. */ + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - bprm->secureexec) + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 98233a97b7b8..c5a53fcc43ea 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -31,6 +31,7 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/iversion.h> #include "exofs.h" static inline unsigned exofs_chunk_size(struct inode *inode) @@ -60,7 +61,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) struct inode *dir = mapping->host; int err = 0; - dir->i_version++; + inode_inc_iversion(dir); if (!PageUptodate(page)) SetPageUptodate(page); @@ -241,7 +242,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - int need_revalidate = (file->f_version != inode->i_version); + bool need_revalidate = inode_cmp_iversion(inode, file->f_version); if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) return 0; @@ -264,8 +265,8 @@ exofs_readdir(struct file *file, struct dir_context *ctx) chunk_mask); ctx->pos = (n<<PAGE_SHIFT) + offset; } - file->f_version = inode->i_version; - need_revalidate = 0; + file->f_version = inode_query_iversion(inode); + need_revalidate = false; } de = (struct exofs_dir_entry *)(kaddr + offset); limit = kaddr + exofs_last_byte(inode, n) - diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 819624cfc8da..179cd5c2f52a 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -38,6 +38,7 @@ #include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> +#include <linux/iversion.h> #include "exofs.h" @@ -159,7 +160,7 @@ static struct inode *exofs_alloc_inode(struct super_block *sb) if (!oi) return NULL; - oi->vfs_inode.i_version = 1; + inode_set_iversion(&oi->vfs_inode, 1); return &oi->vfs_inode; } @@ -192,10 +193,13 @@ static void exofs_init_once(void *foo) */ static int init_inodecache(void) { - exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", + exofs_inode_cachep = kmem_cache_create_usercopy("exofs_inode_cache", sizeof(struct exofs_i_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_ACCOUNT, exofs_init_once); + SLAB_ACCOUNT, + offsetof(struct exofs_i_info, i_data), + sizeof_field(struct exofs_i_info, i_data), + exofs_init_once); if (exofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index c634874e12d9..894e4c53d1d2 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -13,8 +13,7 @@ config EXT2_FS_XATTR depends on EXT2_FS help Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). + the kernel or by users (see the attr(5) manual page for details). If unsure, say N. @@ -26,9 +25,6 @@ config EXT2_FS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config EXT2_FS_SECURITY diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 987647986f47..4111085a129f 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -26,6 +26,7 @@ #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/iversion.h> typedef struct ext2_dir_entry_2 ext2_dirent; @@ -92,7 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) struct inode *dir = mapping->host; int err = 0; - dir->i_version++; + inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); if (pos+len > dir->i_size) { @@ -293,7 +294,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; - int need_revalidate = file->f_version != inode->i_version; + bool need_revalidate = inode_cmp_iversion(inode, file->f_version); if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) return 0; @@ -319,8 +320,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx) offset = ext2_validate_entry(kaddr, offset, chunk_mask); ctx->pos = (n<<PAGE_SHIFT) + offset; } - file->f_version = inode->i_version; - need_revalidate = 0; + file->f_version = inode_query_iversion(inode); + need_revalidate = false; } de = (ext2_dirent *)(kaddr+offset); limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2da67699dc33..09640220fda8 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 15f90f5f3e13..7666c065b96f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -33,6 +33,7 @@ #include <linux/quotaops.h> #include <linux/uaccess.h> #include <linux/dax.h> +#include <linux/iversion.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -184,7 +185,7 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->i_block_alloc_info = NULL; - ei->vfs_inode.i_version = 1; + inode_set_iversion(&ei->vfs_inode, 1); #ifdef CONFIG_QUOTA memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif @@ -220,11 +221,13 @@ static void init_once(void *foo) static int __init init_inodecache(void) { - ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", - sizeof(struct ext2_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); + ext2_inode_cachep = kmem_cache_create_usercopy("ext2_inode_cache", + sizeof(struct ext2_inode_info), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT), + offsetof(struct ext2_inode_info, i_data), + sizeof_field(struct ext2_inode_info, i_data), + init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; return 0; @@ -959,8 +962,11 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { err = bdev_dax_supported(sb, blocksize); - if (err) - goto failed_mount; + if (err) { + ext2_msg(sb, KERN_ERR, + "DAX unsupported by block device. Turning off DAX."); + sbi->s_mount_opt &= ~EXT2_MOUNT_DAX; + } } /* If the blocksize doesn't match, re-read the thing.. */ @@ -1569,7 +1575,7 @@ out: return err; if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); - inode->i_version++; + inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 73b850f5659c..a453cc87082b 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -82,9 +82,6 @@ config EXT4_FS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config EXT4_FS_SECURITY diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index a48fc5ae2701..9b63f5416a2f 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* File: fs/ext4/acl.h diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a943e568292e..f9b3e0a83526 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -355,10 +355,10 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group), + EXT4_B2C(sbi, offset + sbi->s_itb_per_group), EXT4_B2C(sbi, offset)); if (next_zero_bit < - EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group)) + EXT4_B2C(sbi, offset + sbi->s_itb_per_group)) /* bad bitmap for inode tables */ return blk; return 0; diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index bee888e0e2db..913061c0de1b 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -147,11 +147,11 @@ int ext4_setup_system_zone(struct super_block *sb) int ret; if (!test_opt(sb, BLOCK_VALIDITY)) { - if (EXT4_SB(sb)->system_blks.rb_node) + if (sbi->system_blks.rb_node) ext4_release_system_zone(sb); return 0; } - if (EXT4_SB(sb)->system_blks.rb_node) + if (sbi->system_blks.rb_node) return 0; for (i=0; i < ngroups; i++) { @@ -173,7 +173,7 @@ int ext4_setup_system_zone(struct super_block *sb) } if (test_opt(sb, DEBUG)) - debug_print_tree(EXT4_SB(sb)); + debug_print_tree(sbi); return 0; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d5babc9f222b..afda0a0499ce 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> +#include <linux/iversion.h> #include "ext4.h" #include "xattr.h" @@ -208,7 +209,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (file->f_version != inode->i_version) { + if (inode_cmp_iversion(inode, file->f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -227,7 +228,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - file->f_version = inode->i_version; + file->f_version = inode_query_iversion(inode); } while (ctx->pos < inode->i_size @@ -568,10 +569,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) * cached entries. */ if ((!info->curr_node) || - (file->f_version != inode->i_version)) { + inode_cmp_iversion(inode, file->f_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - file->f_version = inode->i_version; + file->f_version = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4e091eae38b1..3241475a1733 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * ext4.h * @@ -611,10 +611,10 @@ enum { /* * Flags used by ext4_free_blocks */ -#define EXT4_FREE_BLOCKS_METADATA 0x0001 -#define EXT4_FREE_BLOCKS_FORGET 0x0002 -#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 -#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 @@ -1986,10 +1986,10 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) /* Legal values for the dx_root hash_version field: */ -#define DX_HASH_LEGACY 0 -#define DX_HASH_HALF_MD4 1 -#define DX_HASH_TEA 2 -#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 @@ -2000,7 +2000,6 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, struct shash_desc shash; char ctx[4]; } desc; - int err; BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); @@ -2008,8 +2007,7 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, desc.shash.flags = 0; *(u32 *)desc.ctx = crc; - err = crypto_shash_update(&desc.shash, address, length); - BUG_ON(err); + BUG_ON(crypto_shash_update(&desc.shash, address, length)); return *(u32 *)desc.ctx; } diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 8ecf84b8f5a1..98fb0c119c68 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ #ifndef _EXT4_EXTENTS diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 48143e32411c..15b6dd733780 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * ext4_jbd2.h * @@ -5,10 +6,6 @@ * * Copyright 1998--1999 Red Hat corp --- All Rights Reserved * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * * Ext4-specific journaling extensions. */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c941251ac0c0..054416e9d827 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> @@ -5,19 +6,6 @@ * Architecture independence: * Copyright (c) 2005, Bull S.A. * Written by Pierre Peiffer <pierre.peiffer@bull.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ /* diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index ca90fc96f47e..8efdeb903d6b 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a0ae27b1bc66..fb6f023622fe 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -280,7 +280,8 @@ out: static int ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { - int result; + int result, error = 0; + int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; @@ -304,6 +305,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); +retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { @@ -314,9 +316,13 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); + + if ((result & VM_FAULT_ERROR) && error == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, pe_size, pfn); diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 7ec340898598..e871c4bf18e9 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "ext4.h" #include <linux/fsmap.h> diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h index 9a2cd367cc66..68c8001fee85 100644 --- a/fs/ext4/fsmap.h +++ b/fs/ext4/fsmap.h @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __EXT4_FSMAP_H__ #define __EXT4_FSMAP_H__ diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 00c6dd29e621..e22dcfab308b 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/hash.c * * Copyright (C) 2002 by Theodore Ts'o - * - * This file is released under the GPL v2. - * - * This file may be redistributed under the terms of the GNU Public - * License. */ #include <linux/fs.h> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b32cf263750d..7830d28df331 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -303,7 +303,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) /* Do this BEFORE marking the inode not in use or returning an error */ ext4_clear_inode(inode); - es = EXT4_SB(sb)->s_es; + es = sbi->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; @@ -1157,7 +1157,7 @@ got: ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); - ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1367553c43bb..7c4165b88505 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1,19 +1,12 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2012 Taobao. * Written by Tao Ma <boyu.mt@taobao.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/iomap.h> #include <linux/fiemap.h> +#include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -1042,7 +1035,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); - dir->i_version++; + inode_inc_iversion(dir); return 1; } @@ -1494,7 +1487,7 @@ int ext4_read_inline_dir(struct file *file, * dirent right now. Scan from the start of the inline * dir to make sure. */ - if (file->f_version != inode->i_version) { + if (inode_cmp_iversion(inode, file->f_version)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and @@ -1526,7 +1519,7 @@ int ext4_read_inline_dir(struct file *file, } offset = i; ctx->pos = offset; - file->f_version = inode->i_version; + file->f_version = inode_query_iversion(inode); } while (ctx->pos < extra_size) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 534a9130f625..c94780075b04 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <linux/bitops.h> #include <linux/iomap.h> +#include <linux/iversion.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -3767,10 +3768,18 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) /* Credits for sb + inode write */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); + /* + * We wrote the data but cannot extend + * i_size. Bail out. In async io case, we do + * not return error here because we have + * already submmitted the corresponding + * bio. Returning error here makes the caller + * think that this IO is done and failed + * resulting in race with bio's completion + * handler. + */ + if (!ret) + ret = PTR_ERR(handle); if (inode->i_nlink) ext4_orphan_del(NULL, inode); @@ -4882,12 +4891,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + u64 ivers = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= + ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } + inode_set_iversion_queried(inode, ivers); } ret = 0; @@ -5173,11 +5184,13 @@ static int ext4_do_update_inode(handle_t *handle, } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + u64 ivers = inode_peek_iversion(inode); + + raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); + cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1eec25014f62..7e99ad02f1ba 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,6 +19,7 @@ #include <linux/uuid.h> #include <linux/uaccess.h> #include <linux/delay.h> +#include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include <linux/fsmap.h> @@ -144,7 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb, i_gid_write(inode_bl, 0); inode_bl->i_flags = 0; ei_bl->i_flags = 0; - inode_bl->i_version = 1; + inode_set_iversion(inode_bl, 1); i_size_write(inode_bl, 0); inode_bl->i_mode = S_IFREG; if (ext4_has_feature_extents(sb)) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d9f8b90a93ed..769a62708b1c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ @@ -769,10 +757,10 @@ void ext4_mb_generate_buddy(struct super_block *sb, clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; - spin_lock(&EXT4_SB(sb)->s_bal_lock); - EXT4_SB(sb)->s_mb_buddies_generated++; - EXT4_SB(sb)->s_mb_generation_time += period; - spin_unlock(&EXT4_SB(sb)->s_bal_lock); + spin_lock(&sbi->s_bal_lock); + sbi->s_mb_buddies_generated++; + sbi->s_mb_generation_time += period; + spin_unlock(&sbi->s_bal_lock); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) @@ -1459,7 +1447,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += EXT4_C2B(EXT4_SB(sb), block); + blocknr += EXT4_C2B(sbi, block); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, @@ -4850,9 +4838,9 @@ do_more: if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + sbi->s_itb_per_group)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dcf52540f379..88c98f17e3d9 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/mballoc.h * diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index cf5181b62df1..61a9d1927817 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -1,15 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2007 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * */ #include <linux/slab.h> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 9bb36909ec92..b96e4bd3b3ec 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. * Written by Takashi Sato <t-sato@yk.jp.nec.com> * Akira Fujita <a-fujita@rs.jp.nec.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/fs.h> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e750d68fbcb5..b1f21e3a0763 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -34,6 +34,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> +#include <linux/iversion.h> #include "ext4.h" #include "ext4_jbd2.h" @@ -2959,7 +2960,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) "empty directory '%.*s' has too many links (%u)", dentry->d_name.len, dentry->d_name.name, inode->i_nlink); - inode->i_version++; + inode_inc_iversion(inode); clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is * zero will ensure that the right thing happens during any @@ -3056,39 +3057,19 @@ static int ext4_symlink(struct inode *dir, struct inode *inode; int err, len = strlen(symname); int credits; - bool encryption_required; struct fscrypt_str disk_link; - struct fscrypt_symlink_data *sd = NULL; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) return -EIO; - disk_link.len = len + 1; - disk_link.name = (char *) symname; - - encryption_required = (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); - if (encryption_required) { - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - if (!fscrypt_has_encryption_key(dir)) - return -ENOKEY; - disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + - sizeof(struct fscrypt_symlink_data)); - sd = kzalloc(disk_link.len, GFP_KERNEL); - if (!sd) - return -ENOMEM; - } - - if (disk_link.len > dir->i_sb->s_blocksize) { - err = -ENAMETOOLONG; - goto err_free_sd; - } + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; err = dquot_initialize(dir); if (err) - goto err_free_sd; + return err; if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* @@ -3117,27 +3098,18 @@ static int ext4_symlink(struct inode *dir, if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); - err = PTR_ERR(inode); - goto err_free_sd; + return PTR_ERR(inode); } - if (encryption_required) { - struct qstr istr; - struct fscrypt_str ostr = - FSTR_INIT(sd->encrypted_path, disk_link.len); - - istr.name = (const unsigned char *) symname; - istr.len = len; - err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (IS_ENCRYPTED(inode)) { + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) goto err_drop_inode; - sd->len = cpu_to_le16(ostr.len); - disk_link.name = (char *) sd; inode->i_op = &ext4_encrypted_symlink_inode_operations; } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { - if (!encryption_required) + if (!IS_ENCRYPTED(inode)) inode->i_op = &ext4_symlink_inode_operations; inode_nohighmem(inode); ext4_set_aops(inode); @@ -3179,7 +3151,7 @@ static int ext4_symlink(struct inode *dir, } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - if (!encryption_required) { + if (!IS_ENCRYPTED(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_link = (char *)&EXT4_I(inode)->i_data; } @@ -3194,16 +3166,17 @@ static int ext4_symlink(struct inode *dir, if (handle) ext4_journal_stop(handle); - kfree(sd); - return err; + goto out_free_encrypted_link; + err_drop_inode: if (handle) ext4_journal_stop(handle); clear_nlink(inode); unlock_new_inode(inode); iput(inode); -err_free_sd: - kfree(sd); +out_free_encrypted_link: + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); return err; } @@ -3221,9 +3194,9 @@ static int ext4_link(struct dentry *old_dentry, if (err) return err; - if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && - (!projid_eq(EXT4_I(dir)->i_projid, - EXT4_I(old_dentry->d_inode)->i_projid))) + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; err = dquot_initialize(dir); @@ -3365,7 +3338,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, ent->de->inode = cpu_to_le32(ino); if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; - ent->dir->i_version++; + inode_inc_iversion(ent->dir); ent->dir->i_ctime = ent->dir->i_mtime = current_time(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 50443bda8e98..b6bec270a8e4 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1477,7 +1477,7 @@ static int ext4_flex_group_add(struct super_block *sb, goto exit_journal; group = flex_gd->groups[0].group; - BUG_ON(group != EXT4_SB(sb)->s_groups_count); + BUG_ON(group != sbi->s_groups_count); err = ext4_add_new_descs(handle, sb, group, resize_inode, flex_gd->count); if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7c46693a14d7..39bf464c35f1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * @@ -40,6 +41,7 @@ #include <linux/dax.h> #include <linux/cleancache.h> #include <linux/uaccess.h> +#include <linux/iversion.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -742,6 +744,7 @@ __acquires(bitlock) } ext4_unlock_group(sb, grp); + ext4_commit_super(sb, 1); ext4_handle_error(sb); /* * We only get here in the ERRORS_RO case; relocking the group @@ -870,7 +873,6 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); - flush_workqueue(sbi->rsv_conversion_wq); destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { @@ -967,7 +969,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) if (!ei) return NULL; - ei->vfs_inode.i_version = 1; + inode_set_iversion(&ei->vfs_inode, 1); spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); @@ -1036,11 +1038,13 @@ static void init_once(void *foo) static int __init init_inodecache(void) { - ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", - sizeof(struct ext4_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); + ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", + sizeof(struct ext4_inode_info), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT), + offsetof(struct ext4_inode_info, i_data), + sizeof_field(struct ext4_inode_info, i_data), + init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; @@ -1069,9 +1073,7 @@ void ext4_clear_inode(struct inode *inode) jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - fscrypt_put_encryption_info(inode, NULL); -#endif + fscrypt_put_encryption_info(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -2676,7 +2678,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, * compensate. */ if (sb->s_blocksize == 1024 && nr == 0 && - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) + le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); @@ -3121,7 +3123,7 @@ int ext4_register_li_request(struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); @@ -3710,11 +3712,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); - goto failed_mount; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; } err = bdev_dax_supported(sb, blocksize); - if (err) - goto failed_mount; + if (err) { + ext4_msg(sb, KERN_ERR, + "DAX unsupported by block device. Turning off DAX."); + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { @@ -4836,7 +4841,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sbi))) return 0; trace_ext4_sync_fs(sb, wait); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index a2006c9af1d9..dd05af983092 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -28,59 +28,28 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, struct delayed_call *done) { struct page *cpage = NULL; - char *caddr, *paddr = NULL; - struct fscrypt_str cstr, pstr; - struct fscrypt_symlink_data *sd; - int res; - u32 max_size = inode->i_sb->s_blocksize; + const void *caddr; + unsigned int max_size; + const char *paddr; if (!dentry) return ERR_PTR(-ECHILD); - res = fscrypt_get_encryption_info(inode); - if (res) - return ERR_PTR(res); - if (ext4_inode_is_fast_symlink(inode)) { - caddr = (char *) EXT4_I(inode)->i_data; + caddr = EXT4_I(inode)->i_data; max_size = sizeof(EXT4_I(inode)->i_data); } else { cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); caddr = page_address(cpage); + max_size = inode->i_sb->s_blocksize; } - /* Symlink is encrypted */ - sd = (struct fscrypt_symlink_data *)caddr; - cstr.name = sd->encrypted_path; - cstr.len = le16_to_cpu(sd->len); - if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { - /* Symlink data on the disk is corrupted */ - res = -EFSCORRUPTED; - goto errout; - } - - res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); - if (res) - goto errout; - paddr = pstr.name; - - res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res) - goto errout; - - /* Null-terminate the name */ - paddr[pstr.len] = '\0'; + paddr = fscrypt_get_symlink(inode, caddr, max_size, done); if (cpage) put_page(cpage); - set_delayed_call(done, kfree_link, paddr); return paddr; -errout: - if (cpage) - put_page(cpage); - kfree(paddr); - return ERR_PTR(res); } const struct inode_operations ext4_encrypted_symlink_inode_operations = { diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index e21afd52e7d7..1205261f130c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include <linux/proc_fs.h> #include "ext4.h" @@ -329,6 +330,13 @@ static void ext4_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static void ext4_kset_release(struct kobject *kobj) +{ + struct kset *kset = container_of(kobj, struct kset, kobj); + + kfree(kset); +} + static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, @@ -342,20 +350,18 @@ static struct kobj_type ext4_sb_ktype = { static struct kobj_type ext4_ktype = { .sysfs_ops = &ext4_attr_ops, + .release = ext4_kset_release, }; -static struct kset ext4_kset = { - .kobj = {.ktype = &ext4_ktype}, -}; +static struct kset *ext4_kset; static struct kobj_type ext4_feat_ktype = { .default_attrs = ext4_feat_attrs, .sysfs_ops = &ext4_attr_ops, + .release = (void (*)(struct kobject *))kfree, }; -static struct kobject ext4_feat = { - .kset = &ext4_kset, -}; +static struct kobject *ext4_feat; #define PROC_FILE_SHOW_DEFN(name) \ static int name##_open(struct inode *inode, struct file *file) \ @@ -392,12 +398,15 @@ int ext4_register_sysfs(struct super_block *sb) const struct ext4_proc_files *p; int err; - sbi->s_kobj.kset = &ext4_kset; + sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL, "%s", sb->s_id); - if (err) + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); return err; + } if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); @@ -427,25 +436,45 @@ int __init ext4_init_sysfs(void) { int ret; - kobject_set_name(&ext4_kset.kobj, "ext4"); - ext4_kset.kobj.parent = fs_kobj; - ret = kset_register(&ext4_kset); + ext4_kset = kzalloc(sizeof(*ext4_kset), GFP_KERNEL); + if (!ext4_kset) + return -ENOMEM; + + kobject_set_name(&ext4_kset->kobj, "ext4"); + ext4_kset->kobj.parent = fs_kobj; + ext4_kset->kobj.ktype = &ext4_ktype; + ret = kset_register(ext4_kset); if (ret) - return ret; + goto kset_err; + + ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL); + if (!ext4_feat) { + ret = -ENOMEM; + goto kset_err; + } - ret = kobject_init_and_add(&ext4_feat, &ext4_feat_ktype, + ext4_feat->kset = ext4_kset; + ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype, NULL, "features"); if (ret) - kset_unregister(&ext4_kset); - else - ext4_proc_root = proc_mkdir(proc_dirname, NULL); + goto feat_err; + + ext4_proc_root = proc_mkdir(proc_dirname, NULL); + return ret; + +feat_err: + kobject_put(ext4_feat); +kset_err: + kset_unregister(ext4_kset); + ext4_kset = NULL; return ret; } void ext4_exit_sysfs(void) { - kobject_put(&ext4_feat); - kset_unregister(&ext4_kset); + kobject_put(ext4_feat); + kset_unregister(ext4_kset); + ext4_kset = NULL; remove_proc_entry(proc_dirname, NULL); ext4_proc_root = NULL; } diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index b64a9fa0ff41..0cb13badf473 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/truncate.h * diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 218a7ba57819..63656dbafdc4 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -56,6 +56,7 @@ #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> +#include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -294,13 +295,13 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { return ((u64)ea_inode->i_ctime.tv_sec << 32) | - ((u32)ea_inode->i_version); + (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); - ea_inode->i_version = (u32)ref_count; + inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index f8cc07588ac9..dd54c4f995c8 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* File: fs/ext4/xattr.h diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 378c221d68a9..9a20ef42fadd 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -35,8 +35,7 @@ config F2FS_FS_XATTR default y help Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). + the kernel or by users (see the attr(5) manual page for details). If unsure, say N. @@ -49,9 +48,6 @@ config F2FS_FS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config F2FS_FS_SECURITY diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 2bb7c9fc5144..111824199a88 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -270,7 +270,7 @@ static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, sizeof(struct posix_acl_entry); clone = kmemdup(acl, size, flags); if (clone) - atomic_set(&clone->a_refcount, 1); + refcount_set(&clone->a_refcount, 1); } return clone; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4aa69bc1c70a..512dca8abc7d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -237,12 +237,15 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); + if (unlikely(f2fs_cp_error(sbi))) { + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; + } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); @@ -793,7 +796,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) block_t cp_blk_no; int i; - sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); + sbi->ckpt = f2fs_kzalloc(sbi, cp_blks * blk_size, GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* @@ -1154,6 +1157,7 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); spin_unlock_irqrestore(&sbi->cp_lock, flags); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 516fa0d3ff9c..7578ed1a85e0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,7 +56,7 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); bio->bi_status = BLK_STS_IOERR; } @@ -111,8 +111,13 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); - f2fs_stop_checkpoint(sbi, true); + if (type == F2FS_WB_CP_DATA) + f2fs_stop_checkpoint(sbi, true); } + + f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && + page->index != nid_of_node(page)); + dec_page_count(sbi, type); clear_cold_data(page); end_page_writeback(page); @@ -169,6 +174,7 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, * Low-level block read/write IO operations. */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + struct writeback_control *wbc, int npages, bool is_read) { struct bio *bio; @@ -178,6 +184,8 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; + if (wbc) + wbc_init_bio(wbc, bio); return bio; } @@ -373,7 +381,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, + 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -435,7 +444,7 @@ alloc_new: dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } - io->bio = __bio_alloc(sbi, fio->new_blkaddr, + io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, BIO_MAX_PAGES, false); io->fio = *fio; } @@ -445,6 +454,9 @@ alloc_new: goto alloc_new; } + if (fio->io_wbc) + wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); @@ -783,7 +795,7 @@ got_it: return page; } -static int __allocate_data_block(struct dnode_of_data *dn) +static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; @@ -808,7 +820,7 @@ alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA, NULL, false); + &sum, seg_type, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -831,10 +843,12 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; + int flag; int err = 0; + bool direct_io = iocb->ki_flags & IOCB_DIRECT; /* convert inline data for Direct I/O*/ - if (iocb->ki_flags & IOCB_DIRECT) { + if (direct_io) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -851,19 +865,33 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_len = 0; map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; - if (iocb->ki_flags & IOCB_DIRECT) - return f2fs_map_blocks(inode, &map, 1, - __force_buffered_io(inode, WRITE) ? - F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO); + if (direct_io) { + map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); + flag = __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO; + goto map_blocks; + } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - if (!f2fs_has_inline_data(inode)) - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (f2fs_has_inline_data(inode)) + return err; + + flag = F2FS_GET_BLOCK_PRE_AIO; + +map_blocks: + err = f2fs_map_blocks(inode, &map, 1, flag); + if (map.m_len > 0 && err == -ENOSPC) { + if (!direct_io) + set_inode_flag(inode, FI_NO_PREALLOC); + err = 0; + } return err; } @@ -904,6 +932,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, blkcnt_t prealloc; struct extent_info ei = {0,0,0}; block_t blkaddr; + unsigned int start_pgofs; if (!maxblocks) return 0; @@ -919,6 +948,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgofs + map->m_len; goto out; } @@ -937,10 +968,14 @@ next_dnode: if (map->m_next_pgofs) *map->m_next_pgofs = get_next_page_offset(&dn, pgofs); + if (map->m_next_extent) + *map->m_next_extent = + get_next_page_offset(&dn, pgofs); } goto unlock_out; } + start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); @@ -960,7 +995,8 @@ next_block: last_ofs_in_node = dn.ofs_in_node; } } else { - err = __allocate_data_block(&dn); + err = __allocate_data_block(&dn, + map->m_seg_type); if (!err) set_inode_flag(inode, FI_APPEND_WRITE); } @@ -973,14 +1009,20 @@ next_block: map->m_pblk = 0; goto sync_out; } + if (flag == F2FS_GET_BLOCK_PRECACHE) + goto sync_out; if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; + goto sync_out; } - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) + if (flag != F2FS_GET_BLOCK_FIEMAP) { + /* for defragment case */ + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; goto sync_out; + } } } @@ -1031,6 +1073,16 @@ skip: else if (dn.ofs_in_node < end_offset) goto next_block; + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + } + f2fs_put_dnode(&dn); if (create) { @@ -1040,6 +1092,17 @@ skip: goto next_dnode; sync_out: + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + if (map->m_next_extent) + *map->m_next_extent = pgofs + 1; + } f2fs_put_dnode(&dn); unlock_out: if (create) { @@ -1053,7 +1116,7 @@ out: static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs) + pgoff_t *next_pgofs, int seg_type) { struct f2fs_map_blocks map; int err; @@ -1061,6 +1124,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; + map.m_next_extent = NULL; + map.m_seg_type = seg_type; err = f2fs_map_blocks(inode, &map, create, flag); if (!err) { @@ -1076,14 +1141,17 @@ static int get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { return __get_data_block(inode, iblock, bh_result, create, - flag, next_pgofs); + flag, next_pgofs, + NO_CHECK_TYPE); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DEFAULT, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL, + rw_hint_to_seg_type( + inode->i_write_hint)); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1094,7 +1162,8 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock, return -EFBIG; return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP, NULL); + F2FS_GET_BLOCK_BMAP, NULL, + NO_CHECK_TYPE); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -1107,6 +1176,68 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) return (blk << inode->i_blkbits); } +static int f2fs_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + struct node_info ni; + __u64 phys = 0, len; + __u32 flags; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int err = 0; + + if (f2fs_has_inline_xattr(inode)) { + int offset; + + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), + inode->i_ino, false); + if (!page) + return -ENOMEM; + + get_node_info(sbi, inode->i_ino, &ni); + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + offset = offsetof(struct f2fs_inode, i_addr) + + sizeof(__le32) * (DEF_ADDRS_PER_INODE - + get_inline_xattr_addrs(inode)); + + phys += offset; + len = inline_xattr_size(inode); + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; + + if (!xnid) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + if (err || err == 1) + return err; + } + + if (xnid) { + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); + if (!page) + return -ENOMEM; + + get_node_info(sbi, xnid, &ni); + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + len = inode->i_sb->s_blocksize; + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_LAST; + } + + if (phys) + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + + return (err < 0 ? err : 0); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1117,18 +1248,29 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; int ret = 0; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + ret = f2fs_precache_extents(inode); + if (ret) + return ret; + } + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return ret; + inode_lock(inode); + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + ret = f2fs_xattr_fiemap(inode, fieinfo); + goto out; + } + if (f2fs_has_inline_data(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) - return ret; + goto out; } - inode_lock(inode); - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -1198,7 +1340,6 @@ static int f2fs_mpage_readpages(struct address_space *mapping, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; @@ -1214,9 +1355,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; - for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - + for (; nr_pages; nr_pages--) { if (pages) { page = list_last_entry(pages, struct page, lru); @@ -1376,18 +1518,79 @@ retry_encrypt: return PTR_ERR(fio->encrypted_page); } +static inline bool check_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; + + if (policy & (0x1 << F2FS_IPU_FORCE)) + return true; + if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(inode, FI_NEED_IPU)) + return true; + + return false; +} + +bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +{ + if (f2fs_is_pinned_file(inode)) + return true; + + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + + return check_inplace_update_policy(inode, fio); +} + +bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (test_opt(sbi, LFS)) + return true; + if (S_ISDIR(inode->i_mode)) + return true; + if (f2fs_is_atomic_file(inode)) + return true; + if (fio) { + if (is_cold_data(fio->page)) + return true; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return true; + } + return false; +} + static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (is_cold_data(fio->page)) - return false; - if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + if (should_update_outplace(inode, fio)) return false; - return need_inplace_update_policy(inode, fio); + return should_update_inplace(inode, fio); } static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) @@ -1508,10 +1711,17 @@ static int __write_data_page(struct page *page, bool *submitted, .submitted = false, .need_lock = LOCK_RETRY, .io_type = io_type, + .io_wbc = wbc, }; trace_f2fs_writepage(page, DATA); + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; @@ -1536,12 +1746,6 @@ write: available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - mapping_set_error(page->mapping, -EIO); - goto out; - } - /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { fio.need_lock = LOCK_DONE; @@ -1571,10 +1775,14 @@ write: } } - down_write(&F2FS_I(inode)->i_sem); - if (F2FS_I(inode)->last_disk_size < psize) - F2FS_I(inode)->last_disk_size = psize; - up_write(&F2FS_I(inode)->i_sem); + if (err) { + file_set_keep_isize(inode); + } else { + down_write(&F2FS_I(inode)->i_sem); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); + } done: if (err && err != -ENOENT) @@ -1933,7 +2141,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false; + bool need_balance = false, drop_atomic = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -1942,6 +2150,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (f2fs_is_atomic_file(inode) && !available_free_memory(sbi, INMEM_PAGES)) { err = -ENOMEM; + drop_atomic = true; goto fail; } @@ -2022,7 +2231,7 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); - if (f2fs_is_atomic_file(inode)) + if (drop_atomic) drop_inmem_pages_all(sbi); return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ecada8425268..a66107b5cfff 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -49,14 +49,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; - - si->nquota_files = 0; - if (f2fs_sb_has_quota_ino(sbi->sb)) { - for (i = 0; i < MAXQUOTAS; i++) { - if (f2fs_qf_ino(sbi->sb, i)) - si->nquota_files++; - } - } + si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); @@ -186,7 +179,6 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); - si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ -447,7 +439,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; - si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); if (!si) return -ENOMEM; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2d98d877c09d..f00b5ed8c011 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -713,6 +713,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -798,6 +800,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -836,6 +839,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, le32_to_cpu(de->ino), d_type)) return 1; + if (sbi->readdir_ra == 1) + ra_node_page(sbi, le32_to_cpu(de->ino)); + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6abf26c31d01..6300ac5bcbe4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include <linux/magic.h> #include <linux/kobject.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/vmalloc.h> #include <linux/bio.h> #include <linux/blkdev.h> @@ -43,6 +44,7 @@ #ifdef CONFIG_F2FS_FAULT_INJECTION enum { FAULT_KMALLOC, + FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, FAULT_ALLOC_BIO, @@ -94,6 +96,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 +#define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -121,6 +124,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_FEATURE_QUOTA_INO 0x0080 +#define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -130,6 +134,12 @@ struct f2fs_mount_info { (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* + * Default values for user and/or group using reserved blocks + */ +#define F2FS_DEF_RESUID 0 +#define F2FS_DEF_RESGID 0 + +/* * For checkpoint manager */ enum { @@ -179,6 +189,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ + TRANS_DIR_INO, /* for trasactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; @@ -264,7 +275,6 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ - unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ @@ -347,6 +357,9 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) #define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) +#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) +#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) +#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -402,10 +415,9 @@ struct f2fs_flush_device { #define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); -#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ (CUR_ADDRS_PER_INODE(inode) - \ - F2FS_INLINE_XATTR_ADDRS(inode) - \ + get_inline_xattr_addrs(inode) - \ DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ @@ -542,6 +554,8 @@ struct f2fs_map_blocks { unsigned int m_len; unsigned int m_flags; pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + pgoff_t *m_next_extent; /* point to next possible extent */ + int m_seg_type; }; /* for flag in get_data_block */ @@ -551,6 +565,7 @@ enum { F2FS_GET_BLOCK_BMAP, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, + F2FS_GET_BLOCK_PRECACHE, }; /* @@ -583,7 +598,10 @@ struct f2fs_inode_info { unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ - unsigned int i_current_depth; /* use only in directory structure */ + union { + unsigned int i_current_depth; /* only for directory depth */ + unsigned short i_gc_failures; /* only for regular file */ + }; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ @@ -618,6 +636,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ + struct timespec i_crtime; /* inode creation time */ }; static inline void get_extent_info(struct extent_info *ext, @@ -922,6 +941,7 @@ enum cp_reason_type { CP_NODE_NEED_CP, CP_FASTBOOT_MODE, CP_SPEC_LOG_NUM, + CP_RECOVER_DIR, }; enum iostat_type { @@ -957,6 +977,7 @@ struct f2fs_io_info { int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ enum iostat_type io_type; /* io type */ + struct writeback_control *io_wbc; /* writeback control */ }; #define is_read_io(rw) ((rw) == READ) @@ -1093,6 +1114,7 @@ struct f2fs_sb_info { int dir_level; /* directory level */ int inline_xattr_size; /* inline xattr size */ unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ + int readdir_ra; /* readahead inode in readdir */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -1100,6 +1122,11 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + + unsigned int nquota_files; /* # of quota sysfile */ u32 s_next_generation; /* for NFS support */ @@ -1124,6 +1151,9 @@ struct f2fs_sb_info { /* threshold for converting bg victims for fg */ u64 fggc_threshold; + /* threshold for gc trials on pinned files */ + u64 gc_pin_file_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -1250,33 +1280,7 @@ static inline bool is_idle(struct f2fs_sb_info *sbi) /* * Inline functions */ -static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, - unsigned int length) -{ - SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); - u32 *ctx = (u32 *)shash_desc_ctx(shash); - u32 retval; - int err; - - shash->tfm = sbi->s_chksum_driver; - shash->flags = 0; - *ctx = F2FS_SUPER_MAGIC; - - err = crypto_shash_update(shash, address, length); - BUG_ON(err); - - retval = *ctx; - barrier_data(ctx); - return retval; -} - -static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, - void *buf, size_t buf_size) -{ - return f2fs_crc32(sbi, buf, buf_size) == blk_crc; -} - -static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, +static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, const void *address, unsigned int length) { struct { @@ -1297,6 +1301,24 @@ static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, return *(u32 *)desc.ctx; } +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length); +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + return __f2fs_crc32(sbi, crc, address, length); +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1555,6 +1577,25 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, + struct inode *inode) +{ + if (!inode) + return true; + if (!test_opt(sbi, RESERVE_ROOT)) + return false; + if (IS_NOQUOTA(inode)) + return true; + if (capable(CAP_SYS_RESOURCE)) + return true; + if (uid_eq(sbi->s_resuid, current_fsuid())) + return true; + if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && + in_group_p(sbi->s_resgid)) + return true; + return false; +} + static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) @@ -1584,11 +1625,17 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->total_valid_block_count += (block_t)(*count); avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; + + if (!__allow_reserved_blocks(sbi, inode)) + avail_user_block_count -= sbi->root_reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; + if (diff > *count) + diff = *count; *count -= diff; release = diff; - sbi->total_valid_block_count = avail_user_block_count; + sbi->total_valid_block_count -= diff; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); @@ -1597,7 +1644,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, } spin_unlock(&sbi->stat_lock); - if (release) + if (unlikely(release)) dquot_release_reservation_block(inode, release); f2fs_i_blocks_write(inode, *count, true, true); return 0; @@ -1777,9 +1824,13 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count + sbi->current_reserved_blocks > - sbi->user_block_count)) { + valid_block_count = sbi->total_valid_block_count + + sbi->current_reserved_blocks + 1; + + if (!__allow_reserved_blocks(sbi, inode)) + valid_block_count += sbi->root_reserved_blocks; + + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } @@ -1992,11 +2043,11 @@ static inline block_t datablock_addr(struct inode *inode, raw_node = F2FS_NODE(node_page); /* from GC path only */ - if (!inode) { - if (is_inode) + if (is_inode) { + if (!inode) base = offset_in_addr(&raw_node->i); - } else if (f2fs_has_extra_attr(inode) && is_inode) { - base = get_extra_isize(inode); + else if (f2fs_has_extra_attr(inode)) + base = get_extra_isize(inode); } addr_array = blkaddr_in_node(raw_node); @@ -2107,6 +2158,7 @@ enum { FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2116,10 +2168,12 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_INLINE_XATTR: case FI_INLINE_DATA: case FI_INLINE_DENTRY: + case FI_NEW_INODE: if (set) return; case FI_DATA_EXIST: case FI_INLINE_DOTS: + case FI_PIN_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2200,6 +2254,13 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) f2fs_mark_inode_dirty_sync(inode, true); } +static inline void f2fs_i_gc_failures_write(struct inode *inode, + unsigned int count) +{ + F2FS_I(inode)->i_gc_failures = count; + f2fs_mark_inode_dirty_sync(inode, true); +} + static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; @@ -2228,6 +2289,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_INLINE_DOTS, &fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) set_bit(FI_EXTRA_ATTR, &fi->flags); + if (ri->i_inline & F2FS_PIN_FILE) + set_bit(FI_PIN_FILE, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2246,6 +2309,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_INLINE_DOTS; if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) ri->i_inline |= F2FS_EXTRA_ATTR; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + ri->i_inline |= F2FS_PIN_FILE; } static inline int f2fs_has_extra_attr(struct inode *inode) @@ -2260,7 +2325,7 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); + return CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode); } static inline void *inline_xattr_addr(struct inode *inode, struct page *page) @@ -2268,7 +2333,7 @@ static inline void *inline_xattr_addr(struct inode *inode, struct page *page) struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS(inode)]); + get_inline_xattr_addrs(inode)]); } static inline int inline_xattr_size(struct inode *inode) @@ -2291,6 +2356,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DOTS); } +static inline bool f2fs_is_pinned_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_PIN_FILE); +} + static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(inode, FI_ATOMIC_FILE); @@ -2418,12 +2488,35 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } +static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); +} + +static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KVMALLOC)) { + f2fs_show_injection_info(FAULT_KVMALLOC); + return NULL; + } +#endif + return kvmalloc(size, flags); +} + +static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); +} + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } -static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); static inline int get_inline_xattr_addrs(struct inode *inode) { return F2FS_I(inode)->i_inline_xattr_size; @@ -2479,9 +2572,11 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int f2fs_setattr(struct dentry *dentry, struct iattr *attr); int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); -int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +void truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_pin_file_control(struct inode *inode, bool inc); /* * inode.c @@ -2492,8 +2587,8 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); -int update_inode(struct inode *inode, struct page *node_page); -int update_inode_page(struct inode *inode); +void update_inode(struct inode *inode, struct page *node_page); +void update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); void handle_failed_inode(struct inode *inode); @@ -2604,10 +2699,9 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void recover_inline_xattr(struct inode *inode, struct page *page); -int recover_xattr_data(struct inode *inode, struct page *page, - block_t blkaddr); +int recover_xattr_data(struct inode *inode, struct page *page); int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -int restore_node_summary(struct f2fs_sb_info *sbi, +void restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int build_node_manager(struct f2fs_sb_info *sbi); @@ -2634,6 +2728,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void init_discard_policy(struct discard_policy *dpolicy, int discard_type, unsigned int granularity); +void drop_discard_cmd(struct f2fs_sb_info *sbi); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -2672,6 +2767,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi); void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); +int rw_hint_to_seg_type(enum rw_hint hint); /* * checkpoint.c @@ -2741,6 +2837,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); void f2fs_set_page_dirty_nobuffers(struct page *page); int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, @@ -3109,6 +3207,11 @@ static inline int f2fs_sb_has_quota_ino(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); } +static inline int f2fs_sb_has_inode_crtime(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7874bbd7311d..672a542e5464 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -165,6 +165,9 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; + else if (need_dentry_mark(sbi, inode->i_ino) && + exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) + cp_reason = CP_RECOVER_DIR; return cp_reason; } @@ -472,26 +475,14 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - struct dentry *dir; + int err = fscrypt_file_open(inode, filp); - if (f2fs_encrypted_inode(inode)) { - int ret = fscrypt_get_encryption_info(inode); - if (ret) - return -EACCES; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - dir = dget_parent(file_dentry(filp)); - if (f2fs_encrypted_inode(d_inode(dir)) && - !fscrypt_has_permitted_context(d_inode(dir), inode)) { - dput(dir); - return -EPERM; - } - dput(dir); + if (err) + return err; return dquot_file_open(inode, filp); } -int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +void truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; @@ -534,7 +525,6 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); - return nr_free; } void truncate_data_blocks(struct dnode_of_data *dn) @@ -682,8 +672,17 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = d_inode(path->dentry); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri; unsigned int flags; + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_inode_crtime(inode->i_sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = fi->i_crtime.tv_sec; + stat->btime.tv_nsec = fi->i_crtime.tv_nsec; + } + flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); if (flags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; @@ -755,6 +754,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + if (is_quota_modification(inode, attr)) { err = dquot_initialize(inode); if (err) @@ -770,14 +773,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return err; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - if (attr->ia_size <= i_size_read(inode)) { down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); @@ -1114,11 +1109,13 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), + sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), + sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1186,14 +1183,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + goto out_unlock; truncate_pagecache(inode, offset); @@ -1212,9 +1209,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); out_unlock: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); -out: up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); return ret; } @@ -1385,6 +1381,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1395,9 +1394,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); - truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1425,10 +1421,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); - - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); return ret; } @@ -1436,7 +1431,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1852,14 +1848,20 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); - if (sb && !IS_ERR(sb)) { + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } + if (sb) { f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ - f2fs_sync_fs(sb, 1); + ret = f2fs_sync_fs(sb, 1); + if (ret) + goto out; f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: @@ -1873,6 +1875,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = -EINVAL; goto out; } + + stop_gc_thread(sbi); + stop_discard_thread(sbi); + + drop_discard_cmd(sbi); + clear_opt(sbi, DISCARD); + f2fs_update_time(sbi, REQ_TIME); out: mnt_drop_write_file(filp); @@ -2084,9 +2093,10 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_defragment *range) { struct inode *inode = file_inode(filp); - struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct f2fs_map_blocks map = { .m_next_extent = NULL, + .m_seg_type = NO_CHECK_TYPE }; struct extent_info ei = {0,0,0}; - pgoff_t pg_start, pg_end; + pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; block_t blk_end = 0; @@ -2094,7 +2104,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update_policy(inode, NULL)) + if (should_update_inplace(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2120,6 +2130,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, } map.m_lblk = pg_start; + map.m_next_pgofs = &next_pgofs; /* * lookup mapping info in dnode page cache, skip defragmenting if all @@ -2133,14 +2144,16 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; if (!(map.m_flags & F2FS_MAP_FLAGS)) { - map.m_lblk++; + map.m_lblk = next_pgofs; continue; } - if (blk_end && blk_end != map.m_pblk) { + if (blk_end && blk_end != map.m_pblk) fragmented = true; - break; - } + + /* record total count of block that we're going to move */ + total += map.m_len; + blk_end = map.m_pblk + map.m_len; map.m_lblk += map.m_len; @@ -2149,10 +2162,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, if (!fragmented) goto out; - map.m_lblk = pg_start; - map.m_len = pg_end - pg_start; - - sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); + sec_num = (total + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -2164,6 +2174,10 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; } + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + total = 0; + while (map.m_lblk < pg_end) { pgoff_t idx; int cnt = 0; @@ -2175,7 +2189,7 @@ do_map: goto clear_out; if (!(map.m_flags & F2FS_MAP_FLAGS)) { - map.m_lblk++; + map.m_lblk = next_pgofs; continue; } @@ -2681,6 +2695,125 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) return 0; } +int f2fs_pin_file_control(struct inode *inode, bool inc) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* Use i_gc_failures for normal file as a risk signal. */ + if (inc) + f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + + if (fi->i_gc_failures > sbi->gc_pin_file_threshold) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: Enable GC = ino %lx after %x GC trials\n", + __func__, inode->i_ino, fi->i_gc_failures); + clear_inode_flag(inode, FI_PIN_FILE); + return -EAGAIN; + } + return 0; +} + +static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(pin, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + + if (!pin) { + clear_inode_flag(inode, FI_PIN_FILE); + F2FS_I(inode)->i_gc_failures = 1; + goto done; + } + + if (f2fs_pin_file_control(inode, false)) { + ret = -EAGAIN; + goto out; + } + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_PIN_FILE); + ret = F2FS_I(inode)->i_gc_failures; +done: + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin = 0; + + if (is_inode_flag_set(inode, FI_PIN_FILE)) + pin = F2FS_I(inode)->i_gc_failures; + return put_user(pin, (u32 __user *)arg); +} + +int f2fs_precache_extents(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_map_blocks map; + pgoff_t m_next_extent; + loff_t end; + int err; + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return -EOPNOTSUPP; + + map.m_lblk = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = &m_next_extent; + map.m_seg_type = NO_CHECK_TYPE; + end = F2FS_I_SB(inode)->max_file_blocks; + + while (map.m_lblk < end) { + map.m_len = end - map.m_lblk; + + down_write(&fi->dio_rwsem[WRITE]); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); + up_write(&fi->dio_rwsem[WRITE]); + if (err) + return err; + + map.m_lblk = m_next_extent; + } + + return err; +} + +static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) +{ + return f2fs_precache_extents(file_inode(filp)); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -2731,6 +2864,12 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_fsgetxattr(filp, arg); case F2FS_IOC_FSSETXATTR: return f2fs_ioc_fssetxattr(filp, arg); + case F2FS_IOC_GET_PIN_FILE: + return f2fs_ioc_get_pin_file(filp, arg); + case F2FS_IOC_SET_PIN_FILE: + return f2fs_ioc_set_pin_file(filp, arg); + case F2FS_IOC_PRECACHE_EXTENTS: + return f2fs_ioc_precache_extents(filp, arg); default: return -ENOTTY; } @@ -2806,6 +2945,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_FEATURES: case F2FS_IOC_FSGETXATTR: case F2FS_IOC_FSSETXATTR: + case F2FS_IOC_GET_PIN_FILE: + case F2FS_IOC_SET_PIN_FILE: + case F2FS_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d844dcb80570..aa720cc44509 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -624,6 +624,11 @@ static void move_data_block(struct inode *inode, block_t bidx, if (f2fs_is_atomic_file(inode)) goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_pin_file_control(inode, true); + goto out; + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -686,7 +691,12 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; - f2fs_submit_page_write(&fio); + err = f2fs_submit_page_write(&fio); + if (err) { + if (PageWriteback(fio.encrypted_page)) + end_page_writeback(fio.encrypted_page); + goto put_page_out; + } f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); @@ -720,6 +730,11 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (f2fs_is_atomic_file(inode)) goto out; + if (f2fs_is_pinned_file(inode)) { + if (gc_type == FG_GC) + f2fs_pin_file_control(inode, true); + goto out; + } if (gc_type == BG_GC) { if (PageWriteback(page)) @@ -1091,6 +1106,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) sbi->fggc_threshold = div64_u64((main_count - ovp_count) * BLKS_PER_SEC(sbi), (main_count - resv_count)); + sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ if (sbi->s_ndevs && sbi->segs_per_sec == 1) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 9325191fab2d..b0045d4c8d1e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -20,6 +20,8 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ +#define DEF_GC_FAILED_PINNED_FILES 2048 + /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b4c4f2b25304..205add3d0f3a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -22,6 +22,9 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) + return; + if (f2fs_inode_dirtied(inode, sync)) return; @@ -275,6 +278,12 @@ static int do_read_inode(struct inode *inode) i_projid = F2FS_DEF_PROJID; fi->i_projid = make_kprojid(&init_user_ns, i_projid); + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); + fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); + } + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -360,14 +369,15 @@ retry: return inode; } -int update_inode(struct inode *inode, struct page *node_page) +void update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; - f2fs_inode_synced(inode); - f2fs_wait_on_page_writeback(node_page, NODE, true); + set_page_dirty(node_page); + + f2fs_inode_synced(inode); ri = F2FS_INODE(node_page); @@ -417,6 +427,15 @@ int update_inode(struct inode *inode, struct page *node_page) F2FS_I(inode)->i_projid); ri->i_projid = cpu_to_le32(i_projid); } + + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_crtime)) { + ri->i_crtime = + cpu_to_le64(F2FS_I(inode)->i_crtime.tv_sec); + ri->i_crtime_nsec = + cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); + } } __set_inode_rdev(inode, ri); @@ -426,14 +445,12 @@ int update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - return set_page_dirty(node_page); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; - int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -444,11 +461,10 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - return 0; + return; } - ret = update_inode(inode, node_page); + update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -569,7 +585,7 @@ no_delete: !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } out_clear: - fscrypt_put_encryption_info(inode, NULL); + fscrypt_put_encryption_info(inode); clear_inode(inode); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 28bdf8828e73..b68e7b03959f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,7 +50,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + F2FS_I(inode)->i_crtime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); @@ -74,12 +75,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) goto fail_drop; + set_inode_flag(inode, FI_NEW_INODE); + /* If the directory encrypted, then we should encrypt the inode. */ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - set_inode_flag(inode, FI_NEW_INODE); - if (f2fs_sb_has_extra_attr(sbi->sb)) { set_inode_flag(inode, FI_EXTRA_ATTR); F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; @@ -240,9 +241,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir) && - !fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(dir)->i_projid, @@ -357,20 +358,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, trace_f2fs_lookup_start(dir, dentry, flags); - if (f2fs_encrypted_inode(dir)) { - err = fscrypt_get_encryption_info(dir); - - /* - * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is - * created while the directory was encrypted and we - * don't have access to the key. - */ - if (fscrypt_has_encryption_key(dir)) - fscrypt_set_encrypted_dentry(dentry); - fscrypt_set_d_op(dentry); - if (err && err != -ENOKEY) - goto out; - } + err = fscrypt_prepare_lookup(dir, dentry, flags); + if (err) + goto out; if (dentry->d_name.len > F2FS_NAME_LEN) { err = -ENAMETOOLONG; @@ -496,27 +486,16 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); - struct fscrypt_symlink_data *sd = NULL; + struct fscrypt_str disk_link; int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - - if (!fscrypt_has_encryption_key(dir)) - return -ENOKEY; - - disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + - sizeof(struct fscrypt_symlink_data)); - } - - if (disk_link.len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; err = dquot_initialize(dir); if (err) @@ -526,7 +505,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return PTR_ERR(inode); - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; @@ -536,38 +515,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) - goto out; + goto out_handle_failed_inode; f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(inode)) { - struct qstr istr = QSTR_INIT(symname, len); - struct fscrypt_str ostr; - - sd = kzalloc(disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto err_out; - } - - err = fscrypt_get_encryption_info(inode); - if (err) - goto err_out; - - if (!fscrypt_has_encryption_key(inode)) { - err = -ENOKEY; - goto err_out; - } - - ostr.name = sd->encrypted_path; - ostr.len = disk_link.len; - err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err) - goto err_out; - - sd->len = cpu_to_le16(ostr.len); - disk_link.name = (char *)sd; - } + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) + goto err_out; err = page_symlink(inode, disk_link.name, disk_link.len); @@ -594,12 +548,14 @@ err_out: f2fs_unlink(dir, dentry); } - kfree(sd); - f2fs_balance_fs(sbi, true); - return err; -out: + goto out_free_encrypted_link; + +out_handle_failed_inode: handle_failed_inode(inode); +out_free_encrypted_link: + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); return err; } @@ -800,18 +756,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if ((f2fs_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (f2fs_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !fscrypt_has_permitted_context(new_dir, old_inode)) { - err = -EPERM; - goto out; - } - if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, F2FS_I(old_dentry->d_inode)->i_projid))) @@ -958,6 +902,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_i_links_write(old_dir, false); } + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -1002,18 +947,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if ((f2fs_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (f2fs_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!fscrypt_has_permitted_context(new_dir, old_inode) || - !fscrypt_has_permitted_context(old_dir, new_inode))) - return -EPERM; - if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, F2FS_I(old_dentry->d_inode)->i_projid)) || @@ -1124,6 +1057,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); + add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_unlock_op(sbi); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) @@ -1153,9 +1089,16 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int err; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + if (flags & RENAME_EXCHANGE) { return f2fs_cross_rename(old_dir, old_dentry, new_dir, new_dentry); @@ -1171,68 +1114,20 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *cpage = NULL; - char *caddr, *paddr = NULL; - struct fscrypt_str cstr = FSTR_INIT(NULL, 0); - struct fscrypt_str pstr = FSTR_INIT(NULL, 0); - struct fscrypt_symlink_data *sd; - u32 max_size = inode->i_sb->s_blocksize; - int res; + struct page *page; + const char *target; if (!dentry) return ERR_PTR(-ECHILD); - res = fscrypt_get_encryption_info(inode); - if (res) - return ERR_PTR(res); - - cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) - return ERR_CAST(cpage); - caddr = page_address(cpage); - - /* Symlink is encrypted */ - sd = (struct fscrypt_symlink_data *)caddr; - cstr.name = sd->encrypted_path; - cstr.len = le16_to_cpu(sd->len); - - /* this is broken symlink case */ - if (unlikely(cstr.len == 0)) { - res = -ENOENT; - goto errout; - } - - if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { - /* Symlink data on the disk is corrupted */ - res = -EIO; - goto errout; - } - res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); - if (res) - goto errout; - - res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res) - goto errout; - - /* this is broken symlink case */ - if (unlikely(pstr.name[0] == 0)) { - res = -ENOENT; - goto errout; - } - - paddr = pstr.name; - - /* Null-terminate the name */ - paddr[pstr.len] = '\0'; + page = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(page)) + return ERR_CAST(page); - put_page(cpage); - set_delayed_call(done, kfree_link, paddr); - return paddr; -errout: - fscrypt_fname_free_buffer(&pstr); - put_page(cpage); - return ERR_PTR(res); + target = fscrypt_get_symlink(inode, page_address(page), + inode->i_sb->s_blocksize, done); + put_page(page); + return target; } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d3322752426f..177c438e4a56 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -143,11 +143,9 @@ static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) struct nat_entry *new; if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, - GFP_NOFS | __GFP_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); else - new = kmem_cache_alloc(nat_entry_slab, - GFP_NOFS | __GFP_ZERO); + new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); @@ -702,7 +700,6 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); @@ -1336,14 +1333,19 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .encrypted_page = NULL, .submitted = false, .io_type = io_type, + .io_wbc = wbc, }; trace_f2fs_writepage(page, NODE); + if (unlikely(f2fs_cp_error(sbi))) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; /* get old block addr of this node page */ nid = nid_of_node(page); @@ -1580,12 +1582,6 @@ next_step: struct page *page = pvec.pages[i]; bool submitted = false; - if (unlikely(f2fs_cp_error(sbi))) { - pagevec_release(&pvec); - ret = -EIO; - goto out; - } - /* * flushing sequence with step: * 0. indirect nodes @@ -1655,9 +1651,12 @@ continue_unlock: step++; goto next_step; } -out: + if (nwritten) f2fs_submit_merged_write(sbi, NODE); + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; return ret; } @@ -1812,8 +1811,33 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + nm_i->free_nid_count[nat_ofs]++; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } +} + /* return if the nid is recognized as free */ -static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +static bool add_free_nid(struct f2fs_sb_info *sbi, + nid_t nid, bool build, bool update) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *e; @@ -1829,8 +1853,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i->nid = nid; i->state = FREE_NID; - if (radix_tree_preload(GFP_NOFS)) - goto err; + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&nm_i->nid_list_lock); @@ -1871,9 +1894,14 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) ret = true; err = __insert_free_nid(sbi, i, FREE_NID); err_out: + if (update) { + update_free_nid_bitmap(sbi, nid, ret, build); + if (!build) + nm_i->available_nids++; + } spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); -err: + if (err) kmem_cache_free(free_nid_slab, i); return ret; @@ -1897,30 +1925,6 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); - unsigned int nid_ofs = nid - START_NID(nid); - - if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) - return; - - if (set) { - if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) - return; - __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - nm_i->free_nid_count[nat_ofs]++; - } else { - if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) - return; - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!build) - nm_i->free_nid_count[nat_ofs]--; - } -} - static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { @@ -1930,26 +1934,23 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; - if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) - return; - __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - bool freed = false; - if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) - freed = add_free_nid(sbi, start_nid, true); - spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, start_nid, freed, true); - spin_unlock(&NM_I(sbi)->nid_list_lock); + if (blk_addr == NULL_ADDR) { + add_free_nid(sbi, start_nid, true, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, false, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } } @@ -1967,7 +1968,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi) addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); + add_free_nid(sbi, nid, true, false); else remove_free_nid(sbi, nid); } @@ -1994,7 +1995,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) break; nid = i * NAT_ENTRY_PER_BLOCK + idx; - add_free_nid(sbi, nid, true); + add_free_nid(sbi, nid, true, false); if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; @@ -2037,10 +2038,13 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) down_read(&nm_i->nat_tree_lock); while (1) { - struct page *page = get_current_nat_page(sbi, nid); + if (!test_bit_le(NAT_BLOCK_OFFSET(nid), + nm_i->nat_block_bitmap)) { + struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); + scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + } nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); if (unlikely(nid >= nm_i->max_nid)) @@ -2203,7 +2207,9 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); - if (!(ri->i_inline & F2FS_INLINE_XATTR)) { + if (ri->i_inline & F2FS_INLINE_XATTR) { + set_inode_flag(inode, FI_INLINE_XATTR); + } else { clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -2219,7 +2225,7 @@ update_inode: f2fs_put_page(ipage, 1); } -int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; @@ -2233,7 +2239,6 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); - f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); @@ -2322,7 +2327,7 @@ retry: return 0; } -int restore_node_summary(struct f2fs_sb_info *sbi, +void restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2355,7 +2360,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, invalidate_mapping_pages(META_MAPPING(sbi), addr, addr + nrpages); } - return 0; } static void remove_nats_in_journal(struct f2fs_sb_info *sbi) @@ -2497,11 +2501,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { - add_free_nid(sbi, nid, false); - spin_lock(&NM_I(sbi)->nid_list_lock); - NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); - spin_unlock(&NM_I(sbi)->nid_list_lock); + add_free_nid(sbi, nid, false, true); } else { spin_lock(&NM_I(sbi)->nid_list_lock); update_free_nid_bitmap(sbi, nid, false, false); @@ -2582,8 +2582,8 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + F2FS_BLKSIZE - 1); - nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, - GFP_KERNEL); + nm_i->nat_bits = f2fs_kzalloc(sbi, + nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; @@ -2661,7 +2661,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - - F2FS_RESERVED_NODE_NUM; + sbi->nquota_files - F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; @@ -2708,17 +2708,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = f2fs_kvzalloc(sbi, nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; @@ -2729,7 +2729,8 @@ int build_node_manager(struct f2fs_sb_info *sbi) { int err; - sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL); + sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info), + GFP_KERNEL); if (!sbi->nm_info) return -ENOMEM; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0ee3e5ff49a3..081ef0d672bf 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -305,6 +305,10 @@ static inline bool is_recoverable_dnode(struct page *page) struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); __u64 cp_ver = cur_cp_version(ckpt); + /* Don't care crc part, if fsck.f2fs sets it. */ + if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) + return (cp_ver << 32) == (cpver_of_node(page) << 32); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b3a14b0429f2..337f3363f48f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -195,6 +195,20 @@ out: return err; } +static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_PIN_FILE) + set_inode_flag(inode, FI_PIN_FILE); + else + clear_inode_flag(inode, FI_PIN_FILE); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(inode, FI_DATA_EXIST); + else + clear_inode_flag(inode, FI_DATA_EXIST); + if (!(ri->i_inline & F2FS_INLINE_DOTS)) + clear_inode_flag(inode, FI_INLINE_DOTS); +} + static void recover_inode(struct inode *inode, struct page *page) { struct f2fs_inode *raw = F2FS_INODE(page); @@ -211,13 +225,16 @@ static void recover_inode(struct inode *inode, struct page *page) F2FS_I(inode)->i_advise = raw->i_advise; + recover_inline_flags(inode, raw); + if (file_enc_name(inode)) name = "<encrypted>"; else name = F2FS_INODE(page)->i_name; - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(page), name); + f2fs_msg(inode->i_sb, KERN_NOTICE, + "recover_inode: ino = %x, name = %s, inline = %x", + ino_of_node(page), name, raw->i_inline); } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, @@ -404,7 +421,7 @@ truncate_out: } static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, block_t blkaddr) + struct page *page) { struct dnode_of_data dn; struct node_info ni; @@ -415,7 +432,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - err = recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page); if (!err) recovered++; goto out; @@ -568,7 +585,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, break; } } - err = do_recover_data(sbi, entry->inode, page, blkaddr); + err = do_recover_data(sbi, entry->inode, page); if (err) { f2fs_put_page(page, 1); break; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c117e0913f2a..b16a8e6625aa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -248,7 +248,11 @@ retry: goto next; } get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + if (cur->old_addr == NEW_ADDR) { + invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } else + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, cur->old_addr, ni.version, true, true); f2fs_put_dnode(&dn); } @@ -657,7 +661,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) goto init_thread; } - fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); + fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); @@ -884,7 +888,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) bio_put(bio); } -void __check_sit_bitmap(struct f2fs_sb_info *sbi, +static void __check_sit_bitmap(struct f2fs_sb_info *sbi, block_t start, block_t end) { #ifdef CONFIG_F2FS_CHECK_FS @@ -1204,6 +1208,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); + if (list_empty(pend_list)) + goto next; f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { @@ -1222,6 +1228,7 @@ skip: break; } blk_finish_plug(&plug); +next: mutex_unlock(&dcc->cmd_lock); if (iter >= dpolicy->max_requests) @@ -1256,6 +1263,11 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } +void drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + __drop_discard_cmd(sbi); +} + static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { @@ -1324,7 +1336,7 @@ static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, } /* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; @@ -1394,6 +1406,8 @@ static int issue_discard_thread(void *data) msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; + if (f2fs_readonly(sbi->sb)) + continue; if (kthread_should_stop()) return 0; @@ -1703,25 +1717,20 @@ void init_discard_policy(struct discard_policy *dpolicy, dpolicy->sync = true; dpolicy->granularity = granularity; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = true; } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = true; } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = false; } } @@ -1737,7 +1746,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) goto init_thread; } - dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); if (!dcc) return -ENOMEM; @@ -2739,6 +2748,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } } + f2fs_bug_on(sbi, !IS_DATASEG(type)); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); @@ -2823,7 +2833,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static int read_compacted_summaries(struct f2fs_sb_info *sbi) +static void read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; @@ -2880,7 +2890,6 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) } } f2fs_put_page(page, 1); - return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) @@ -2926,13 +2935,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - int err; - - err = restore_node_summary(sbi, segno, sum); - if (err) { - f2fs_put_page(new, 1); - return err; - } + restore_node_summary(sbi, segno, sum); } } @@ -2971,8 +2974,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) META_CP, true); /* restore for compacted data summary */ - if (read_compacted_summaries(sbi)) - return -EINVAL; + read_compacted_summaries(sbi); type = CURSEG_HOT_NODE; } @@ -3108,28 +3110,19 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - struct page *src_page, *dst_page; + struct page *page; pgoff_t src_off, dst_off; - void *src_addr, *dst_addr; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - /* get current sit block page without lock */ - src_page = get_meta_page(sbi, src_off); - dst_page = grab_meta_page(sbi, dst_off); - f2fs_bug_on(sbi, PageDirty(src_page)); - - src_addr = page_address(src_page); - dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_SIZE); - - set_page_dirty(dst_page); - f2fs_put_page(src_page, 1); + page = grab_meta_page(sbi, dst_off); + seg_info_to_sit_page(sbi, page, start); + set_page_dirty(page); set_to_next_sit(sit_i, start); - return dst_page; + return page; } static struct sit_entry_set *grab_sit_entry_set(void) @@ -3338,52 +3331,54 @@ static int build_sit_info(struct f2fs_sb_info *sbi) unsigned int bitmap_size; /* allocate memory for SIT information */ - sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL); + sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = f2fs_kvzalloc(sbi, MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sentries[start].cur_valid_map_mir - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map_mir) return -ENOMEM; #endif if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); if (!sit_i->sentries[start].discard_map) return -ENOMEM; } } - sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->tmp_map) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = f2fs_kvzalloc(sbi, MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -3427,19 +3422,19 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ - free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL); + free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -3460,7 +3455,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); + array = f2fs_kzalloc(sbi, sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); if (!array) return -ENOMEM; @@ -3468,12 +3463,12 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); + array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); - array[i].journal = kzalloc(sizeof(struct f2fs_journal), - GFP_KERNEL); + array[i].journal = f2fs_kzalloc(sbi, + sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].segno = NULL_SEGNO; @@ -3482,7 +3477,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } -static void build_sit_entries(struct f2fs_sb_info *sbi) +static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); @@ -3492,6 +3487,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; + int err = 0; do { readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, @@ -3510,7 +3506,9 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); - check_block_count(sbi, start, &sit); + err = check_block_count(sbi, start, &sit); + if (err) + return err; seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ @@ -3545,7 +3543,9 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) old_valid_blocks = se->valid_blocks; - check_block_count(sbi, start, &sit); + err = check_block_count(sbi, start, &sit); + if (err) + break; seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { @@ -3565,6 +3565,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) se->valid_blocks - old_valid_blocks; } up_read(&curseg->journal_rwsem); + return err; } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -3619,7 +3620,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -3631,7 +3632,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) unsigned int bitmap_size, i; /* allocate memory for dirty segments list information */ - dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL); + dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), + GFP_KERNEL); if (!dirty_i) return -ENOMEM; @@ -3641,7 +3643,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -3685,7 +3688,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) struct f2fs_sm_info *sm_info; int err; - sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); + sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); if (!sm_info) return -ENOMEM; @@ -3737,7 +3740,9 @@ int build_segment_manager(struct f2fs_sb_info *sbi) return err; /* reinit free segmap based on SIT */ - build_sit_entries(sbi); + err = build_sit_entries(sbi); + if (err) + return err; init_free_segmap(sbi); err = build_dirty_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d1d394cdf61d..f11c4bc82c78 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -348,16 +348,41 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->mtime = le64_to_cpu(rs->mtime); } -static inline void seg_info_to_raw_sit(struct seg_entry *se, +static inline void __seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks; rs->vblocks = cpu_to_le16(raw_vblocks); memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + rs->mtime = cpu_to_le64(se->mtime); +} + +static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, + struct page *page, unsigned int start) +{ + struct f2fs_sit_block *raw_sit; + struct seg_entry *se; + struct f2fs_sit_entry *rs; + unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + int i; + + raw_sit = (struct f2fs_sit_block *)page_address(page); + for (i = 0; i < end - start; i++) { + rs = &raw_sit->entries[i]; + se = get_seg_entry(sbi, start + i); + __seg_info_to_raw_sit(se, rs); + } +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + __seg_info_to_raw_sit(se, rs); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); se->ckpt_valid_blocks = se->valid_blocks; - rs->mtime = cpu_to_le64(se->mtime); } static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, @@ -580,47 +605,6 @@ enum { F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update_policy(struct inode *inode, - struct f2fs_io_info *fio) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int policy = SM_I(sbi)->ipu_policy; - - if (test_opt(sbi, LFS)) - return false; - - /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) - return true; - - if (policy & (0x1 << F2FS_IPU_FORCE)) - return true; - if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) - return true; - if (policy & (0x1 << F2FS_IPU_UTIL) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - - /* - * IPU for rewrite async pages - */ - if (policy & (0x1 << F2FS_IPU_ASYNC) && - fio && fio->op == REQ_OP_WRITE && - !(fio->op_flags & REQ_SYNC) && - !f2fs_encrypted_inode(inode)) - return true; - - /* this is only set during fdatasync */ - if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(inode, FI_NEED_IPU)) - return true; - - return false; -} - static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { @@ -655,7 +639,7 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) /* * Summary block is always treated as an invalid block */ -static inline void check_block_count(struct f2fs_sb_info *sbi, +static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { #ifdef CONFIG_F2FS_CHECK_FS @@ -677,11 +661,25 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, cur_pos = next_pos; is_valid = !is_valid; } while (cur_pos < sbi->blocks_per_seg); - BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); + + if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Mismatch valid blocks %d vs. %d", + GET_SIT_VBLOCKS(raw_sit), valid_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EINVAL; + } #endif /* check segment usage, and check boundary of a given segment number */ - f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg - || segno > TOTAL_SEGS(sbi) - 1); + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong valid blocks %d or segno %u", + GET_SIT_VBLOCKS(raw_sit), segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EINVAL; + } + return 0; } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 708155d9c2e4..8173ae688814 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -43,6 +43,7 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", [FAULT_ALLOC_BIO] = "alloc bio", @@ -106,6 +107,9 @@ enum { Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, + Opt_reserve_root, + Opt_resgid, + Opt_resuid, Opt_mode, Opt_io_size_bits, Opt_fault_injection, @@ -156,6 +160,9 @@ static match_table_t f2fs_tokens = { {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, + {Opt_reserve_root, "reserve_root=%u"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, @@ -190,6 +197,28 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_end(args); } +static inline void limit_reserve_root(struct f2fs_sb_info *sbi) +{ + block_t limit = (sbi->user_block_count << 1) / 1000; + + /* limit is 0.2% */ + if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) { + sbi->root_reserved_blocks = limit; + f2fs_msg(sbi->sb, KERN_INFO, + "Reduce reserved blocks for root = %u", + sbi->root_reserved_blocks); + } + if (!test_opt(sbi, RESERVE_ROOT) && + (!uid_eq(sbi->s_resuid, + make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || + !gid_eq(sbi->s_resgid, + make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) + f2fs_msg(sbi->sb, KERN_INFO, + "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + from_kuid_munged(&init_user_ns, sbi->s_resuid), + from_kgid_munged(&init_user_ns, sbi->s_resgid)); +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -320,6 +349,8 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; + kuid_t uid; + kgid_t gid; #ifdef CONFIG_QUOTA int ret; #endif @@ -487,6 +518,40 @@ static int parse_options(struct super_block *sb, char *options) case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; + case Opt_reserve_root: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (test_opt(sbi, RESERVE_ROOT)) { + f2fs_msg(sb, KERN_INFO, + "Preserve previous reserve_root=%u", + sbi->root_reserved_blocks); + } else { + sbi->root_reserved_blocks = arg; + set_opt(sbi, RESERVE_ROOT); + } + break; + case Opt_resuid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid uid value %d", arg); + return -EINVAL; + } + sbi->s_resuid = uid; + break; + case Opt_resgid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid gid value %d", arg); + return -EINVAL; + } + sbi->s_resgid = gid; + break; case Opt_mode: name = match_strdup(&args[0]); @@ -993,22 +1058,25 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - block_t total_count, user_block_count, start_count, ovp_count; + block_t total_count, user_block_count, start_count; u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); - ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; + if (buf->f_bfree > sbi->root_reserved_blocks) + buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks; + else + buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + avail_node_count = sbi->total_node_count - sbi->nquota_files - + F2FS_RESERVED_NODE_NUM; if (avail_node_count > user_block_count) { buf->f_files = user_block_count; @@ -1134,6 +1202,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (test_opt(sbi, RESERVE_ROOT)) + seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + sbi->root_reserved_blocks, + from_kuid_munged(&init_user_ns, sbi->s_resuid), + from_kgid_munged(&init_user_ns, sbi->s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1263,7 +1336,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else { + } else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { @@ -1332,6 +1405,7 @@ skip: sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + limit_reserve_root(sbi); return 0; restore_gc: if (need_restart_gc) { @@ -1656,7 +1730,7 @@ void f2fs_quota_off_umount(struct super_block *sb) f2fs_quota_off(sb, type); } -int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { *projid = F2FS_I(inode)->i_projid; return 0; @@ -2148,14 +2222,15 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; - FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + FDEV(devi).blkz_type = f2fs_kmalloc(sbi, FDEV(devi).nr_blkz, + GFP_KERNEL); if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 - zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), - GFP_KERNEL); + zones = f2fs_kzalloc(sbi, sizeof(struct blk_zone) * + F2FS_REPORT_NR_ZONES, GFP_KERNEL); if (!zones) return -ENOMEM; @@ -2295,8 +2370,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) * Initialize multiple devices information, or single * zoned block device information. */ - sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), - GFP_KERNEL); + sbi->devs = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_info) * + max_devices, GFP_KERNEL); if (!sbi->devs) return -ENOMEM; @@ -2419,6 +2494,9 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -2462,6 +2540,13 @@ try_onemore: else sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; + + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + sbi->nquota_files++; + } + } #endif sb->s_op = &f2fs_sops; @@ -2475,6 +2560,7 @@ try_onemore: sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + sb->s_iflags |= SB_I_CGROUPWB; /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; @@ -2495,8 +2581,9 @@ try_onemore: int n = (i == META) ? 1: NR_TEMP_TYPE; int j; - sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), - GFP_KERNEL); + sbi->write_io[i] = f2fs_kmalloc(sbi, + n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; goto free_options; @@ -2517,14 +2604,14 @@ try_onemore: err = init_percpu_info(sbi); if (err) - goto free_options; + goto free_bio_info; if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); if (!sbi->write_io_dummy) { err = -ENOMEM; - goto free_options; + goto free_percpu; } } @@ -2559,6 +2646,7 @@ try_onemore: sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; sbi->current_reserved_blocks = 0; + limit_reserve_root(sbi); for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -2604,18 +2692,16 @@ try_onemore: goto free_nm; } - f2fs_join_shrinker(sbi); - err = f2fs_build_stats(sbi); if (err) - goto free_nm; + goto free_node_inode; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); - goto free_node_inode; + goto free_stats; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); @@ -2711,6 +2797,8 @@ skip_recovery: sbi->valid_super_block ? 1 : 2, err); } + f2fs_join_shrinker(sbi); + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); @@ -2737,14 +2825,12 @@ free_sysfs: free_root_inode: dput(sb->s_root); sb->s_root = NULL; +free_stats: + f2fs_destroy_stats(sbi); free_node_inode: - truncate_inode_pages_final(NODE_MAPPING(sbi)); - mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); - f2fs_leave_shrinker(sbi); + truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); - mutex_unlock(&sbi->umount_mutex); - f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: @@ -2757,10 +2843,12 @@ free_meta_inode: iput(sbi->meta_inode); free_io_dummy: mempool_destroy(sbi->write_io_dummy); -free_options: +free_percpu: + destroy_percpu_info(sbi); +free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); - destroy_percpu_info(sbi); +free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9835348b6e5d..d978c7b6ea04 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -113,6 +113,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_quota_ino(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); + if (f2fs_sb_has_inode_crtime(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_crtime"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -162,7 +165,8 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if (t > (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)(sbi->user_block_count - + sbi->root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -231,6 +235,7 @@ enum feat_id { FEAT_INODE_CHECKSUM, FEAT_FLEXIBLE_INLINE_XATTR, FEAT_QUOTA_INO, + FEAT_INODE_CRTIME, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -245,6 +250,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_INODE_CHECKSUM: case FEAT_FLEXIBLE_INLINE_XATTR: case FEAT_QUOTA_INO: + case FEAT_INODE_CRTIME: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -299,6 +305,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -320,6 +328,7 @@ F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); +F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -346,6 +355,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(iostat_enable), + ATTR_LIST(readdir_ra), + ATTR_LIST(gc_pin_file_thresh), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), @@ -371,6 +382,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(inode_checksum), ATTR_LIST(flexible_inline_xattr), ATTR_LIST(quota_ino), + ATTR_LIST(inode_crtime), NULL, }; diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index bccbbf2616d2..a1fcd00bbb2b 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -17,7 +17,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; +static struct mutex pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -64,7 +64,7 @@ void f2fs_trace_pid(struct page *page) if (radix_tree_preload(GFP_NOFS)) return; - spin_lock(&pids_lock); + mutex_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; @@ -77,7 +77,7 @@ void f2fs_trace_pid(struct page *page) MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - spin_unlock(&pids_lock); + mutex_unlock(&pids_lock); radix_tree_preload_end(); } @@ -122,7 +122,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - spin_lock_init(&pids_lock); + mutex_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -150,7 +150,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - spin_lock(&pids_lock); + mutex_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -158,5 +158,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - spin_unlock(&pids_lock); + mutex_unlock(&pids_lock); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ec8961ef8cac..ae2dfa709f5d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -298,8 +298,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, - GFP_F2FS_ZERO); + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -351,8 +351,8 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, void *txattr_addr; int err; - txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, - GFP_F2FS_ZERO); + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -433,6 +433,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (F2FS_I(inode)->i_xattr_nid) { xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); alloc_nid_failed(sbi, new_nid); goto in_page_out; } @@ -443,6 +444,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); alloc_nid_failed(sbi, new_nid); goto in_page_out; } @@ -598,7 +600,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, goto exit; } - if (f2fs_xattr_value_same(here, value, size)) + if (value && f2fs_xattr_value_same(here, value, size)) goto exit; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index b833ffeee1e1..8e100c3bf72c 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/uaccess.h> +#include <linux/iversion.h> #include "fat.h" /* @@ -1055,7 +1056,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) brelse(bh); if (err) return err; - dir->i_version++; + inode_inc_iversion(dir); if (nr_slots) { /* diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 20a0a89eaca5..ffbbf0520d9e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -20,6 +20,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <asm/unaligned.h> +#include <linux/iversion.h> #include "fat.h" #ifndef CONFIG_FAT_DEFAULT_IOCHARSET @@ -507,7 +508,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) MSDOS_I(inode)->i_pos = 0; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; - inode->i_version++; + inode_inc_iversion(inode); inode->i_generation = get_seconds(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { @@ -590,7 +591,7 @@ struct inode *fat_build_inode(struct super_block *sb, goto out; } inode->i_ino = iunique(sb, MSDOS_ROOT_INO); - inode->i_version = 1; + inode_set_iversion(inode, 1); err = fat_fill_inode(inode, de); if (err) { iput(inode); @@ -1377,7 +1378,7 @@ static int fat_read_root(struct inode *inode) MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; - inode->i_version++; + inode_inc_iversion(inode); inode->i_generation = 0; inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO); inode->i_op = sbi->dir_ops; @@ -1828,7 +1829,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (!root_inode) goto out_fail; root_inode->i_ino = MSDOS_ROOT_INO; - root_inode->i_version = 1; + inode_set_iversion(root_inode, 1); error = fat_read_root(root_inode); if (error < 0) { iput(root_inode); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index d24d2758a363..582ca731a6c9 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -7,6 +7,7 @@ */ #include <linux/module.h> +#include <linux/iversion.h> #include "fat.h" /* Characters that are undesirable in an MS-DOS file name */ @@ -480,7 +481,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, } else mark_inode_dirty(old_inode); - old_dir->i_version++; + inode_inc_iversion(old_dir); old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); @@ -508,7 +509,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, goto out; new_i_pos = sinfo.i_pos; } - new_dir->i_version++; + inode_inc_iversion(new_dir); fat_detach(old_inode); fat_attach(old_inode, new_i_pos); @@ -540,7 +541,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, old_sinfo.bh = NULL; if (err) goto error_dotdot; - old_dir->i_version++; + inode_inc_iversion(old_dir); old_dir->i_ctime = old_dir->i_mtime = ts; if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 02c066663a3a..cefea792cde8 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -20,7 +20,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/kernel.h> - +#include <linux/iversion.h> #include "fat.h" static inline unsigned long vfat_d_version(struct dentry *dentry) @@ -46,7 +46,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (vfat_d_version(dentry) != d_inode(dentry->d_parent)->i_version) + if (inode_cmp_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry))) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -759,7 +759,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: mutex_unlock(&MSDOS_SB(sb)->s_lock); if (!inode) - vfat_d_version_set(dentry, dir->i_version); + vfat_d_version_set(dentry, inode_query_iversion(dir)); return d_splice_alias(inode, dentry); error: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -781,7 +781,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); if (err) goto out; - dir->i_version++; + inode_inc_iversion(dir); inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); brelse(sinfo.bh); @@ -789,7 +789,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, err = PTR_ERR(inode); goto out; } - inode->i_version++; + inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ @@ -823,7 +823,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = current_time(inode); fat_detach(inode); - vfat_d_version_set(dentry, dir->i_version); + vfat_d_version_set(dentry, inode_query_iversion(dir)); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -849,7 +849,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = current_time(inode); fat_detach(inode); - vfat_d_version_set(dentry, dir->i_version); + vfat_d_version_set(dentry, inode_query_iversion(dir)); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -875,7 +875,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); if (err) goto out_free; - dir->i_version++; + inode_inc_iversion(dir); inc_nlink(dir); inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); @@ -885,7 +885,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) /* the directory was completed, just return a error */ goto out; } - inode->i_version++; + inode_inc_iversion(inode); set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ @@ -951,7 +951,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_i_pos = sinfo.i_pos; } - new_dir->i_version++; + inode_inc_iversion(new_dir); fat_detach(old_inode); fat_attach(old_inode, new_i_pos); @@ -979,7 +979,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, old_sinfo.bh = NULL; if (err) goto error_dotdot; - old_dir->i_version++; + inode_inc_iversion(old_dir); old_dir->i_ctime = old_dir->i_mtime = ts; if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); diff --git a/fs/fcntl.c b/fs/fcntl.c index 0522e283a4f4..e95fa0a352ea 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -418,7 +418,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_ADD_SEALS: case F_GET_SEALS: - err = shmem_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, arg); break; case F_GET_RW_HINT: case F_SET_RW_HINT: @@ -690,7 +690,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, /* Table to convert sigio signal codes into poll band bitmaps */ -static const long band_table[NSIGPOLL] = { +static const __poll_t band_table[NSIGPOLL] = { POLLIN | POLLRDNORM, /* POLL_IN */ POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ @@ -737,6 +737,7 @@ static void send_sigio_to_task(struct task_struct *p, delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ + clear_siginfo(&si); si.si_signo = signum; si.si_errno = 0; si.si_code = reason; @@ -758,7 +759,7 @@ static void send_sigio_to_task(struct task_struct *p, if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else - si.si_band = band_table[reason - POLL_IN]; + si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, group)) break; diff --git a/fs/fhandle.c b/fs/fhandle.c index 0ace128f5d23..0ee727485615 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -69,8 +69,7 @@ static long do_sys_name_to_handle(struct path *path, } else retval = 0; /* copy the mount id */ - if (copy_to_user(mnt_id, &real_mount(path->mnt)->mnt_id, - sizeof(*mnt_id)) || + if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || copy_to_user(ufh, handle, sizeof(struct file_handle) + handle_bytes)) retval = -EFAULT; diff --git a/fs/file.c b/fs/file.c index 3b080834b870..42f0db4bd0fb 100644 --- a/fs/file.c +++ b/fs/file.c @@ -11,18 +11,13 @@ #include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/time.h> #include <linux/sched/signal.h> #include <linux/slab.h> -#include <linux/vmalloc.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> -#include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> -#include <linux/workqueue.h> unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -391,7 +386,7 @@ static struct fdtable *close_files(struct files_struct * files) struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); - cond_resched_rcu_qs(); + cond_resched(); } } i++; diff --git a/fs/file_table.c b/fs/file_table.c index 2dc9f38bd195..7ec0b3e5f05d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -23,7 +23,6 @@ #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> -#include <linux/hardirq.h> #include <linux/task_work.h> #include <linux/ima.h> #include <linux/swap.h> diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index f989efa051a0..48b24bb50d02 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -332,9 +332,13 @@ vxfs_init(void) { int rv; - vxfs_inode_cachep = kmem_cache_create("vxfs_inode", + vxfs_inode_cachep = kmem_cache_create_usercopy("vxfs_inode", sizeof(struct vxfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + offsetof(struct vxfs_inode_info, vii_immed.vi_immed), + sizeof_field(struct vxfs_inode_info, + vii_immed.vi_immed), + NULL); if (!vxfs_inode_cachep) return -ENOMEM; rv = register_filesystem(&vxfs_fs_type); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cea4836385b7..d4d04fee568a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -126,7 +126,7 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback - * @head: one of @wb->b_{dirty|io|more_io} + * @head: one of @wb->b_{dirty|io|more_io|dirty_time} * * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 17f0d05bfd4c..aa089a6925d0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2004,9 +2004,9 @@ out: return ret; } -static unsigned fuse_dev_poll(struct file *file, poll_table *wait) +static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) { - unsigned mask = POLLOUT | POLLWRNORM; + __poll_t mask = POLLOUT | POLLWRNORM; struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index cb7dff5c45d7..e85e974dd211 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2751,7 +2751,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc, spin_unlock(&fc->lock); } -unsigned fuse_file_poll(struct file *file, poll_table *wait) +__poll_t fuse_file_poll(struct file *file, poll_table *wait) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -2764,7 +2764,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); - inarg.events = (__u32)poll_requested_events(wait); + inarg.events = mangle_poll(poll_requested_events(wait)); /* * Ask for notification iff there's someone waiting for it. @@ -2786,7 +2786,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) err = fuse_simple_request(fc, &args); if (!err) - return outarg.revents; + return demangle_poll(outarg.revents); if (err == -ENOSYS) { fc->no_poll = 1; return DEFAULT_POLLMASK; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d5773ca67ad2..c4c093bbf456 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -951,7 +951,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); -unsigned fuse_file_poll(struct file *file, poll_table *wait); +__poll_t fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 43c827a7cce5..3ed2b088dcfd 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -3,6 +3,7 @@ config GFS2_FS depends on (64BIT || LBDAF) select FS_POSIX_ACL select CRC32 + select LIBCRC32C select QUOTACTL select FS_IOMAP help diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1daf15a1f00c..2f725b4a386b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -39,18 +39,21 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int to) + unsigned int from, unsigned int len) { struct buffer_head *head = page_buffers(page); unsigned int bsize = head->b_size; struct buffer_head *bh; + unsigned int to = from + len; unsigned int start, end; for (bh = head, start = 0; bh != head || !start; bh = bh->b_this_page, start = end) { end = start + bsize; - if (end <= from || start >= to) + if (end <= from) continue; + if (start >= to) + break; if (gfs2_is_jdata(ip)) set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); @@ -189,7 +192,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w create_empty_buffers(page, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } - gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize); } return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc); } @@ -255,7 +258,6 @@ static int gfs2_writepages(struct address_space *mapping, * @wbc: The writeback control * @pvec: The vector of pages * @nr_pages: The number of pages to write - * @end: End position * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise @@ -264,7 +266,7 @@ static int gfs2_writepages(struct address_space *mapping, static int gfs2_write_jdata_pagevec(struct address_space *mapping, struct writeback_control *wbc, struct pagevec *pvec, - int nr_pages, pgoff_t end, + int nr_pages, pgoff_t *done_index) { struct inode *inode = mapping->host; @@ -402,7 +404,7 @@ retry: if (nr_pages == 0) break; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index); + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, &done_index); if (ret) done = 1; if (ret > 0) @@ -446,7 +448,8 @@ static int gfs2_jdata_writepages(struct address_space *mapping, ret = gfs2_write_cache_jdata(mapping, wbc); if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { - gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_JDATA_WPAGES); ret = gfs2_write_cache_jdata(mapping, wbc); } return ret; @@ -483,8 +486,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) return error; kaddr = kmap_atomic(page); - if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) - dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + if (dsize > gfs2_max_stuffed_size(ip)) + dsize = gfs2_max_stuffed_size(ip); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, PAGE_SIZE - dsize); kunmap_atomic(kaddr); @@ -501,10 +504,9 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) * @file: The file to read a page for * @page: The page to read * - * This is the core of gfs2's readpage. Its used by the internal file - * reading code as in that case we already hold the glock. Also its + * This is the core of gfs2's readpage. It's used by the internal file + * reading code as in that case we already hold the glock. Also it's * called by gfs2_readpage() once the required lock has been granted. - * */ static int __gfs2_readpage(void *file, struct page *page) @@ -725,7 +727,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (gfs2_is_stuffed(ip)) { error = 0; - if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + if (pos + len > gfs2_max_stuffed_size(ip)) { error = gfs2_unstuff_dinode(ip, page); if (error == 0) goto prepare_write; @@ -832,7 +834,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + BUG_ON(pos + len > gfs2_max_stuffed_size(ip)); + kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); flush_dcache_page(page); @@ -890,8 +893,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; - unsigned int from = pos & (PAGE_SIZE - 1); - unsigned int to = from + len; int ret; struct gfs2_trans *tr = current->journal_info; BUG_ON(!tr); @@ -909,7 +910,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); + gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); if (tr->tr_num_buf_new) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d5f0d96169c5..86863792f36a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -69,8 +69,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, void *kaddr = kmap(page); u64 dsize = i_size_read(inode); - if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) - dsize = dibh->b_size - sizeof(struct gfs2_dinode); + if (dsize > gfs2_max_stuffed_size(ip)) + dsize = gfs2_max_stuffed_size(ip); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, PAGE_SIZE - dsize); @@ -279,14 +279,13 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp return p + mp->mp_list[height]; } -static void gfs2_metapath_ra(struct gfs2_glock *gl, - const struct buffer_head *bh, const __be64 *pos) +static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) { - struct buffer_head *rabh; - const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); const __be64 *t; - for (t = pos; t < endp; t++) { + for (t = start; t < end; t++) { + struct buffer_head *rabh; + if (!*t) continue; @@ -305,21 +304,22 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, } } -/** - * lookup_mp_height - helper function for lookup_metapath - * @ip: the inode - * @mp: the metapath - * @h: the height which needs looking up - */ -static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h) +static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, + unsigned int x, unsigned int h) { - __be64 *ptr = metapointer(h, mp); - u64 dblock = be64_to_cpu(*ptr); - - if (!dblock) - return h + 1; + for (; x < h; x++) { + __be64 *ptr = metapointer(x, mp); + u64 dblock = be64_to_cpu(*ptr); + int ret; - return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]); + if (!dblock) + break; + ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]); + if (ret) + return ret; + } + mp->mp_aheight = x + 1; + return 0; } /** @@ -336,25 +336,12 @@ static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h) * at which it found the unallocated block. Blocks which are found are * added to the mp->mp_bh[] list. * - * Returns: error or height of metadata tree + * Returns: error */ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { - unsigned int end_of_metadata = ip->i_height - 1; - unsigned int x; - int ret; - - for (x = 0; x < end_of_metadata; x++) { - ret = lookup_mp_height(ip, mp, x); - if (ret) - goto out; - } - - ret = ip->i_height; -out: - mp->mp_aheight = ret; - return ret; + return __fillup_metapath(ip, mp, 0, ip->i_height - 1); } /** @@ -365,25 +352,25 @@ out: * * Similar to lookup_metapath, but does lookups for a range of heights * - * Returns: error or height of metadata tree + * Returns: error or the number of buffers filled */ static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) { - unsigned int start_h = h - 1; + unsigned int x = 0; int ret; if (h) { /* find the first buffer we need to look up. */ - while (start_h > 0 && mp->mp_bh[start_h] == NULL) - start_h--; - for (; start_h < h; start_h++) { - ret = lookup_mp_height(ip, mp, start_h); - if (ret) - return ret; + for (x = h - 1; x > 0; x--) { + if (mp->mp_bh[x]) + break; } } - return ip->i_height; + ret = __fillup_metapath(ip, mp, x, h); + if (ret) + return ret; + return mp->mp_aheight - x - 1; } static inline void release_metapath(struct metapath *mp) @@ -474,13 +461,6 @@ enum alloc_state { /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ }; -static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) -{ - if (hgt) - return sdp->sd_inptrs; - return sdp->sd_diptrs; -} - /** * gfs2_bmap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode @@ -788,7 +768,7 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, goto do_alloc; ret = lookup_metapath(ip, &mp); - if (ret < 0) + if (ret) goto out_release; if (mp.mp_aheight != ip->i_height) @@ -913,17 +893,18 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi } /** - * gfs2_block_truncate_page - Deal with zeroing out data for truncate + * gfs2_block_zero_range - Deal with zeroing out data * * This is partly borrowed from ext3. */ -static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) +static int gfs2_block_zero_range(struct inode *inode, loff_t from, + unsigned int length) { - struct inode *inode = mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); unsigned long index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, iblock, length, pos; + unsigned blocksize, iblock, pos; struct buffer_head *bh; struct page *page; int err; @@ -933,7 +914,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) return 0; blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) @@ -1003,11 +983,24 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize int error; while (oldsize != newsize) { + struct gfs2_trans *tr; + unsigned int offs; + chunk = oldsize - newsize; if (chunk > max_chunk) chunk = max_chunk; + + offs = oldsize & ~PAGE_MASK; + if (offs && chunk > PAGE_SIZE) + chunk = offs + ((chunk - offs) & PAGE_MASK); + truncate_pagecache(inode, oldsize - chunk); oldsize -= chunk; + + tr = current->journal_info; + if (!test_bit(TR_TOUCHED, &tr->tr_flags)) + continue; + gfs2_trans_end(sdp); error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); if (error) @@ -1017,13 +1010,13 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize return 0; } -static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) +static int trunc_start(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct address_space *mapping = inode->i_mapping; - struct buffer_head *dibh; + struct buffer_head *dibh = NULL; int journaled = gfs2_is_jdata(ip); + u64 oldsize = inode->i_size; int error; if (journaled) @@ -1042,10 +1035,13 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) if (gfs2_is_stuffed(ip)) { gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); } else { - if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { - error = gfs2_block_truncate_page(mapping, newsize); + unsigned int blocksize = i_blocksize(inode); + unsigned int offs = newsize & (blocksize - 1); + if (offs) { + error = gfs2_block_zero_range(inode, newsize, + blocksize - offs); if (error) - goto out_brelse; + goto out; } ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; } @@ -1059,15 +1055,10 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) else truncate_pagecache(inode, newsize); - if (error) { - brelse(dibh); - return error; - } - -out_brelse: - brelse(dibh); out: - gfs2_trans_end(sdp); + brelse(dibh); + if (current->journal_info) + gfs2_trans_end(sdp); return error; } @@ -1075,10 +1066,11 @@ out: * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein * @ip: inode * @rg_gh: holder of resource group glock - * @mp: current metapath fully populated with buffers + * @bh: buffer head to sweep + * @start: starting point in bh + * @end: end point in bh + * @meta: true if bh points to metadata (rather than data) * @btotal: place to keep count of total blocks freed - * @hgt: height we're processing - * @first: true if this is the first call to this function for this height * * We sweep a metadata buffer (provided by the metapath) for blocks we need to * free, and free them all. However, we do it one rgrp at a time. If this @@ -1093,47 +1085,46 @@ out: * *btotal has the total number of blocks freed */ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, - const struct metapath *mp, u32 *btotal, int hgt, - bool preserve1) + struct buffer_head *bh, __be64 *start, __be64 *end, + bool meta, u32 *btotal) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_trans *tr; - struct buffer_head *bh = mp->mp_bh[hgt]; - __be64 *top, *bottom, *p; + __be64 *p; int blks_outside_rgrp; u64 bn, bstart, isize_blks; s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */ - int meta = ((hgt != ip->i_height - 1) ? 1 : 0); int ret = 0; bool buf_in_tr = false; /* buffer was added to transaction */ - if (gfs2_metatype_check(sdp, bh, - (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI))) - return -EIO; - more_rgrps: + rgd = NULL; + if (gfs2_holder_initialized(rd_gh)) { + rgd = gfs2_glock2rgrp(rd_gh->gh_gl); + gfs2_assert_withdraw(sdp, + gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); + } blks_outside_rgrp = 0; bstart = 0; blen = 0; - top = metapointer(hgt, mp); /* first ptr from metapath */ - /* If we're keeping some data at the truncation point, we've got to - preserve the metadata tree by adding 1 to the starting metapath. */ - if (preserve1) - top++; - bottom = (__be64 *)(bh->b_data + bh->b_size); - - for (p = top; p < bottom; p++) { + for (p = start; p < end; p++) { if (!*p) continue; bn = be64_to_cpu(*p); - if (gfs2_holder_initialized(rd_gh)) { - rgd = gfs2_glock2rgrp(rd_gh->gh_gl); - gfs2_assert_withdraw(sdp, - gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); + + if (rgd) { + if (!rgrp_contains_block(rgd, bn)) { + blks_outside_rgrp++; + continue; + } } else { - rgd = gfs2_blk2rgrpd(sdp, bn, false); + rgd = gfs2_blk2rgrpd(sdp, bn, true); + if (unlikely(!rgd)) { + ret = -EIO; + goto out; + } ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, rd_gh); if (ret) @@ -1145,11 +1136,6 @@ more_rgrps: gfs2_rs_deltree(&ip->i_res); } - if (!rgrp_contains_block(rgd, bn)) { - blks_outside_rgrp++; - continue; - } - /* The size of our transactions will be unknown until we actually process all the metadata blocks that relate to the rgrp. So we estimate. We know it can't be more than @@ -1168,7 +1154,7 @@ more_rgrps: jblocks_rqsted += isize_blks; revokes = jblocks_rqsted; if (meta) - revokes += hptrs(sdp, hgt); + revokes += end - start; else if (ip->i_depth) revokes += sdp->sd_inptrs; ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); @@ -1226,7 +1212,11 @@ out_unlock: outside the rgrp we just processed, do it all over again. */ if (current->journal_info) { - struct buffer_head *dibh = mp->mp_bh[0]; + struct buffer_head *dibh; + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + goto out; /* Every transaction boundary, we rewrite the dinode to keep its di_blocks current in case of failure. */ @@ -1234,6 +1224,7 @@ out_unlock: current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); } @@ -1245,38 +1236,48 @@ out: return ret; } +static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h) +{ + if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0]))) + return false; + return true; +} + /** * find_nonnull_ptr - find a non-null pointer given a metapath and height - * assumes the metapath is valid (with buffers) out to height h * @mp: starting metapath * @h: desired height to search * + * Assumes the metapath is valid (with buffers) out to height h. * Returns: true if a non-null pointer was found in the metapath buffer * false if all remaining pointers are NULL in the buffer */ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, - unsigned int h) + unsigned int h, + __u16 *end_list, unsigned int end_aligned) { - __be64 *ptr; - unsigned int ptrs = hptrs(sdp, h) - 1; + struct buffer_head *bh = mp->mp_bh[h]; + __be64 *first, *ptr, *end; + + first = metaptr1(h, mp); + ptr = first + mp->mp_list[h]; + end = (__be64 *)(bh->b_data + bh->b_size); + if (end_list && mp_eq_to_hgt(mp, end_list, h)) { + bool keep_end = h < end_aligned; + end = first + end_list[h] + keep_end; + } - while (true) { - ptr = metapointer(h, mp); + while (ptr < end) { if (*ptr) { /* if we have a non-null pointer */ - /* Now zero the metapath after the current height. */ + mp->mp_list[h] = ptr - first; h++; if (h < GFS2_MAX_META_HEIGHT) - memset(&mp->mp_list[h], 0, - (GFS2_MAX_META_HEIGHT - h) * - sizeof(mp->mp_list[0])); + mp->mp_list[h] = 0; return true; } - - if (mp->mp_list[h] < ptrs) - mp->mp_list[h]++; - else - return false; /* no more pointers in this buffer */ + ptr++; } + return false; } enum dealloc_states { @@ -1286,49 +1287,126 @@ enum dealloc_states { DEALLOC_DONE = 3, /* process complete */ }; -static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h) +static inline void +metapointer_range(struct metapath *mp, int height, + __u16 *start_list, unsigned int start_aligned, + __u16 *end_list, unsigned int end_aligned, + __be64 **start, __be64 **end) { - if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0]))) - return false; - return true; + struct buffer_head *bh = mp->mp_bh[height]; + __be64 *first; + + first = metaptr1(height, mp); + *start = first; + if (mp_eq_to_hgt(mp, start_list, height)) { + bool keep_start = height < start_aligned; + *start = first + start_list[height] + keep_start; + } + *end = (__be64 *)(bh->b_data + bh->b_size); + if (end_list && mp_eq_to_hgt(mp, end_list, height)) { + bool keep_end = height < end_aligned; + *end = first + end_list[height] + keep_end; + } +} + +static inline bool walk_done(struct gfs2_sbd *sdp, + struct metapath *mp, int height, + __u16 *end_list, unsigned int end_aligned) +{ + __u16 end; + + if (end_list) { + bool keep_end = height < end_aligned; + if (!mp_eq_to_hgt(mp, end_list, height)) + return false; + end = end_list[height] + keep_end; + } else + end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs; + return mp->mp_list[height] >= end; } /** - * trunc_dealloc - truncate a file down to a desired size + * punch_hole - deallocate blocks in a file * @ip: inode to truncate - * @newsize: The desired size of the file + * @offset: the start of the hole + * @length: the size of the hole (or 0 for truncate) + * + * Punch a hole into a file or truncate a file at a given position. This + * function operates in whole blocks (@offset and @length are rounded + * accordingly); partially filled blocks must be cleared otherwise. * - * This function truncates a file to newsize. It works from the - * bottom up, and from the right to the left. In other words, it strips off - * the highest layer (data) before stripping any of the metadata. Doing it - * this way is best in case the operation is interrupted by power failure, etc. - * The dinode is rewritten in every transaction to guarantee integrity. + * This function works from the bottom up, and from the right to the left. In + * other words, it strips off the highest layer (data) before stripping any of + * the metadata. Doing it this way is best in case the operation is interrupted + * by power failure, etc. The dinode is rewritten in every transaction to + * guarantee integrity. */ -static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) +static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct metapath mp; + struct metapath mp = {}; struct buffer_head *dibh, *bh; struct gfs2_holder rd_gh; - u64 lblock; - __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */ + unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; + u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift; + __u16 start_list[GFS2_MAX_META_HEIGHT]; + __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; + unsigned int start_aligned, uninitialized_var(end_aligned); unsigned int strip_h = ip->i_height - 1; u32 btotal = 0; int ret, state; int mp_h; /* metapath buffers are read in to this height */ - sector_t last_ra = 0; u64 prev_bnr = 0; - bool preserve1; /* need to preserve the first meta pointer? */ + __be64 *start, *end; - if (!newsize) - lblock = 0; - else - lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift; + /* + * The start position of the hole is defined by lblock, start_list, and + * start_aligned. The end position of the hole is defined by lend, + * end_list, and end_aligned. + * + * start_aligned and end_aligned define down to which height the start + * and end positions are aligned to the metadata tree (i.e., the + * position is a multiple of the metadata granularity at the height + * above). This determines at which heights additional meta pointers + * needs to be preserved for the remaining data. + */ + + if (length) { + u64 maxsize = sdp->sd_heightsize[ip->i_height]; + u64 end_offset = offset + length; + u64 lend; + + /* + * Clip the end at the maximum file size for the given height: + * that's how far the metadata goes; files bigger than that + * will have additional layers of indirection. + */ + if (end_offset > maxsize) + end_offset = maxsize; + lend = end_offset >> bsize_shift; + + if (lblock >= lend) + return 0; + + find_metapath(sdp, lend, &mp, ip->i_height); + end_list = __end_list; + memcpy(end_list, mp.mp_list, sizeof(mp.mp_list)); + + for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { + if (end_list[mp_h]) + break; + } + end_aligned = mp_h; + } - memset(&mp, 0, sizeof(mp)); find_metapath(sdp, lblock, &mp, ip->i_height); + memcpy(start_list, mp.mp_list, sizeof(start_list)); - memcpy(&nbof, &mp.mp_list, sizeof(nbof)); + for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { + if (start_list[mp_h]) + break; + } + start_aligned = mp_h; ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) @@ -1336,7 +1414,17 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) mp.mp_bh[0] = dibh; ret = lookup_metapath(ip, &mp); - if (ret == ip->i_height) + if (ret) + goto out_metapath; + + /* issue read-ahead on metadata */ + for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) { + metapointer_range(&mp, mp_h, start_list, start_aligned, + end_list, end_aligned, &start, &end); + gfs2_metapath_ra(ip->i_gl, start, end); + } + + if (mp.mp_aheight == ip->i_height) state = DEALLOC_MP_FULL; /* We have a complete metapath */ else state = DEALLOC_FILL_MP; /* deal with partial metapath */ @@ -1357,20 +1445,6 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) /* Truncate a full metapath at the given strip height. * Note that strip_h == mp_h in order to be in this state. */ case DEALLOC_MP_FULL: - if (mp_h > 0) { /* issue read-ahead on metadata */ - __be64 *top; - - bh = mp.mp_bh[mp_h - 1]; - if (bh->b_blocknr != last_ra) { - last_ra = bh->b_blocknr; - top = metaptr1(mp_h - 1, &mp); - gfs2_metapath_ra(ip->i_gl, bh, top); - } - } - /* If we're truncating to a non-zero size and the mp is - at the beginning of file for the strip height, we - need to preserve the first metadata pointer. */ - preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h)); bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, @@ -1382,8 +1456,28 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) prev_bnr, ip->i_height, strip_h, mp_h); } prev_bnr = bh->b_blocknr; - ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal, - mp_h, preserve1); + + if (gfs2_metatype_check(sdp, bh, + (mp_h ? GFS2_METATYPE_IN : + GFS2_METATYPE_DI))) { + ret = -EIO; + goto out; + } + + /* + * Below, passing end_aligned as 0 gives us the + * metapointer range excluding the end point: the end + * point is the first metapath we must not deallocate! + */ + + metapointer_range(&mp, mp_h, start_list, start_aligned, + end_list, 0 /* end_aligned */, + &start, &end); + ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h], + start, end, + mp_h != ip->i_height - 1, + &btotal); + /* If we hit an error or just swept dinode buffer, just exit. */ if (ret || !mp_h) { @@ -1407,20 +1501,20 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) stripping the previous level of metadata. */ if (mp_h == 0) { strip_h--; - memcpy(&mp.mp_list, &nbof, sizeof(nbof)); + memcpy(mp.mp_list, start_list, sizeof(start_list)); mp_h = strip_h; state = DEALLOC_FILL_MP; break; } mp.mp_list[mp_h] = 0; mp_h--; /* search one metadata height down */ - if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1) - break; /* loop around in the same state */ mp.mp_list[mp_h]++; + if (walk_done(sdp, &mp, mp_h, end_list, end_aligned)) + break; /* Here we've found a part of the metapath that is not * allocated. We need to search at that height for the * next non-null pointer. */ - if (find_nonnull_ptr(sdp, &mp, mp_h)) { + if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) { state = DEALLOC_FILL_MP; mp_h++; } @@ -1435,18 +1529,29 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) if (ret < 0) goto out; + /* issue read-ahead on metadata */ + if (mp.mp_aheight > 1) { + for (; ret > 1; ret--) { + metapointer_range(&mp, mp.mp_aheight - ret, + start_list, start_aligned, + end_list, end_aligned, + &start, &end); + gfs2_metapath_ra(ip->i_gl, start, end); + } + } + /* If buffers found for the entire strip height */ - if ((ret == ip->i_height) && (mp_h == strip_h)) { + if (mp.mp_aheight - 1 == strip_h) { state = DEALLOC_MP_FULL; break; } - if (ret < ip->i_height) /* We have a partial height */ - mp_h = ret - 1; + if (mp.mp_aheight < ip->i_height) /* We have a partial height */ + mp_h = mp.mp_aheight - 1; /* If we find a non-null block pointer, crawl a bit higher up in the metapath and try again, otherwise we need to look lower for a new starting point. */ - if (find_nonnull_ptr(sdp, &mp, mp_h)) + if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) mp_h++; else state = DEALLOC_MP_LOWER; @@ -1524,7 +1629,6 @@ out: /** * do_shrink - make a file smaller * @inode: the inode - * @oldsize: the current inode size * @newsize: the size to make the file * * Called with an exclusive lock on @inode. The @size must @@ -1533,18 +1637,18 @@ out: * Returns: errno */ -static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) +static int do_shrink(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); int error; - error = trunc_start(inode, oldsize, newsize); + error = trunc_start(inode, newsize); if (error < 0) return error; if (gfs2_is_stuffed(ip)) return 0; - error = trunc_dealloc(ip, newsize); + error = punch_hole(ip, newsize, 0); if (error == 0) error = trunc_end(ip); @@ -1553,10 +1657,9 @@ static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) void gfs2_trim_blocks(struct inode *inode) { - u64 size = inode->i_size; int ret; - ret = do_shrink(inode, size, size); + ret = do_shrink(inode, inode->i_size); WARN_ON(ret != 0); } @@ -1589,8 +1692,7 @@ static int do_grow(struct inode *inode, u64 size) int error; int unstuff = 0; - if (gfs2_is_stuffed(ip) && - (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { + if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) { error = gfs2_quota_lock_check(ip, &ap); if (error) return error; @@ -1650,7 +1752,6 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); int ret; - u64 oldsize; BUG_ON(!S_ISREG(inode->i_mode)); @@ -1664,13 +1765,12 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) goto out; - oldsize = inode->i_size; - if (newsize >= oldsize) { + if (newsize >= inode->i_size) { ret = do_grow(inode, newsize); goto out; } - ret = do_shrink(inode, oldsize, newsize); + ret = do_shrink(inode, newsize); out: gfs2_rsqa_delete(ip, NULL); return ret; @@ -1679,7 +1779,7 @@ out: int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; - error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); + error = punch_hole(ip, i_size_read(&ip->i_inode), 0); if (!error) error = trunc_end(ip); return error; @@ -1687,7 +1787,7 @@ int gfs2_truncatei_resume(struct gfs2_inode *ip) int gfs2_file_dealloc(struct gfs2_inode *ip) { - return trunc_dealloc(ip, 0); + return punch_hole(ip, 0, 0); } /** @@ -1827,8 +1927,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, return 0; if (gfs2_is_stuffed(ip)) { - if (offset + len > - sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + if (offset + len > gfs2_max_stuffed_size(ip)) return 1; return 0; } @@ -1855,3 +1954,123 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, return 0; } +static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + + if (offset >= inode->i_size) + return 0; + if (offset + length > inode->i_size) + length = inode->i_size - offset; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + gfs2_trans_add_meta(ip->i_gl, dibh); + memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0, + length); + brelse(dibh); + return 0; +} + +static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset, + loff_t length) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; + int error; + + while (length) { + struct gfs2_trans *tr; + loff_t chunk; + unsigned int offs; + + chunk = length; + if (chunk > max_chunk) + chunk = max_chunk; + + offs = offset & ~PAGE_MASK; + if (offs && chunk > PAGE_SIZE) + chunk = offs + ((chunk - offs) & PAGE_MASK); + + truncate_pagecache_range(inode, offset, chunk); + offset += chunk; + length -= chunk; + + tr = current->journal_info; + if (!test_bit(TR_TOUCHED, &tr->tr_flags)) + continue; + + gfs2_trans_end(sdp); + error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); + if (error) + return error; + } + return 0; +} + +int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + int error; + + if (gfs2_is_jdata(ip)) + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, + GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_zero_range(inode, offset, length); + if (error) + goto out; + } else { + unsigned int start_off, end_off, blocksize; + + blocksize = i_blocksize(inode); + start_off = offset & (blocksize - 1); + end_off = (offset + length) & (blocksize - 1); + if (start_off) { + unsigned int len = length; + if (length > blocksize - start_off) + len = blocksize - start_off; + error = gfs2_block_zero_range(inode, offset, len); + if (error) + goto out; + if (start_off + length < blocksize) + end_off = 0; + } + if (end_off) { + error = gfs2_block_zero_range(inode, + offset + length - end_off, end_off); + if (error) + goto out; + } + } + + if (gfs2_is_jdata(ip)) { + BUG_ON(!current->journal_info); + gfs2_journaled_truncate_range(inode, offset, length); + } else + truncate_pagecache_range(inode, offset, offset + length - 1); + + file_update_time(file); + mark_inode_dirty(inode); + + if (current->journal_info) + gfs2_trans_end(sdp); + + if (!gfs2_is_stuffed(ip)) + error = punch_hole(ip, offset, length); + +out: + if (current->journal_info) + gfs2_trans_end(sdp); + return error; +} diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 443cc182cf18..c3402fe00653 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -61,5 +61,6 @@ extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len); extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); +extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 06a0d1947c77..7c21aea0266b 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -170,8 +170,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, if (!size) return 0; - if (gfs2_is_stuffed(ip) && - offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + if (gfs2_is_stuffed(ip) && offset + size <= gfs2_max_stuffed_size(ip)) return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, size); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 58705ef8643a..4f88e201b3f0 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -246,7 +246,9 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) } if ((flags ^ new_flags) & GFS2_DIF_JDATA) { if (new_flags & GFS2_DIF_JDATA) - gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + gfs2_log_flush(sdp, ip->i_gl, + GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_SET_FLAGS); error = filemap_fdatawrite(inode->i_mapping); if (error) goto out; @@ -924,7 +926,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le struct gfs2_holder gh; int ret; - if (mode & ~FALLOC_FL_KEEP_SIZE) + if (mode & ~(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* fallocate is needed by gfs2_grow to reserve space in the rindex */ if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex) @@ -948,13 +950,18 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if (ret) goto out_unlock; - ret = gfs2_rsqa_alloc(ip); - if (ret) - goto out_putw; + if (mode & FALLOC_FL_PUNCH_HOLE) { + ret = __gfs2_punch_hole(file, offset, len); + } else { + ret = gfs2_rsqa_alloc(ip); + if (ret) + goto out_putw; - ret = __gfs2_fallocate(file, mode, offset, len); - if (ret) - gfs2_rs_deltree(&ip->i_res); + ret = __gfs2_fallocate(file, mode, offset, len); + + if (ret) + gfs2_rs_deltree(&ip->i_res); + } out_putw: put_write_access(inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 11066d8647d2..82fb5583445c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1549,16 +1549,13 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) rhashtable_walk_enter(&gl_hash_table, &iter); do { - gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (IS_ERR(gl)) - goto walk_stop; + rhashtable_walk_start(&iter); while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) if (gl->gl_name.ln_sbd == sdp && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); -walk_stop: rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); @@ -1924,19 +1921,29 @@ void gfs2_glock_exit(void) destroy_workqueue(gfs2_delete_workqueue); } -static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) +static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) { - while ((gi->gl = rhashtable_walk_next(&gi->hti))) { - if (IS_ERR(gi->gl)) { - if (PTR_ERR(gi->gl) == -EAGAIN) - continue; - gi->gl = NULL; - return; + if (n == 0) + gi->gl = rhashtable_walk_peek(&gi->hti); + else { + gi->gl = rhashtable_walk_next(&gi->hti); + n--; + } + for (;;) { + if (IS_ERR_OR_NULL(gi->gl)) { + if (!gi->gl) + return; + if (PTR_ERR(gi->gl) != -EAGAIN) { + gi->gl = NULL; + return; + } + n = 0; + } else if (gi->sdp == gi->gl->gl_name.ln_sbd && + !__lockref_is_dead(&gi->gl->gl_lockref)) { + if (!n--) + break; } - /* Skip entries for other sb and dead entries */ - if (gi->sdp == gi->gl->gl_name.ln_sbd && - !__lockref_is_dead(&gi->gl->gl_lockref)) - return; + gi->gl = rhashtable_walk_next(&gi->hti); } } @@ -1944,18 +1951,24 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; - loff_t n = *pos; + loff_t n; - rhashtable_walk_enter(&gl_hash_table, &gi->hti); - if (rhashtable_walk_start(&gi->hti) != 0) - return NULL; + /* + * We can either stay where we are, skip to the next hash table + * entry, or start from the beginning. + */ + if (*pos < gi->last_pos) { + rhashtable_walk_exit(&gi->hti); + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + n = *pos + 1; + } else { + n = *pos - gi->last_pos; + } - do { - gfs2_glock_iter_next(gi); - } while (gi->gl && n--); + rhashtable_walk_start(&gi->hti); + gfs2_glock_iter_next(gi, n); gi->last_pos = *pos; - return gi->gl; } @@ -1966,8 +1979,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; - gfs2_glock_iter_next(gi); - + gfs2_glock_iter_next(gi, 1); return gi->gl; } @@ -1978,7 +1990,6 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) gi->gl = NULL; rhashtable_walk_stop(&gi->hti); - rhashtable_walk_exit(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -2044,7 +2055,13 @@ static int __gfs2_glocks_open(struct inode *inode, struct file *file, seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; + /* + * Initially, we are "before" the first hash table entry; the + * first call to rhashtable_walk_next gets us the first entry. + */ + gi->last_pos = -1; gi->gl = NULL; + rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } @@ -2060,6 +2077,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; + rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index cdd1c5f06f45..d8782a7a1e7d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -107,7 +107,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); - gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_AIL_EMPTY_GL); } void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) @@ -128,7 +129,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) return; __gfs2_ail_flush(gl, fsync, max_revokes); gfs2_trans_end(sdp); - gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_AIL_FLUSH); } /** @@ -157,7 +159,8 @@ static void rgrp_go_sync(struct gfs2_glock *gl) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(sdp, gl, NORMAL_FLUSH); + gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_RGRP_GO_SYNC); filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); mapping_set_error(mapping, error); @@ -252,7 +255,8 @@ static void inode_go_sync(struct gfs2_glock *gl) GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH); + gfs2_log_flush(gl->gl_name.ln_sbd, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_INODE_GO_SYNC); filemap_fdatawrite(metamapping); if (isreg) { struct address_space *mapping = ip->i_inode.i_mapping; @@ -303,7 +307,9 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) { - gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH); + gfs2_log_flush(gl->gl_name.ln_sbd, NULL, + GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_INODE_GO_INVAL); gl->gl_name.ln_sbd->sd_rindex_uptodate = 0; } if (ip && S_ISREG(ip->i_inode.i_mode)) @@ -495,7 +501,8 @@ static void freeze_go_sync(struct gfs2_glock *gl) gfs2_assert_withdraw(sdp, 0); } queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work); - gfs2_log_flush(sdp, NULL, FREEZE_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | + GFS2_LFC_FREEZE_GO_SYNC); } } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6e18e9793ec4..e0557b8a590a 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -44,7 +44,6 @@ struct gfs2_log_header_host { u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail; /* Block number of log tail */ u32 lh_blkno; - u32 lh_hash; }; /* @@ -861,5 +860,10 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); +static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip) +{ + return GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); +} + #endif /* __INCORE_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 4e971b1c7f92..59e0560180ec 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1152,12 +1152,11 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); if (error) - goto out_end_trans; + goto out_gunlock; error = gfs2_unlink_inode(dip, dentry); - -out_end_trans: gfs2_trans_end(sdp); + out_gunlock: gfs2_glock_dq(ghs + 2); out_rgrp: @@ -1184,11 +1183,10 @@ out_inodes: static int gfs2_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct gfs2_sbd *sdp = GFS2_SB(dir); unsigned int size; size = strlen(symname); - if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + if (size >= gfs2_max_stuffed_size(GFS2_I(dir))) return -ENAMETOOLONG; return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL); @@ -1205,8 +1203,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct gfs2_sbd *sdp = GFS2_SB(dir); - unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL); } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 65f33a0ac190..006c6164f759 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1091,7 +1091,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) spin_lock(&ls->ls_recover_spin); if (ls->ls_recover_size < jid + 1) { - fs_err(sdp, "recover_slot jid %d gen %u short size %d", + fs_err(sdp, "recover_slot jid %d gen %u short size %d\n", jid, ls->ls_recover_block, ls->ls_recover_size); spin_unlock(&ls->ls_recover_spin); return; @@ -1153,7 +1153,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, return; } if (ls->ls_recover_size < jid + 1) { - fs_err(sdp, "recovery_result jid %d short size %d", + fs_err(sdp, "recovery_result jid %d short size %d\n", jid, ls->ls_recover_size); spin_unlock(&ls->ls_recover_spin); return; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f72c44231406..cf6b46247df4 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -14,6 +14,7 @@ #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> +#include <linux/crc32c.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -538,9 +539,12 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); - list_move(&ip->i_ordered, &written); - if (ip->i_inode.i_mapping->nrpages == 0) + if (ip->i_inode.i_mapping->nrpages == 0) { + test_and_clear_bit(GIF_ORDERED, &ip->i_flags); + list_del(&ip->i_ordered); continue; + } + list_move(&ip->i_ordered, &written); spin_unlock(&sdp->sd_ordered_lock); filemap_fdatawrite(ip->i_inode.i_mapping); spin_lock(&sdp->sd_ordered_lock); @@ -648,49 +652,102 @@ out_of_blocks: } /** - * log_write_header - Get and initialize a journal header buffer + * write_log_header - Write a journal log header buffer at sd_log_flush_head * @sdp: The GFS2 superblock + * @jd: journal descriptor of the journal to which we are writing + * @seq: sequence number + * @tail: tail of the log + * @flags: log header flags GFS2_LOG_HEAD_* + * @op_flags: flags to pass to the bio * * Returns: the initialized log buffer descriptor */ -static void log_write_header(struct gfs2_sbd *sdp, u32 flags) +void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + u64 seq, u32 tail, u32 flags, int op_flags) { struct gfs2_log_header *lh; - unsigned int tail; - u32 hash; - int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; + u32 hash, crc; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); - enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct timespec64 tv; + struct super_block *sb = sdp->sd_vfs; + u64 addr; + lh = page_address(page); clear_page(lh); - gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); - - tail = current_tail(sdp); - lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); lh->lh_header.__pad0 = cpu_to_be64(0); lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); + lh->lh_sequence = cpu_to_be64(seq); lh->lh_flags = cpu_to_be32(flags); lh->lh_tail = cpu_to_be32(tail); lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); - hash = gfs2_disk_hash(page_address(page), sizeof(struct gfs2_log_header)); + hash = ~crc32(~0, lh, LH_V1_SIZE); lh->lh_hash = cpu_to_be32(hash); + tv = current_kernel_time64(); + lh->lh_nsec = cpu_to_be32(tv.tv_nsec); + lh->lh_sec = cpu_to_be64(tv.tv_sec); + addr = gfs2_log_bmap(sdp); + lh->lh_addr = cpu_to_be64(addr); + lh->lh_jinode = cpu_to_be64(GFS2_I(jd->jd_inode)->i_no_addr); + + /* We may only write local statfs, quota, etc., when writing to our + own journal. The values are left 0 when recovering a journal + different from our own. */ + if (!(flags & GFS2_LOG_HEAD_RECOVERY)) { + lh->lh_statfs_addr = + cpu_to_be64(GFS2_I(sdp->sd_sc_inode)->i_no_addr); + lh->lh_quota_addr = + cpu_to_be64(GFS2_I(sdp->sd_qc_inode)->i_no_addr); + + spin_lock(&sdp->sd_statfs_spin); + lh->lh_local_total = cpu_to_be64(l_sc->sc_total); + lh->lh_local_free = cpu_to_be64(l_sc->sc_free); + lh->lh_local_dinodes = cpu_to_be64(l_sc->sc_dinodes); + spin_unlock(&sdp->sd_statfs_spin); + } + + BUILD_BUG_ON(offsetof(struct gfs2_log_header, lh_crc) != LH_V1_SIZE); + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, + sb->s_blocksize - LH_V1_SIZE - 4); + lh->lh_crc = cpu_to_be32(crc); + + gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); + log_flush_wait(sdp); +} + +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * @flags: The log header flags, including log header origin + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags) +{ + unsigned int tail; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + + gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); + tail = current_tail(sdp); + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } - sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); - gfs2_log_write_page(sdp, page); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); - log_flush_wait(sdp); + gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail, + flags, op_flags); if (sdp->sd_log_tail != tail) log_pull_tail(sdp, tail); @@ -700,11 +757,11 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) * gfs2_log_flush - flush incore transaction(s) * @sdp: the filesystem * @gl: The glock structure to flush. If NULL, flush the whole incore log + * @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_* and debug flags * */ -void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, - enum gfs2_flush_type type) +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) { struct gfs2_trans *tr; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); @@ -716,9 +773,9 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, up_write(&sdp->sd_log_flush_lock); return; } - trace_gfs2_log_flush(sdp, 1); + trace_gfs2_log_flush(sdp, 1, flags); - if (type == SHUTDOWN_FLUSH) + if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); sdp->sd_log_flush_head = sdp->sd_log_head; @@ -743,11 +800,11 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); - log_write_header(sdp, 0); + log_write_header(sdp, flags); } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); - log_write_header(sdp, 0); + log_write_header(sdp, flags); } lops_after_commit(sdp, tr); @@ -764,7 +821,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); - if (type != NORMAL_FLUSH) { + if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) { if (!sdp->sd_log_idle) { for (;;) { gfs2_ail1_start(sdp); @@ -774,16 +831,17 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, } atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); - log_write_header(sdp, 0); + log_write_header(sdp, flags); sdp->sd_log_head = sdp->sd_log_flush_head; } - if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH) + if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN | + GFS2_LOG_HEAD_FLUSH_FREEZE)) gfs2_log_shutdown(sdp); - if (type == FREEZE_FLUSH) + if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE) atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } - trace_gfs2_log_flush(sdp, 0); + trace_gfs2_log_flush(sdp, 0, flags); up_write(&sdp->sd_log_flush_lock); kfree(tr); @@ -879,7 +937,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) sdp->sd_log_flush_head = sdp->sd_log_head; - log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT | GFS2_LFC_SHUTDOWN); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); @@ -935,7 +993,8 @@ int gfs2_logd(void *data) did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); - gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_LOGD_JFLUSH_REQD); did_flush = true; } @@ -943,7 +1002,8 @@ int gfs2_logd(void *data) gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp); - gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_LOGD_AIL_FLUSH_REQD); did_flush = true; } diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 9499a6049212..93b52ac1ca1f 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -65,14 +65,10 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -enum gfs2_flush_type { - NORMAL_FLUSH = 0, - SYNC_FLUSH, - SHUTDOWN_FLUSH, - FREEZE_FLUSH -}; +extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + u64 seq, u32 tail, u32 flags, int op_flags); extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, - enum gfs2_flush_type type); + u32 type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index c8ff7b7954f0..4d6567990baf 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -18,6 +18,7 @@ #include <linux/fs.h> #include <linux/list_sort.h> +#include "dir.h" #include "gfs2.h" #include "incore.h" #include "inode.h" @@ -138,7 +139,7 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp) sdp->sd_log_flush_head = 0; } -static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) +u64 gfs2_log_bmap(struct gfs2_sbd *sdp) { unsigned int lbn = sdp->sd_log_flush_head; struct gfs2_journal_extent *je; @@ -161,7 +162,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) * @bvec: The bio_vec * @error: The i/o status * - * This finds the relavent buffers and unlocks then and sets the + * This finds the relevant buffers and unlocks them and sets the * error flag according to the status of the i/o request. This is * used when the log is writing data which has an in-place version * that is pinned in the pagecache. @@ -306,23 +307,22 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) return gfs2_log_alloc_bio(sdp, blkno); } - /** * gfs2_log_write - write to log * @sdp: the filesystem * @page: the page to write * @size: the size of the data to write * @offset: the offset within the page + * @blkno: block number of the log entry * * Try and add the page segment to the current bio. If that fails, * submit the current bio to the device and create a new one, and * then add the page segment to that. */ -static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, - unsigned size, unsigned offset) +void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, + unsigned size, unsigned offset, u64 blkno) { - u64 blkno = gfs2_log_bmap(sdp); struct bio *bio; int ret; @@ -348,7 +348,8 @@ static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) { - gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh)); + gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), + gfs2_log_bmap(sdp)); } /** @@ -365,7 +366,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) { struct super_block *sb = sdp->sd_vfs; - gfs2_log_write(sdp, page, sb->s_blocksize, 0); + gfs2_log_write(sdp, page, sb->s_blocksize, 0, + gfs2_log_bmap(sdp)); } static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index e529f536c117..e4949394f054 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -26,6 +26,9 @@ extern const struct gfs2_log_operations gfs2_revoke_lops; extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; +extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); +extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, + unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0a89e6f7a314..2d55e2c3333c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -93,7 +93,7 @@ static int __init init_gfs2_fs(void) error = gfs2_glock_init(); if (error) - goto fail; + goto fail_glock; error = -ENOMEM; gfs2_glock_cachep = kmem_cache_create("gfs2_glock", @@ -101,7 +101,7 @@ static int __init init_gfs2_fs(void) 0, 0, gfs2_init_glock_once); if (!gfs2_glock_cachep) - goto fail; + goto fail_cachep1; gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", sizeof(struct gfs2_glock) + @@ -109,7 +109,7 @@ static int __init init_gfs2_fs(void) 0, 0, gfs2_init_gl_aspace_once); if (!gfs2_glock_aspace_cachep) - goto fail; + goto fail_cachep2; gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), @@ -118,107 +118,105 @@ static int __init init_gfs2_fs(void) SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) - goto fail; + goto fail_cachep3; gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", sizeof(struct gfs2_bufdata), 0, 0, NULL); if (!gfs2_bufdata_cachep) - goto fail; + goto fail_cachep4; gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", sizeof(struct gfs2_rgrpd), 0, 0, NULL); if (!gfs2_rgrpd_cachep) - goto fail; + goto fail_cachep5; gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", sizeof(struct gfs2_quota_data), 0, 0, NULL); if (!gfs2_quotad_cachep) - goto fail; + goto fail_cachep6; gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata", sizeof(struct gfs2_qadata), 0, 0, NULL); if (!gfs2_qadata_cachep) - goto fail; + goto fail_cachep7; error = register_shrinker(&gfs2_qd_shrinker); if (error) - goto fail; + goto fail_shrinker; error = register_filesystem(&gfs2_fs_type); if (error) - goto fail; + goto fail_fs1; error = register_filesystem(&gfs2meta_fs_type); if (error) - goto fail_unregister; + goto fail_fs2; error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); if (!gfs_recovery_wq) - goto fail_wq; + goto fail_wq1; gfs2_control_wq = alloc_workqueue("gfs2_control", WQ_UNBOUND | WQ_FREEZABLE, 0); if (!gfs2_control_wq) - goto fail_recovery; + goto fail_wq2; gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0); if (!gfs2_freeze_wq) - goto fail_control; + goto fail_wq3; gfs2_page_pool = mempool_create_page_pool(64, 0); if (!gfs2_page_pool) - goto fail_freeze; + goto fail_mempool; - gfs2_register_debugfs(); + error = gfs2_register_debugfs(); + if (error) + goto fail_debugfs; pr_info("GFS2 installed\n"); return 0; -fail_freeze: +fail_debugfs: + mempool_destroy(gfs2_page_pool); +fail_mempool: destroy_workqueue(gfs2_freeze_wq); -fail_control: +fail_wq3: destroy_workqueue(gfs2_control_wq); -fail_recovery: +fail_wq2: destroy_workqueue(gfs_recovery_wq); -fail_wq: +fail_wq1: unregister_filesystem(&gfs2meta_fs_type); -fail_unregister: +fail_fs2: unregister_filesystem(&gfs2_fs_type); -fail: - list_lru_destroy(&gfs2_qd_lru); -fail_lru: +fail_fs1: unregister_shrinker(&gfs2_qd_shrinker); +fail_shrinker: + kmem_cache_destroy(gfs2_qadata_cachep); +fail_cachep7: + kmem_cache_destroy(gfs2_quotad_cachep); +fail_cachep6: + kmem_cache_destroy(gfs2_rgrpd_cachep); +fail_cachep5: + kmem_cache_destroy(gfs2_bufdata_cachep); +fail_cachep4: + kmem_cache_destroy(gfs2_inode_cachep); +fail_cachep3: + kmem_cache_destroy(gfs2_glock_aspace_cachep); +fail_cachep2: + kmem_cache_destroy(gfs2_glock_cachep); +fail_cachep1: gfs2_glock_exit(); - - if (gfs2_qadata_cachep) - kmem_cache_destroy(gfs2_qadata_cachep); - - if (gfs2_quotad_cachep) - kmem_cache_destroy(gfs2_quotad_cachep); - - if (gfs2_rgrpd_cachep) - kmem_cache_destroy(gfs2_rgrpd_cachep); - - if (gfs2_bufdata_cachep) - kmem_cache_destroy(gfs2_bufdata_cachep); - - if (gfs2_inode_cachep) - kmem_cache_destroy(gfs2_inode_cachep); - - if (gfs2_glock_aspace_cachep) - kmem_cache_destroy(gfs2_glock_aspace_cachep); - - if (gfs2_glock_cachep) - kmem_cache_destroy(gfs2_glock_cachep); - +fail_glock: + list_lru_destroy(&gfs2_qd_lru); +fail_lru: gfs2_sys_uninit(); return error; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ad55eb86a250..e6a0a8a89ea7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1382,7 +1382,7 @@ static void gfs2_kill_sb(struct super_block *sb) return; } - gfs2_log_flush(sdp, NULL, SYNC_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SYNC | GFS2_LFC_KILL_SB); dput(sdp->sd_root_dir); dput(sdp->sd_master_dir); sdp->sd_root_dir = NULL; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e700fb162664..7a98abd340ee 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -955,7 +955,8 @@ out: gfs2_glock_dq_uninit(&ghs[qx]); inode_unlock(&ip->i_inode); kfree(ghs); - gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); + gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, + GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC); return error; } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 9395a3db1a60..b6b258998bcd 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,12 +14,14 @@ #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> +#include <linux/crc32c.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "glops.h" +#include "log.h" #include "lops.h" #include "meta_io.h" #include "recovery.h" @@ -117,22 +119,6 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) } } -static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf) -{ - const struct gfs2_log_header *str = buf; - - if (str->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || - str->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH)) - return 1; - - lh->lh_sequence = be64_to_cpu(str->lh_sequence); - lh->lh_flags = be32_to_cpu(str->lh_flags); - lh->lh_tail = be32_to_cpu(str->lh_tail); - lh->lh_blkno = be32_to_cpu(str->lh_blkno); - lh->lh_hash = be32_to_cpu(str->lh_hash); - return 0; -} - /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -150,29 +136,37 @@ static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf) static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { + struct gfs2_log_header *lh; struct buffer_head *bh; - struct gfs2_log_header_host uninitialized_var(lh); - const u32 nothing = 0; - u32 hash; + u32 hash, crc; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; + lh = (void *)bh->b_data; - hash = crc32_le((u32)~0, bh->b_data, sizeof(struct gfs2_log_header) - - sizeof(u32)); - hash = crc32_le(hash, (unsigned char const *)¬hing, sizeof(nothing)); - hash ^= (u32)~0; - error = gfs2_log_header_in(&lh, bh->b_data); - brelse(bh); + hash = crc32(~0, lh, LH_V1_SIZE - 4); + hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ - if (error || lh.lh_blkno != blk || lh.lh_hash != hash) - return 1; + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, + bh->b_size - LH_V1_SIZE - 4); - *head = lh; + error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || + be32_to_cpu(lh->lh_blkno) != blk || + be32_to_cpu(lh->lh_hash) != hash || + (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc); - return 0; + brelse(bh); + + if (!error) { + head->lh_sequence = be64_to_cpu(lh->lh_sequence); + head->lh_flags = be32_to_cpu(lh->lh_flags); + head->lh_tail = be32_to_cpu(lh->lh_tail); + head->lh_blkno = be32_to_cpu(lh->lh_blkno); + } + return error; } /** @@ -370,62 +364,22 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, /** * clean_journal - mark a dirty journal as being clean - * @sdp: the filesystem * @jd: the journal - * @gl: the journal's glock * @head: the head journal to start from * * Returns: errno */ -static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +static void clean_journal(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head) { - struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - unsigned int lblock; - struct gfs2_log_header *lh; - u32 hash; - struct buffer_head *bh; - int error; - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - - lblock = head->lh_blkno; - gfs2_replay_incr_blk(jd, &lblock); - bh_map.b_size = 1 << ip->i_inode.i_blkbits; - error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0); - if (error) - return error; - if (!bh_map.b_blocknr) { - gfs2_consist_inode(ip); - return -EIO; - } - - bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr); - lock_buffer(bh); - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - unlock_buffer(bh); - - lh = (struct gfs2_log_header *)bh->b_data; - memset(lh, 0, sizeof(struct gfs2_log_header)); - lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); - lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); - lh->lh_header.__pad0 = cpu_to_be64(0); - lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); - lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); - lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); - lh->lh_blkno = cpu_to_be32(lblock); - hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header)); - lh->lh_hash = cpu_to_be32(hash); - - set_buffer_dirty(bh); - if (sync_dirty_buffer(bh)) - gfs2_io_error_bh(sdp, bh); - brelse(bh); - return error; + sdp->sd_log_flush_head = head->lh_blkno; + gfs2_replay_incr_blk(jd, &sdp->sd_log_flush_head); + gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, + GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY, + REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC); } @@ -552,9 +506,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_thaw; } - error = clean_journal(jd, &head); - if (error) - goto fail_gunlock_thaw; + clean_journal(jd, &head); gfs2_glock_dq_uninit(&thaw_gh); t = DIV_ROUND_UP(jiffies - t, HZ); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 95b2a57ded33..8b683917a27e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -34,6 +34,7 @@ #include "log.h" #include "inode.h" #include "trace_gfs2.h" +#include "dir.h" #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) @@ -489,6 +490,13 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) * @blk: The data block number * @exact: True if this needs to be an exact match * + * The @exact argument should be set to true by most callers. The exception + * is when we need to match blocks which are not represented by the rgrp + * bitmap, but which are part of the rgrp (i.e. padding blocks) which are + * there for alignment purposes. Another way of looking at it is that @exact + * matches only valid data/metadata blocks, but with @exact false, it will + * match any block within the extent of the rgrp. + * * Returns: The resource group, or NULL if not found */ @@ -1040,17 +1048,30 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) rgd->rd_free = be32_to_cpu(str->rg_free); rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); + /* rd_data0, rd_data and rd_bitbytes already set from rindex */ } static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { + struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd); struct gfs2_rgrp *str = buf; + u32 crc; str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); str->rg_free = cpu_to_be32(rgd->rd_free); str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); - str->__pad = cpu_to_be32(0); + if (next == NULL) + str->rg_skip = 0; + else if (next->rd_addr > rgd->rd_addr) + str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr); str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); + str->rg_data0 = cpu_to_be64(rgd->rd_data0); + str->rg_data = cpu_to_be32(rgd->rd_data); + str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes); + str->rg_crc = 0; + crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp)); + str->rg_crc = cpu_to_be32(crc); + memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); } @@ -1318,7 +1339,7 @@ start_new_extent: fail: if (sdp->sd_args.ar_discard) - fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); + fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv); sdp->sd_args.ar_discard = 0; return -EIO; } @@ -2072,7 +2093,8 @@ next_rgrp: } /* Flushing the log may release space */ if (loops == 2) - gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_INPLACE_RESERVE); } return -ENOSPC; @@ -2453,12 +2475,12 @@ void gfs2_unlink_di(struct inode *inode) update_rgrp_lvb_unlinked(rgd, 1); } -static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_rgrpd *tmp_rgd; - tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE); + tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE); if (!tmp_rgd) return; gfs2_assert_withdraw(sdp, rgd == tmp_rgd); @@ -2474,12 +2496,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) update_rgrp_lvb_unlinked(rgd, -1); gfs2_statfs_change(sdp, 0, +1, -1); -} - - -void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) -{ - gfs2_free_uninit_di(rgd, ip->i_no_addr); trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); gfs2_meta_wipe(ip, ip->i_no_addr, 1); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d81d46e19726..620be0521866 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -757,7 +757,9 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip)); if (flush_all) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); + gfs2_log_flush(GFS2_SB(inode), ip->i_gl, + GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_WRITE_INODE); if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else @@ -766,6 +768,12 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) ret = filemap_fdatawait(metamapping); if (ret) mark_inode_dirty_sync(inode); + else { + spin_lock(&inode->i_lock); + if (!(inode->i_flags & I_DIRTY)) + gfs2_ordered_del_inode(ip); + spin_unlock(&inode->i_lock); + } return ret; } @@ -853,7 +861,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); - gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN | + GFS2_LFC_MAKE_FS_RO); wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); @@ -946,7 +955,8 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) gfs2_quota_sync(sb, -1); if (wait) - gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_SYNC_FS); return sdp->sd_log_error; } @@ -1650,7 +1660,8 @@ alloc_failed: goto out_unlock; out_truncate: - gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_EVICT_INODE); metamapping = gfs2_glock2aspace(ip->i_gl); if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { filemap_fdatawrite(metamapping); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9eb9d0a1abd9..c191fa58a1df 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -112,7 +112,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) } if (error) { - fs_warn(sdp, "freeze %d error %d", n, error); + fs_warn(sdp, "freeze %d error %d\n", n, error); return error; } @@ -679,7 +679,7 @@ fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_reg: free_percpu(sdp->sd_lkstats); - fs_err(sdp, "error %d adding sysfs files", error); + fs_err(sdp, "error %d adding sysfs files\n", error); if (sysfs_frees_sdp) kobject_put(&sdp->sd_kobj); else diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index f67a709589d3..b9318b49ff8f 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -353,26 +353,29 @@ TRACE_EVENT(gfs2_pin, /* Flushing the log */ TRACE_EVENT(gfs2_log_flush, - TP_PROTO(const struct gfs2_sbd *sdp, int start), + TP_PROTO(const struct gfs2_sbd *sdp, int start, u32 flags), - TP_ARGS(sdp, start), + TP_ARGS(sdp, start, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, start ) __field( u64, log_seq ) + __field( u32, flags ) ), TP_fast_assign( __entry->dev = sdp->sd_vfs->s_dev; __entry->start = start; __entry->log_seq = sdp->sd_log_sequence; + __entry->flags = flags; ), - TP_printk("%u,%u log flush %s %llu", + TP_printk("%u,%u log flush %s %llu %llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->start ? "start" : "end", - (unsigned long long)__entry->log_seq) + (unsigned long long)__entry->log_seq, + (unsigned long long)__entry->flags) ); /* Reserving/releasing blocks in the log */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ca8b72d0a831..c75cacaa349b 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -92,7 +92,6 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) s64 nbuf; int alloced = test_bit(TR_ALLOCED, &tr->tr_flags); - BUG_ON(!tr); current->journal_info = NULL; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { @@ -118,7 +117,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) - gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_TRANS_END); if (alloced) sb_end_intwrite(sdp->sd_vfs); } diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 24bc20fd42f7..7cc8b4acf66a 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -20,9 +20,6 @@ config HFSPLUS_FS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - It needs to understand that POSIX ACLs are treated only under Linux. POSIX ACLs doesn't mean something under Mac OS X. Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index e8120a282435..15e06fb552da 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -444,7 +444,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); + inode = hfsplus_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO); if (!inode) goto out; @@ -486,7 +486,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, mode); + inode = hfsplus_new_inode(dir->i_sb, dir, mode); if (!inode) goto out; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a015044daa05..d9255abafb81 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -478,7 +478,8 @@ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode); +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode); void hfsplus_delete_inode(struct inode *inode); void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 190c60efbc99..c0c8d433864f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -354,7 +354,8 @@ static const struct file_operations hfsplus_file_operations = { .unlocked_ioctl = hfsplus_ioctl, }; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); @@ -364,9 +365,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) return NULL; inode->i_ino = sbi->next_cnid++; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1d458b716957..513c357c734b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -549,7 +549,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); - sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + sbi->hidden_dir = hfsplus_new_inode(sb, root, S_IFDIR); if (!sbi->hidden_dir) { mutex_unlock(&sbi->vh_mutex); err = -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8a85f3f53446..8fe1b0aa2896 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -55,16 +55,6 @@ struct hugetlbfs_config { umode_t mode; }; -struct hugetlbfs_inode_info { - struct shared_policy policy; - struct inode vfs_inode; -}; - -static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) -{ - return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); -} - int sysctl_hugetlb_shm_group; enum { @@ -520,8 +510,16 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode_lock(inode); + + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + inode_unlock(inode); + return -EPERM; + } + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -539,6 +537,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; @@ -570,6 +569,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. If NUMA is configured, use page index @@ -660,6 +664,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); BUG_ON(!inode); @@ -668,9 +673,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - if (attr->ia_size & ~huge_page_mask(h)) + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; + + if (newsize & ~huge_page_mask(h)) return -EINVAL; - error = hugetlb_vmtruncate(inode, attr->ia_size); + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + error = hugetlb_vmtruncate(inode, newsize); if (error) return error; } @@ -722,6 +734,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -729,6 +743,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; + info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); diff --git a/fs/inode.c b/fs/inode.c index 03102d6ef044..ef362364d396 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,6 +18,7 @@ #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> +#include <linux/iversion.h> #include <trace/events/writeback.h> #include "internal.h" @@ -497,7 +498,6 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { - might_sleep(); /* * We have to cycle tree_lock here because reclaim can be still in the * process of removing the last page (in __delete_from_page_cache()) @@ -1634,17 +1634,21 @@ static int relatime_need_update(const struct path *path, struct inode *inode, int generic_update_time(struct inode *inode, struct timespec *time, int flags) { int iflags = I_DIRTY_TIME; + bool dirty = false; if (flags & S_ATIME) inode->i_atime = *time; if (flags & S_VERSION) - inode_inc_iversion(inode); + dirty = inode_maybe_inc_iversion(inode, false); if (flags & S_CTIME) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; + if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && + !(inode->i_sb->s_flags & SB_LAZYTIME)) + dirty = true; - if (!(inode->i_sb->s_flags & SB_LAZYTIME) || (flags & S_VERSION)) + if (dirty) iflags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, iflags); return 0; @@ -1863,7 +1867,7 @@ int file_update_time(struct file *file) if (!timespec_equal(&inode->i_ctime, &now)) sync_it |= S_CTIME; - if (IS_I_VERSION(inode)) + if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; if (!sync_it) diff --git a/fs/iomap.c b/fs/iomap.c index 47d29ccffaef..afd163586aa0 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -65,6 +65,8 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, return ret; if (WARN_ON(iomap.offset > pos)) return -EIO; + if (WARN_ON(iomap.length == 0)) + return -EIO; /* * Cut down the length to the one actually provided by the filesystem, @@ -753,7 +755,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) err = invalidate_inode_pages2_range(inode->i_mapping, offset >> PAGE_SHIFT, (offset + dio->size - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); + if (err) + dio_warn_stale_pagecache(iocb->ki_filp); } inode_dio_end(file_inode(iocb->ki_filp)); @@ -1018,9 +1021,16 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out_free_dio; + /* + * Try to invalidate cache pages for the range we're direct + * writing. If this invalidation fails, tough, the write will + * still work, but racing two incompatible write paths is a + * pretty crazy thing to do, so we don't support it 100%. + */ ret = invalidate_inode_pages2_range(mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); + if (ret) + dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 4055f51617ef..c125d662777c 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/checkpoint.c * @@ -5,10 +6,6 @@ * * Copyright 1999 Red Hat Software --- All Rights Reserved * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * * Checkpoint routines for the generic filesystem journaling code. * Part of the ext2fs journaling system. * diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3c1c31321d9b..8de0e7723316 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/commit.c * @@ -5,10 +6,6 @@ * * Copyright 1998 Red Hat corp --- All Rights Reserved * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * * Journal commit routines for the generic filesystem journaling code; * part of the ext2fs journaling system. */ diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 67546c7ad473..3fbf48ec2188 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/journal.c * @@ -5,10 +6,6 @@ * * Copyright 1998 Red Hat corp --- All Rights Reserved * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * * Generic filesystem journal-writing code; part of the ext2fs * journaling system. * diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 02dd3360cb20..f99910b69c78 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/recovery.c * @@ -5,10 +6,6 @@ * * Copyright 1999-2000 Red Hat Software --- All Rights Reserved * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * * Journal recovery routines for the generic filesystem journaling code; * part of the ext2fs journaling system. */ diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f9aefcda5854..696ef15ec942 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/revoke.c * @@ -5,10 +6,6 @@ * * Copyright 2000 Red Hat corp --- All Rights Reserved * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * * Journal revoke routines for the generic filesystem journaling code; * part of the ext2fs journaling system. * diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8b08044b3120..ac311037d7a5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/transaction.c * @@ -5,10 +6,6 @@ * * Copyright 1998 Red Hat corp --- All Rights Reserved * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * * Generic filesystem transaction handling code; part of the ext2fs * journaling system. * @@ -495,8 +492,10 @@ void jbd2_journal_free_reserved(handle_t *handle) EXPORT_SYMBOL(jbd2_journal_free_reserved); /** - * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle + * int jbd2_journal_start_reserved() - start reserved handle * @handle: handle to start + * @type: for handle statistics + * @line_no: for handle statistics * * Start handle that has been previously reserved with jbd2_journal_reserve(). * This attaches @handle to the running transaction (or creates one if there's @@ -626,6 +625,7 @@ error_out: * int jbd2_journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested + * @gfp_mask: memory allocation flags (for start_this_handle) * * Restart a handle for a multi-transaction filesystem * operation. diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig index d8bb6c411e96..ad850c5bf2ca 100644 --- a/fs/jffs2/Kconfig +++ b/fs/jffs2/Kconfig @@ -68,8 +68,7 @@ config JFFS2_FS_XATTR default n help Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). + the kernel or by users (see the attr(5) manual page for details). If unsure, say N. @@ -82,9 +81,6 @@ config JFFS2_FS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config JFFS2_FS_SECURITY diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d8c274d39ddb..eab04eca95a3 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -362,7 +362,6 @@ error_io: ret = -EIO; error: mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); iget_failed(inode); return ERR_PTR(ret); } diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 57cef19951db..851de78fdabb 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -16,9 +16,6 @@ config JFS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config JFS_SECURITY diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 90373aebfdca..1b9264fd54b6 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -965,9 +965,11 @@ static int __init init_jfs_fs(void) int rc; jfs_inode_cachep = - kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, - init_once); + kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + offsetof(struct jfs_inode_info, i_inline), + sizeof_field(struct jfs_inode_info, i_inline), + init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 9698e51656b1..c53d9cc5ae7a 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -832,7 +832,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ -static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) +static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 066ac313ae5c..a2c0dfc6fdc0 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -48,13 +48,13 @@ void nlmclnt_next_cookie(struct nlm_cookie *c) static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner) { - atomic_inc(&lockowner->count); + refcount_inc(&lockowner->count); return lockowner; } static void nlm_put_lockowner(struct nlm_lockowner *lockowner) { - if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) + if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) return; list_del(&lockowner->list); spin_unlock(&lockowner->host->h_lock); @@ -105,7 +105,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_ res = __nlm_find_lockowner(host, owner); if (res == NULL && new != NULL) { res = new; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); new->owner = owner; new->pid = __nlm_alloc_pid(host); new->host = nlm_get_host(host); @@ -204,7 +204,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) for(;;) { call = kzalloc(sizeof(*call), GFP_KERNEL); if (call != NULL) { - atomic_set(&call->a_count, 1); + refcount_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); call->a_host = nlm_get_host(host); @@ -222,7 +222,7 @@ void nlmclnt_release_call(struct nlm_rqst *call) { const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; - if (!atomic_dec_and_test(&call->a_count)) + if (!refcount_dec_and_test(&call->a_count)) return; if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); @@ -678,7 +678,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) goto out; } - atomic_inc(&req->a_count); + refcount_inc(&req->a_count); status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) @@ -769,7 +769,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl nlmclnt_setlockargs(req, fl); req->a_args.block = block; - atomic_inc(&req->a_count); + refcount_inc(&req->a_count); status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 826a89184f90..d35cd6be0675 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -114,7 +114,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, unsigned long now = jiffies; if (nsm != NULL) - atomic_inc(&nsm->sm_count); + refcount_inc(&nsm->sm_count); else { host = NULL; nsm = nsm_get_handle(ni->net, ni->sap, ni->salen, @@ -151,7 +151,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_state = 0; host->h_nsmstate = 0; host->h_pidcount = 0; - atomic_set(&host->h_count, 1); + refcount_set(&host->h_count, 1); mutex_init(&host->h_mutex); host->h_nextrebind = now + NLM_HOST_REBIND; host->h_expires = now + NLM_HOST_EXPIRE; @@ -290,7 +290,7 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (atomic_dec_and_test(&host->h_count)) { + if (refcount_dec_and_test(&host->h_count)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); @@ -388,6 +388,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, ln->nrhosts++; nrhosts++; + refcount_inc(&host->h_count); + dprintk("lockd: %s created host %s (%s)\n", __func__, host->h_name, host->h_addrbuf); @@ -410,7 +412,7 @@ void nlmsvc_release_host(struct nlm_host *host) dprintk("lockd: release server host %s\n", host->h_name); WARN_ON_ONCE(!host->h_server); - atomic_dec(&host->h_count); + refcount_dec(&host->h_count); } /* @@ -504,7 +506,7 @@ struct nlm_host * nlm_get_host(struct nlm_host *host) { if (host) { dprintk("lockd: get host %s\n", host->h_name); - atomic_inc(&host->h_count); + refcount_inc(&host->h_count); host->h_expires = jiffies + NLM_HOST_EXPIRE; } return host; @@ -593,7 +595,7 @@ static void nlm_complain_hosts(struct net *net) if (net && host->net != net) continue; dprintk(" %s (cnt %d use %d exp %ld net %x)\n", - host->h_name, atomic_read(&host->h_count), + host->h_name, refcount_read(&host->h_count), host->h_inuse, host->h_expires, host->net->ns.inum); } } @@ -662,16 +664,16 @@ nlm_gc_hosts(struct net *net) for_each_host_safe(host, next, chain, nlm_server_hosts) { if (net && host->net != net) continue; - if (atomic_read(&host->h_count) || host->h_inuse - || time_before(jiffies, host->h_expires)) { + if (host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " "(cnt %d use %d exp %ld net %x)\n", - host->h_name, atomic_read(&host->h_count), + host->h_name, refcount_read(&host->h_count), host->h_inuse, host->h_expires, host->net->ns.inum); continue; } - nlm_destroy_host_locked(host); + if (refcount_dec_if_one(&host->h_count)) + nlm_destroy_host_locked(host); } if (net) { diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 96cfb2967ac7..654594ef4f94 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -191,7 +191,7 @@ void nsm_unmonitor(const struct nlm_host *host) struct nsm_res res; int status; - if (atomic_read(&nsm->sm_count) == 1 + if (refcount_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); @@ -279,7 +279,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, if (unlikely(new == NULL)) return NULL; - atomic_set(&new->sm_count, 1); + refcount_set(&new->sm_count, 1); new->sm_name = (char *)(new + 1); memcpy(nsm_addr(new), sap, salen); new->sm_addrlen = salen; @@ -337,13 +337,13 @@ retry: cached = nsm_lookup_addr(&ln->nsm_handles, sap); if (cached != NULL) { - atomic_inc(&cached->sm_count); + refcount_inc(&cached->sm_count); spin_unlock(&nsm_lock); kfree(new); dprintk("lockd: found nsm_handle for %s (%s), " "cnt %d\n", cached->sm_name, cached->sm_addrbuf, - atomic_read(&cached->sm_count)); + refcount_read(&cached->sm_count)); return cached; } @@ -388,12 +388,12 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net, return cached; } - atomic_inc(&cached->sm_count); + refcount_inc(&cached->sm_count); spin_unlock(&nsm_lock); dprintk("lockd: host %s (%s) rebooted, cnt %d\n", cached->sm_name, cached->sm_addrbuf, - atomic_read(&cached->sm_count)); + refcount_read(&cached->sm_count)); return cached; } @@ -404,7 +404,7 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net, */ void nsm_release(struct nsm_handle *nsm) { - if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + if (refcount_dec_and_lock(&nsm->sm_count, &nsm_lock)) { list_del(&nsm->sm_link); spin_unlock(&nsm_lock); dprintk("lockd: destroyed nsm_handle for %s (%s)\n", diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0d670c5c378f..ea77c66d3cc3 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -295,7 +295,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) void nlmsvc_release_call(struct nlm_rqst *call) { - if (!atomic_dec_and_test(&call->a_count)) + if (!refcount_dec_and_test(&call->a_count)) return; nlmsvc_release_host(call->a_host); kfree(call); diff --git a/fs/mbcache.c b/fs/mbcache.c index b8b8b9ced9f8..bf41e2e72c18 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -94,6 +94,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, entry->e_key = key; entry->e_value = value; entry->e_reusable = reusable; + entry->e_referenced = 0; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { @@ -238,7 +239,9 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value) spin_lock(&cache->c_list_lock); if (!list_empty(&entry->e_list)) { list_del_init(&entry->e_list); - cache->c_entry_count--; + if (!WARN_ONCE(cache->c_entry_count == 0, + "mbcache: attempt to decrement c_entry_count past zero")) + cache->c_entry_count--; atomic_dec(&entry->e_refcnt); } spin_unlock(&cache->c_list_lock); @@ -269,9 +272,6 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); - /* Unlikely, but not impossible */ - if (unlikely(cache->c_entry_count < 0)) - return 0; return cache->c_entry_count; } diff --git a/fs/namei.c b/fs/namei.c index 9cc91fb7f156..921ae32dbc80 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -391,50 +391,6 @@ static inline int do_inode_permission(struct inode *inode, int mask) } /** - * __inode_permission - Check for access rights to a given inode - * @inode: Inode to check permission on - * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * - * Check for read/write/execute permissions on an inode. - * - * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. - * - * This does not check for a read-only file system. You probably want - * inode_permission(). - */ -int __inode_permission(struct inode *inode, int mask) -{ - int retval; - - if (unlikely(mask & MAY_WRITE)) { - /* - * Nobody gets write access to an immutable file. - */ - if (IS_IMMUTABLE(inode)) - return -EPERM; - - /* - * Updating mtime will likely cause i_uid and i_gid to be - * written back improperly if their true value is unknown - * to the vfs. - */ - if (HAS_UNMAPPED_ID(inode)) - return -EACCES; - } - - retval = do_inode_permission(inode, mask); - if (retval) - return retval; - - retval = devcgroup_inode_permission(inode, mask); - if (retval) - return retval; - - return security_inode_permission(inode, mask); -} -EXPORT_SYMBOL(__inode_permission); - -/** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on @@ -472,7 +428,32 @@ int inode_permission(struct inode *inode, int mask) retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; - return __inode_permission(inode, mask); + + if (unlikely(mask & MAY_WRITE)) { + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EPERM; + + /* + * Updating mtime will likely cause i_uid and i_gid to be + * written back improperly if their true value is unknown + * to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EACCES; + } + + retval = do_inode_permission(inode, mask); + if (retval) + return retval; + + retval = devcgroup_inode_permission(inode, mask); + if (retval) + return retval; + + return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); @@ -1133,9 +1114,6 @@ static int follow_automount(struct path *path, struct nameidata *nd, path->dentry->d_inode) return -EISDIR; - if (path->dentry->d_sb->s_user_ns != &init_user_ns) - return -EACCES; - nd->total_link_count++; if (nd->total_link_count >= 40) return -ELOOP; @@ -2898,6 +2876,27 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } EXPORT_SYMBOL(vfs_create); +int vfs_mkobj(struct dentry *dentry, umode_t mode, + int (*f)(struct dentry *, umode_t, void *), + void *arg) +{ + struct inode *dir = dentry->d_parent->d_inode; + int error = may_create(dir, dentry); + if (error) + return error; + + mode &= S_IALLUGO; + mode |= S_IFREG; + error = security_inode_create(dir, dentry, mode); + if (error) + return error; + error = f(dentry, mode, arg); + if (!error) + fsnotify_create(dir, dentry); + return error; +} +EXPORT_SYMBOL(vfs_mkobj); + bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig deleted file mode 100644 index c931cf22a1f6..000000000000 --- a/fs/ncpfs/Kconfig +++ /dev/null @@ -1,108 +0,0 @@ -# -# NCP Filesystem configuration -# -config NCP_FS - tristate "NCP file system support (to mount NetWare volumes)" - depends on IPX!=n || INET - help - NCP (NetWare Core Protocol) is a protocol that runs over IPX and is - used by Novell NetWare clients to talk to file servers. It is to - IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you - to mount NetWare file server volumes and to access them just like - any other Unix directory. For details, please read the file - <file:Documentation/filesystems/ncpfs.txt> in the kernel source and - the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>. - - You do not have to say Y here if you want your Linux box to act as a - file *server* for Novell NetWare clients. - - General information about how to connect Linux, Windows machines and - Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. - - To compile this as a module, choose M here: the module will be called - ncpfs. Say N unless you are connected to a Novell network. - -config NCPFS_PACKET_SIGNING - bool "Packet signatures" - depends on NCP_FS - help - NCP allows packets to be signed for stronger security. If you want - security, say Y. Normal users can leave it off. To be able to use - packet signing you must use ncpfs > 2.0.12. - -config NCPFS_IOCTL_LOCKING - bool "Proprietary file locking" - depends on NCP_FS - help - Allows locking of records on remote volumes. Say N unless you have - special applications which are able to utilize this locking scheme. - -config NCPFS_STRONG - bool "Clear remove/delete inhibit when needed" - depends on NCP_FS - help - Allows manipulation of files flagged as Delete or Rename Inhibit. - To use this feature you must mount volumes with the ncpmount - parameter "-s" (ncpfs-2.0.12 and newer). Say Y unless you are not - mounting volumes with -f 444. - -config NCPFS_NFS_NS - bool "Use NFS namespace if available" - depends on NCP_FS - help - Allows you to utilize NFS namespace on NetWare servers. It brings - you case sensitive filenames. Say Y. You can disable it at - mount-time with the `-N nfs' parameter of ncpmount. - -config NCPFS_OS2_NS - bool "Use LONG (OS/2) namespace if available" - depends on NCP_FS - help - Allows you to utilize OS2/LONG namespace on NetWare servers. - Filenames in this namespace are limited to 255 characters, they are - case insensitive, and case in names is preserved. Say Y. You can - disable it at mount time with the -N os2 parameter of ncpmount. - -config NCPFS_SMALLDOS - bool "Lowercase DOS filenames" - depends on NCP_FS - ---help--- - If you say Y here, every filename on a NetWare server volume using - the OS2/LONG namespace and created under DOS or on a volume using - DOS namespace will be converted to lowercase characters. - Saying N here will give you these filenames in uppercase. - - This is only a cosmetic option since the OS2/LONG namespace is case - insensitive. The only major reason for this option is backward - compatibility when moving from DOS to OS2/LONG namespace support. - Long filenames (created by Win95) will not be affected. - - This option does not solve the problem that filenames appear - differently under Linux and under Windows, since Windows does an - additional conversions on the client side. You can achieve similar - effects by saying Y to "Allow using of Native Language Support" - below. - -config NCPFS_NLS - bool "Use Native Language Support" - depends on NCP_FS - select NLS - help - Allows you to use codepages and I/O charsets for file name - translation between the server file system and input/output. This - may be useful, if you want to access the server with other operating - systems, e.g. Windows 95. See also NLS for more Information. - - To select codepages and I/O charsets use ncpfs-2.2.0.13 or newer. - -config NCPFS_EXTRAS - bool "Enable symbolic links and execute flags" - depends on NCP_FS - help - This enables the use of symbolic links and an execute permission - bit on NCPFS. The file server need not have long name space or NFS - name space loaded for these to work. - - To use the new attributes, it is recommended to use the flags - '-f 600 -d 755' on the ncpmount command line. - diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile deleted file mode 100644 index 66fe5f878817..000000000000 --- a/fs/ncpfs/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the linux ncp filesystem routines. -# - -obj-$(CONFIG_NCP_FS) += ncpfs.o - -ncpfs-y := dir.o file.o inode.o ioctl.o mmap.o ncplib_kernel.o sock.o \ - ncpsign_kernel.o getopt.o - -ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o -ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o - -# If you want debugging output, please uncomment the following line -# ccflags-y := -DDEBUG_NCP=1 - -CFLAGS_ncplib_kernel.o := -finline-functions diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c deleted file mode 100644 index 0c57c5c5d40a..000000000000 --- a/fs/ncpfs/dir.c +++ /dev/null @@ -1,1232 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * dir.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Modified for big endian by J.F. Chadima and David S. Miller - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * Modified 1998, 1999 Wolfram Pienkoss for NLS - * Modified 1999 Wolfram Pienkoss for directory caching - * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info - * - */ - - -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/stat.h> -#include <linux/kernel.h> -#include <linux/vmalloc.h> -#include <linux/mm.h> -#include <linux/namei.h> -#include <linux/uaccess.h> -#include <asm/byteorder.h> - -#include "ncp_fs.h" - -static void ncp_read_volume_list(struct file *, struct dir_context *, - struct ncp_cache_control *); -static void ncp_do_readdir(struct file *, struct dir_context *, - struct ncp_cache_control *); - -static int ncp_readdir(struct file *, struct dir_context *); - -static int ncp_create(struct inode *, struct dentry *, umode_t, bool); -static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); -static int ncp_unlink(struct inode *, struct dentry *); -static int ncp_mkdir(struct inode *, struct dentry *, umode_t); -static int ncp_rmdir(struct inode *, struct dentry *); -static int ncp_rename(struct inode *, struct dentry *, - struct inode *, struct dentry *, unsigned int); -static int ncp_mknod(struct inode * dir, struct dentry *dentry, - umode_t mode, dev_t rdev); -#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) -extern int ncp_symlink(struct inode *, struct dentry *, const char *); -#else -#define ncp_symlink NULL -#endif - -const struct file_operations ncp_dir_operations = -{ - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate = ncp_readdir, - .unlocked_ioctl = ncp_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ncp_compat_ioctl, -#endif -}; - -const struct inode_operations ncp_dir_inode_operations = -{ - .create = ncp_create, - .lookup = ncp_lookup, - .unlink = ncp_unlink, - .symlink = ncp_symlink, - .mkdir = ncp_mkdir, - .rmdir = ncp_rmdir, - .mknod = ncp_mknod, - .rename = ncp_rename, - .setattr = ncp_notify_change, -}; - -/* - * Dentry operations routines - */ -static int ncp_lookup_validate(struct dentry *, unsigned int); -static int ncp_hash_dentry(const struct dentry *, struct qstr *); -static int ncp_compare_dentry(const struct dentry *, - unsigned int, const char *, const struct qstr *); -static int ncp_delete_dentry(const struct dentry *); -static void ncp_d_prune(struct dentry *dentry); - -const struct dentry_operations ncp_dentry_operations = -{ - .d_revalidate = ncp_lookup_validate, - .d_hash = ncp_hash_dentry, - .d_compare = ncp_compare_dentry, - .d_delete = ncp_delete_dentry, - .d_prune = ncp_d_prune, -}; - -#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) - -static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) -{ -#ifdef CONFIG_NCPFS_SMALLDOS - int ns = ncp_namespace(i); - - if ((ns == NW_NS_DOS) -#ifdef CONFIG_NCPFS_OS2_NS - || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS)) -#endif /* CONFIG_NCPFS_OS2_NS */ - ) - return 0; -#endif /* CONFIG_NCPFS_SMALLDOS */ - return 1; -} - -#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) - -static inline int ncp_case_sensitive(const struct inode *i) -{ -#ifdef CONFIG_NCPFS_NFS_NS - return ncp_namespace(i) == NW_NS_NFS; -#else - return 0; -#endif /* CONFIG_NCPFS_NFS_NS */ -} - -/* - * Note: leave the hash unchanged if the directory - * is case-sensitive. - */ -static int -ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) -{ - struct inode *inode = d_inode_rcu(dentry); - - if (!inode) - return 0; - - if (!ncp_case_sensitive(inode)) { - struct nls_table *t; - unsigned long hash; - int i; - - t = NCP_IO_TABLE(dentry->d_sb); - hash = init_name_hash(dentry); - for (i=0; i<this->len ; i++) - hash = partial_name_hash(ncp_tolower(t, this->name[i]), - hash); - this->hash = end_name_hash(hash); - } - return 0; -} - -static int -ncp_compare_dentry(const struct dentry *dentry, - unsigned int len, const char *str, const struct qstr *name) -{ - struct inode *pinode; - - if (len != name->len) - return 1; - - pinode = d_inode_rcu(dentry->d_parent); - if (!pinode) - return 1; - - if (ncp_case_sensitive(pinode)) - return strncmp(str, name->name, len); - - return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len); -} - -/* - * This is the callback from dput() when d_count is going to 0. - * We use this to unhash dentries with bad inodes. - * Closing files can be safely postponed until iput() - it's done there anyway. - */ -static int -ncp_delete_dentry(const struct dentry * dentry) -{ - struct inode *inode = d_inode(dentry); - - if (inode) { - if (is_bad_inode(inode)) - return 1; - } else - { - /* N.B. Unhash negative dentries? */ - } - return 0; -} - -static inline int -ncp_single_volume(struct ncp_server *server) -{ - return (server->m.mounted_vol[0] != '\0'); -} - -static inline int ncp_is_server_root(struct inode *inode) -{ - return !ncp_single_volume(NCP_SERVER(inode)) && - is_root_inode(inode); -} - - -/* - * This is the callback when the dcache has a lookup hit. - */ - - -#ifdef CONFIG_NCPFS_STRONG -/* try to delete a readonly file (NW R bit set) */ - -static int -ncp_force_unlink(struct inode *dir, struct dentry* dentry) -{ - int res=0x9c,res2; - struct nw_modify_dos_info info; - __le32 old_nwattr; - struct inode *inode; - - memset(&info, 0, sizeof(info)); - - /* remove the Read-Only flag on the NW server */ - inode = d_inode(dentry); - - old_nwattr = NCP_FINFO(inode)->nwattr; - info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT); - res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); - if (res2) - goto leave_me; - - /* now try again the delete operation */ - res = ncp_del_file_or_subdir2(NCP_SERVER(dir), dentry); - - if (res) /* delete failed, set R bit again */ - { - info.attributes = old_nwattr; - res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); - if (res2) - goto leave_me; - } -leave_me: - return(res); -} -#endif /* CONFIG_NCPFS_STRONG */ - -#ifdef CONFIG_NCPFS_STRONG -static int -ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_name, - struct inode *new_dir, struct dentry* new_dentry, char *_new_name) -{ - struct nw_modify_dos_info info; - int res=0x90,res2; - struct inode *old_inode = d_inode(old_dentry); - __le32 old_nwattr = NCP_FINFO(old_inode)->nwattr; - __le32 new_nwattr = 0; /* shut compiler warning */ - int old_nwattr_changed = 0; - int new_nwattr_changed = 0; - - memset(&info, 0, sizeof(info)); - - /* remove the Read-Only flag on the NW server */ - - info.attributes = old_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); - res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); - if (!res2) - old_nwattr_changed = 1; - if (new_dentry && d_really_is_positive(new_dentry)) { - new_nwattr = NCP_FINFO(d_inode(new_dentry))->nwattr; - info.attributes = new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); - res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); - if (!res2) - new_nwattr_changed = 1; - } - /* now try again the rename operation */ - /* but only if something really happened */ - if (new_nwattr_changed || old_nwattr_changed) { - res = ncp_ren_or_mov_file_or_subdir(NCP_SERVER(old_dir), - old_dir, _old_name, - new_dir, _new_name); - } - if (res) - goto leave_me; - /* file was successfully renamed, so: - do not set attributes on old file - it no longer exists - copy attributes from old file to new */ - new_nwattr_changed = old_nwattr_changed; - new_nwattr = old_nwattr; - old_nwattr_changed = 0; - -leave_me:; - if (old_nwattr_changed) { - info.attributes = old_nwattr; - res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); - /* ignore errors */ - } - if (new_nwattr_changed) { - info.attributes = new_nwattr; - res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); - /* ignore errors */ - } - return(res); -} -#endif /* CONFIG_NCPFS_STRONG */ - - -static int -ncp_lookup_validate(struct dentry *dentry, unsigned int flags) -{ - struct ncp_server *server; - struct dentry *parent; - struct inode *dir; - struct ncp_entry_info finfo; - int res, val = 0, len; - __u8 __name[NCP_MAXPATHLEN + 1]; - - if (dentry == dentry->d_sb->s_root) - return 1; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - parent = dget_parent(dentry); - dir = d_inode(parent); - - if (d_really_is_negative(dentry)) - goto finished; - - server = NCP_SERVER(dir); - - /* - * Inspired by smbfs: - * The default validation is based on dentry age: - * We set the max age at mount time. (But each - * successful server lookup renews the timestamp.) - */ - val = NCP_TEST_AGE(server, dentry); - if (val) - goto finished; - - ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n", - dentry, NCP_GET_AGE(dentry)); - - len = sizeof(__name); - if (ncp_is_server_root(dir)) { - res = ncp_io2vol(server, __name, &len, dentry->d_name.name, - dentry->d_name.len, 1); - if (!res) { - res = ncp_lookup_volume(server, __name, &(finfo.i)); - if (!res) - ncp_update_known_namespace(server, finfo.i.volNumber, NULL); - } - } else { - res = ncp_io2vol(server, __name, &len, dentry->d_name.name, - dentry->d_name.len, !ncp_preserve_case(dir)); - if (!res) - res = ncp_obtain_info(server, dir, __name, &(finfo.i)); - } - finfo.volume = finfo.i.volNumber; - ncp_dbg(2, "looked for %pd/%s, res=%d\n", - dentry->d_parent, __name, res); - /* - * If we didn't find it, or if it has a different dirEntNum to - * what we remember, it's not valid any more. - */ - if (!res) { - struct inode *inode = d_inode(dentry); - - inode_lock(inode); - if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { - ncp_new_dentry(dentry); - val=1; - } else - ncp_dbg(2, "found, but dirEntNum changed\n"); - - ncp_update_inode2(inode, &finfo); - inode_unlock(inode); - } - -finished: - ncp_dbg(2, "result=%d\n", val); - dput(parent); - return val; -} - -static time_t ncp_obtain_mtime(struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct ncp_server *server = NCP_SERVER(inode); - struct nw_info_struct i; - - if (!ncp_conn_valid(server) || ncp_is_server_root(inode)) - return 0; - - if (ncp_obtain_info(server, inode, NULL, &i)) - return 0; - - return ncp_date_dos2unix(i.modifyTime, i.modifyDate); -} - -static inline void -ncp_invalidate_dircache_entries(struct dentry *parent) -{ - struct ncp_server *server = NCP_SERVER(d_inode(parent)); - struct dentry *dentry; - - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - dentry->d_fsdata = NULL; - ncp_age_dentry(server, dentry); - } - spin_unlock(&parent->d_lock); -} - -static int ncp_readdir(struct file *file, struct dir_context *ctx) -{ - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = d_inode(dentry); - struct page *page = NULL; - struct ncp_server *server = NCP_SERVER(inode); - union ncp_dir_cache *cache = NULL; - struct ncp_cache_control ctl; - int result, mtime_valid = 0; - time_t mtime = 0; - - ctl.page = NULL; - ctl.cache = NULL; - - ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos); - - result = -EIO; - /* Do not generate '.' and '..' when server is dead. */ - if (!ncp_conn_valid(server)) - goto out; - - result = 0; - if (!dir_emit_dots(file, ctx)) - goto out; - - page = grab_cache_page(&inode->i_data, 0); - if (!page) - goto read_really; - - ctl.cache = cache = kmap(page); - ctl.head = cache->head; - - if (!PageUptodate(page) || !ctl.head.eof) - goto init_cache; - - if (ctx->pos == 2) { - if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) - goto init_cache; - - mtime = ncp_obtain_mtime(dentry); - mtime_valid = 1; - if ((!mtime) || (mtime != ctl.head.mtime)) - goto init_cache; - } - - if (ctx->pos > ctl.head.end) - goto finished; - - ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2); - ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; - ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; - - for (;;) { - if (ctl.ofs != 0) { - ctl.page = find_lock_page(&inode->i_data, ctl.ofs); - if (!ctl.page) - goto invalid_cache; - ctl.cache = kmap(ctl.page); - if (!PageUptodate(ctl.page)) - goto invalid_cache; - } - while (ctl.idx < NCP_DIRCACHE_SIZE) { - struct dentry *dent; - bool over; - - spin_lock(&dentry->d_lock); - if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) { - spin_unlock(&dentry->d_lock); - goto invalid_cache; - } - dent = ctl.cache->dentry[ctl.idx]; - if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) { - spin_unlock(&dentry->d_lock); - goto invalid_cache; - } - spin_unlock(&dentry->d_lock); - if (d_really_is_negative(dent)) { - dput(dent); - goto invalid_cache; - } - over = !dir_emit(ctx, dent->d_name.name, - dent->d_name.len, - d_inode(dent)->i_ino, DT_UNKNOWN); - dput(dent); - if (over) - goto finished; - ctx->pos += 1; - ctl.idx += 1; - if (ctx->pos > ctl.head.end) - goto finished; - } - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - put_page(ctl.page); - ctl.page = NULL; - } - ctl.idx = 0; - ctl.ofs += 1; - } -invalid_cache: - if (ctl.page) { - kunmap(ctl.page); - unlock_page(ctl.page); - put_page(ctl.page); - ctl.page = NULL; - } - ctl.cache = cache; -init_cache: - ncp_invalidate_dircache_entries(dentry); - if (!mtime_valid) { - mtime = ncp_obtain_mtime(dentry); - mtime_valid = 1; - } - ctl.head.mtime = mtime; - ctl.head.time = jiffies; - ctl.head.eof = 0; - ctl.fpos = 2; - ctl.ofs = 0; - ctl.idx = NCP_DIRCACHE_START; - ctl.filled = 0; - ctl.valid = 1; -read_really: - spin_lock(&dentry->d_lock); - NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE; - spin_unlock(&dentry->d_lock); - if (ncp_is_server_root(inode)) { - ncp_read_volume_list(file, ctx, &ctl); - } else { - ncp_do_readdir(file, ctx, &ctl); - } - ctl.head.end = ctl.fpos - 1; - ctl.head.eof = ctl.valid; -finished: - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - put_page(ctl.page); - } - if (page) { - cache->head = ctl.head; - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - put_page(page); - } -out: - return result; -} - -static void ncp_d_prune(struct dentry *dentry) -{ - if (!dentry->d_fsdata) /* not referenced from page cache */ - return; - NCP_FINFO(d_inode(dentry->d_parent))->flags &= ~NCPI_DIR_CACHE; -} - -static int -ncp_fill_cache(struct file *file, struct dir_context *ctx, - struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, - int inval_childs) -{ - struct dentry *newdent, *dentry = file->f_path.dentry; - struct inode *dir = d_inode(dentry); - struct ncp_cache_control ctl = *ctrl; - struct qstr qname; - int valid = 0; - int hashed = 0; - ino_t ino = 0; - __u8 __name[NCP_MAXPATHLEN + 1]; - - qname.len = sizeof(__name); - if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len, - entry->i.entryName, entry->i.nameLen, - !ncp_preserve_entry_case(dir, entry->i.NSCreator))) - return 1; /* I'm not sure */ - - qname.name = __name; - - newdent = d_hash_and_lookup(dentry, &qname); - if (IS_ERR(newdent)) - goto end_advance; - if (!newdent) { - newdent = d_alloc(dentry, &qname); - if (!newdent) - goto end_advance; - } else { - hashed = 1; - - /* If case sensitivity changed for this volume, all entries below this one - should be thrown away. This entry itself is not affected, as its case - sensitivity is controlled by its own parent. */ - if (inval_childs) - shrink_dcache_parent(newdent); - - /* - * NetWare's OS2 namespace is case preserving yet case - * insensitive. So we update dentry's name as received from - * server. Parent dir's i_mutex is locked because we're in - * readdir. - */ - dentry_update_name_case(newdent, &qname); - } - - if (d_really_is_negative(newdent)) { - struct inode *inode; - - entry->opened = 0; - entry->ino = iunique(dir->i_sb, 2); - inode = ncp_iget(dir->i_sb, entry); - if (inode) { - d_instantiate(newdent, inode); - if (!hashed) - d_rehash(newdent); - } else { - spin_lock(&dentry->d_lock); - NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE; - spin_unlock(&dentry->d_lock); - } - } else { - struct inode *inode = d_inode(newdent); - - inode_lock_nested(inode, I_MUTEX_CHILD); - ncp_update_inode2(inode, entry); - inode_unlock(inode); - } - - if (ctl.idx >= NCP_DIRCACHE_SIZE) { - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - put_page(ctl.page); - } - ctl.cache = NULL; - ctl.idx -= NCP_DIRCACHE_SIZE; - ctl.ofs += 1; - ctl.page = grab_cache_page(&dir->i_data, ctl.ofs); - if (ctl.page) - ctl.cache = kmap(ctl.page); - } - if (ctl.cache) { - if (d_really_is_positive(newdent)) { - newdent->d_fsdata = newdent; - ctl.cache->dentry[ctl.idx] = newdent; - ino = d_inode(newdent)->i_ino; - ncp_new_dentry(newdent); - } - valid = 1; - } - dput(newdent); -end_advance: - if (!valid) - ctl.valid = 0; - if (!ctl.filled && (ctl.fpos == ctx->pos)) { - if (!ino) - ino = iunique(dir->i_sb, 2); - ctl.filled = !dir_emit(ctx, qname.name, qname.len, - ino, DT_UNKNOWN); - if (!ctl.filled) - ctx->pos += 1; - } - ctl.fpos += 1; - ctl.idx += 1; - *ctrl = ctl; - return (ctl.valid || !ctl.filled); -} - -static void -ncp_read_volume_list(struct file *file, struct dir_context *ctx, - struct ncp_cache_control *ctl) -{ - struct inode *inode = file_inode(file); - struct ncp_server *server = NCP_SERVER(inode); - struct ncp_volume_info info; - struct ncp_entry_info entry; - int i; - - ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos); - - for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { - int inval_dentry; - - if (ncp_get_volume_info_with_number(server, i, &info) != 0) - return; - if (!strlen(info.volume_name)) - continue; - - ncp_dbg(1, "found vol: %s\n", info.volume_name); - - if (ncp_lookup_volume(server, info.volume_name, - &entry.i)) { - ncp_dbg(1, "could not lookup vol %s\n", - info.volume_name); - continue; - } - inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); - entry.volume = entry.i.volNumber; - if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry)) - return; - } -} - -static void -ncp_do_readdir(struct file *file, struct dir_context *ctx, - struct ncp_cache_control *ctl) -{ - struct inode *dir = file_inode(file); - struct ncp_server *server = NCP_SERVER(dir); - struct nw_search_sequence seq; - struct ncp_entry_info entry; - int err; - void* buf; - int more; - size_t bufsize; - - ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos); - ncp_vdbg("init %pD, volnum=%d, dirent=%u\n", - file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); - - err = ncp_initialize_search(server, dir, &seq); - if (err) { - ncp_dbg(1, "init failed, err=%d\n", err); - return; - } - /* We MUST NOT use server->buffer_size handshaked with server if we are - using UDP, as for UDP server uses max. buffer size determined by - MTU, and for TCP server uses hardwired value 65KB (== 66560 bytes). - So we use 128KB, just to be sure, as there is no way how to know - this value in advance. */ - bufsize = 131072; - buf = vmalloc(bufsize); - if (!buf) - return; - do { - int cnt; - char* rpl; - size_t rpls; - - err = ncp_search_for_fileset(server, &seq, &more, &cnt, buf, bufsize, &rpl, &rpls); - if (err) /* Error */ - break; - if (!cnt) /* prevent endless loop */ - break; - while (cnt--) { - size_t onerpl; - - if (rpls < offsetof(struct nw_info_struct, entryName)) - break; /* short packet */ - ncp_extract_file_info(rpl, &entry.i); - onerpl = offsetof(struct nw_info_struct, entryName) + entry.i.nameLen; - if (rpls < onerpl) - break; /* short packet */ - (void)ncp_obtain_nfs_info(server, &entry.i); - rpl += onerpl; - rpls -= onerpl; - entry.volume = entry.i.volNumber; - if (!ncp_fill_cache(file, ctx, ctl, &entry, 0)) - break; - } - } while (more); - vfree(buf); - return; -} - -int ncp_conn_logged_in(struct super_block *sb) -{ - struct ncp_server* server = NCP_SBP(sb); - int result; - - if (ncp_single_volume(server)) { - int len; - struct dentry* dent; - __u32 volNumber; - __le32 dirEntNum; - __le32 DosDirNum; - __u8 __name[NCP_MAXPATHLEN + 1]; - - len = sizeof(__name); - result = ncp_io2vol(server, __name, &len, server->m.mounted_vol, - strlen(server->m.mounted_vol), 1); - if (result) - goto out; - result = -ENOENT; - if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) { - ncp_vdbg("%s not found\n", server->m.mounted_vol); - goto out; - } - dent = sb->s_root; - if (dent) { - struct inode* ino = d_inode(dent); - if (ino) { - ncp_update_known_namespace(server, volNumber, NULL); - NCP_FINFO(ino)->volNumber = volNumber; - NCP_FINFO(ino)->dirEntNum = dirEntNum; - NCP_FINFO(ino)->DosDirNum = DosDirNum; - result = 0; - } else { - ncp_dbg(1, "d_inode(sb->s_root) == NULL!\n"); - } - } else { - ncp_dbg(1, "sb->s_root == NULL!\n"); - } - } else - result = 0; - -out: - return result; -} - -static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -{ - struct ncp_server *server = NCP_SERVER(dir); - struct inode *inode = NULL; - struct ncp_entry_info finfo; - int error, res, len; - __u8 __name[NCP_MAXPATHLEN + 1]; - - error = -EIO; - if (!ncp_conn_valid(server)) - goto finished; - - ncp_vdbg("server lookup for %pd2\n", dentry); - - len = sizeof(__name); - if (ncp_is_server_root(dir)) { - res = ncp_io2vol(server, __name, &len, dentry->d_name.name, - dentry->d_name.len, 1); - if (!res) - res = ncp_lookup_volume(server, __name, &(finfo.i)); - if (!res) - ncp_update_known_namespace(server, finfo.i.volNumber, NULL); - } else { - res = ncp_io2vol(server, __name, &len, dentry->d_name.name, - dentry->d_name.len, !ncp_preserve_case(dir)); - if (!res) - res = ncp_obtain_info(server, dir, __name, &(finfo.i)); - } - ncp_vdbg("looked for %pd2, res=%d\n", dentry, res); - /* - * If we didn't find an entry, make a negative dentry. - */ - if (res) - goto add_entry; - - /* - * Create an inode for the entry. - */ - finfo.opened = 0; - finfo.ino = iunique(dir->i_sb, 2); - finfo.volume = finfo.i.volNumber; - error = -EACCES; - inode = ncp_iget(dir->i_sb, &finfo); - - if (inode) { - ncp_new_dentry(dentry); -add_entry: - d_add(dentry, inode); - error = 0; - } - -finished: - ncp_vdbg("result=%d\n", error); - return ERR_PTR(error); -} - -/* - * This code is common to create, mkdir, and mknod. - */ -static int ncp_instantiate(struct inode *dir, struct dentry *dentry, - struct ncp_entry_info *finfo) -{ - struct inode *inode; - int error = -EINVAL; - - finfo->ino = iunique(dir->i_sb, 2); - inode = ncp_iget(dir->i_sb, finfo); - if (!inode) - goto out_close; - d_instantiate(dentry,inode); - error = 0; -out: - return error; - -out_close: - ncp_vdbg("%pd2 failed, closing file\n", dentry); - ncp_close_file(NCP_SERVER(dir), finfo->file_handle); - goto out; -} - -int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev, __le32 attributes) -{ - struct ncp_server *server = NCP_SERVER(dir); - struct ncp_entry_info finfo; - int error, result, len; - int opmode; - __u8 __name[NCP_MAXPATHLEN + 1]; - - ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode); - - ncp_age_dentry(server, dentry); - len = sizeof(__name); - error = ncp_io2vol(server, __name, &len, dentry->d_name.name, - dentry->d_name.len, !ncp_preserve_case(dir)); - if (error) - goto out; - - error = -EACCES; - - if (S_ISREG(mode) && - (server->m.flags & NCP_MOUNT_EXTRAS) && - (mode & S_IXUGO)) - attributes |= aSYSTEM | aSHARED; - - result = ncp_open_create_file_or_subdir(server, dir, __name, - OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, - attributes, AR_READ | AR_WRITE, &finfo); - opmode = O_RDWR; - if (result) { - result = ncp_open_create_file_or_subdir(server, dir, __name, - OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, - attributes, AR_WRITE, &finfo); - if (result) { - if (result == 0x87) - error = -ENAMETOOLONG; - else if (result < 0) - error = result; - ncp_dbg(1, "%pd2 failed\n", dentry); - goto out; - } - opmode = O_WRONLY; - } - finfo.access = opmode; - if (ncp_is_nfs_extras(server, finfo.volume)) { - finfo.i.nfs.mode = mode; - finfo.i.nfs.rdev = new_encode_dev(rdev); - if (ncp_modify_nfs_info(server, finfo.volume, - finfo.i.dirEntNum, - mode, new_encode_dev(rdev)) != 0) - goto out; - } - - error = ncp_instantiate(dir, dentry, &finfo); -out: - return error; -} - -static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) -{ - return ncp_create_new(dir, dentry, mode, 0, 0); -} - -static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct ncp_entry_info finfo; - struct ncp_server *server = NCP_SERVER(dir); - int error, len; - __u8 __name[NCP_MAXPATHLEN + 1]; - - ncp_dbg(1, "making %pd2\n", dentry); - - ncp_age_dentry(server, dentry); - len = sizeof(__name); - error = ncp_io2vol(server, __name, &len, dentry->d_name.name, - dentry->d_name.len, !ncp_preserve_case(dir)); - if (error) - goto out; - - error = ncp_open_create_file_or_subdir(server, dir, __name, - OC_MODE_CREATE, aDIR, - cpu_to_le16(0xffff), - &finfo); - if (error == 0) { - if (ncp_is_nfs_extras(server, finfo.volume)) { - mode |= S_IFDIR; - finfo.i.nfs.mode = mode; - if (ncp_modify_nfs_info(server, - finfo.volume, - finfo.i.dirEntNum, - mode, 0) != 0) - goto out; - } - error = ncp_instantiate(dir, dentry, &finfo); - } else if (error > 0) { - error = -EACCES; - } -out: - return error; -} - -static int ncp_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct ncp_server *server = NCP_SERVER(dir); - int error, result, len; - __u8 __name[NCP_MAXPATHLEN + 1]; - - ncp_dbg(1, "removing %pd2\n", dentry); - - len = sizeof(__name); - error = ncp_io2vol(server, __name, &len, dentry->d_name.name, - dentry->d_name.len, !ncp_preserve_case(dir)); - if (error) - goto out; - - result = ncp_del_file_or_subdir(server, dir, __name); - switch (result) { - case 0x00: - error = 0; - break; - case 0x85: /* unauthorized to delete file */ - case 0x8A: /* unauthorized to delete file */ - error = -EACCES; - break; - case 0x8F: - case 0x90: /* read only */ - error = -EPERM; - break; - case 0x9F: /* in use by another client */ - error = -EBUSY; - break; - case 0xA0: /* directory not empty */ - error = -ENOTEMPTY; - break; - case 0xFF: /* someone deleted file */ - error = -ENOENT; - break; - default: - error = result < 0 ? result : -EACCES; - break; - } -out: - return error; -} - -static int ncp_unlink(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct ncp_server *server; - int error; - - server = NCP_SERVER(dir); - ncp_dbg(1, "unlinking %pd2\n", dentry); - - /* - * Check whether to close the file ... - */ - if (inode) { - ncp_vdbg("closing file\n"); - ncp_make_closed(inode); - } - - error = ncp_del_file_or_subdir2(server, dentry); -#ifdef CONFIG_NCPFS_STRONG - /* 9C is Invalid path.. It should be 8F, 90 - read only, but - it is not :-( */ - if ((error == 0x9C || error == 0x90) && server->m.flags & NCP_MOUNT_STRONG) { /* R/O */ - error = ncp_force_unlink(dir, dentry); - } -#endif - switch (error) { - case 0x00: - ncp_dbg(1, "removed %pd2\n", dentry); - break; - case 0x85: - case 0x8A: - error = -EACCES; - break; - case 0x8D: /* some files in use */ - case 0x8E: /* all files in use */ - error = -EBUSY; - break; - case 0x8F: /* some read only */ - case 0x90: /* all read only */ - case 0x9C: /* !!! returned when in-use or read-only by NW4 */ - error = -EPERM; - break; - case 0xFF: - error = -ENOENT; - break; - default: - error = error < 0 ? error : -EACCES; - break; - } - return error; -} - -static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - struct ncp_server *server = NCP_SERVER(old_dir); - int error; - int old_len, new_len; - __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; - - if (flags) - return -EINVAL; - - ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry); - - ncp_age_dentry(server, old_dentry); - ncp_age_dentry(server, new_dentry); - - old_len = sizeof(__old_name); - error = ncp_io2vol(server, __old_name, &old_len, - old_dentry->d_name.name, old_dentry->d_name.len, - !ncp_preserve_case(old_dir)); - if (error) - goto out; - - new_len = sizeof(__new_name); - error = ncp_io2vol(server, __new_name, &new_len, - new_dentry->d_name.name, new_dentry->d_name.len, - !ncp_preserve_case(new_dir)); - if (error) - goto out; - - error = ncp_ren_or_mov_file_or_subdir(server, old_dir, __old_name, - new_dir, __new_name); -#ifdef CONFIG_NCPFS_STRONG - if ((error == 0x90 || error == 0x8B || error == -EACCES) && - server->m.flags & NCP_MOUNT_STRONG) { /* RO */ - error = ncp_force_rename(old_dir, old_dentry, __old_name, - new_dir, new_dentry, __new_name); - } -#endif - switch (error) { - case 0x00: - ncp_dbg(1, "renamed %pd -> %pd\n", - old_dentry, new_dentry); - ncp_d_prune(old_dentry); - ncp_d_prune(new_dentry); - break; - case 0x9E: - error = -ENAMETOOLONG; - break; - case 0xFF: - error = -ENOENT; - break; - default: - error = error < 0 ? error : -EACCES; - break; - } -out: - return error; -} - -static int ncp_mknod(struct inode * dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { - ncp_dbg(1, "mode = 0%ho\n", mode); - return ncp_create_new(dir, dentry, mode, rdev, 0); - } - return -EPERM; /* Strange, but true */ -} - -/* The following routines are taken directly from msdos-fs */ - -/* Linear day numbers of the respective 1sts in non-leap years. */ - -static int day_n[] = -{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; -/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ - -static int utc2local(int time) -{ - return time - sys_tz.tz_minuteswest * 60; -} - -static int local2utc(int time) -{ - return time + sys_tz.tz_minuteswest * 60; -} - -/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ -int -ncp_date_dos2unix(__le16 t, __le16 d) -{ - unsigned short time = le16_to_cpu(t), date = le16_to_cpu(d); - int month, year, secs; - - /* first subtract and mask after that... Otherwise, if - date == 0, bad things happen */ - month = ((date >> 5) - 1) & 15; - year = date >> 9; - secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + - 86400 * ((date & 31) - 1 + day_n[month] + (year / 4) + - year * 365 - ((year & 3) == 0 && month < 2 ? 1 : 0) + 3653); - /* days since 1.1.70 plus 80's leap day */ - return local2utc(secs); -} - - -/* Convert linear UNIX date to a MS-DOS time/date pair. */ -void -ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date) -{ - int day, year, nl_day, month; - - unix_date = utc2local(unix_date); - *time = cpu_to_le16( - (unix_date % 60) / 2 + (((unix_date / 60) % 60) << 5) + - (((unix_date / 3600) % 24) << 11)); - day = unix_date / 86400 - 3652; - year = day / 365; - if ((year + 3) / 4 + 365 * year > day) - year--; - day -= (year + 3) / 4 + 365 * year; - if (day == 59 && !(year & 3)) { - nl_day = day; - month = 2; - } else { - nl_day = (year & 3) || day <= 59 ? day : day - 1; - for (month = 1; month < 12; month++) - if (day_n[month] > nl_day) - break; - } - *date = cpu_to_le16(nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9)); -} diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c deleted file mode 100644 index 8f8cc0334ddd..000000000000 --- a/fs/ncpfs/file.c +++ /dev/null @@ -1,263 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * file.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/uaccess.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <linux/sched.h> - -#include "ncp_fs.h" - -static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - return file_write_and_wait_range(file, start, end); -} - -/* - * Open a file with the specified read/write mode. - */ -int ncp_make_open(struct inode *inode, int right) -{ - int error; - int access; - - error = -EINVAL; - if (!inode) { - pr_err("%s: got NULL inode\n", __func__); - goto out; - } - - ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n", - atomic_read(&NCP_FINFO(inode)->opened), - NCP_FINFO(inode)->volNumber, - NCP_FINFO(inode)->dirEntNum); - error = -EACCES; - mutex_lock(&NCP_FINFO(inode)->open_mutex); - if (!atomic_read(&NCP_FINFO(inode)->opened)) { - struct ncp_entry_info finfo; - int result; - - /* tries max. rights */ - finfo.access = O_RDWR; - result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), - inode, NULL, OC_MODE_OPEN, - 0, AR_READ | AR_WRITE, &finfo); - if (!result) - goto update; - /* RDWR did not succeeded, try readonly or writeonly as requested */ - switch (right) { - case O_RDONLY: - finfo.access = O_RDONLY; - result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), - inode, NULL, OC_MODE_OPEN, - 0, AR_READ, &finfo); - break; - case O_WRONLY: - finfo.access = O_WRONLY; - result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), - inode, NULL, OC_MODE_OPEN, - 0, AR_WRITE, &finfo); - break; - } - if (result) { - ncp_vdbg("failed, result=%d\n", result); - goto out_unlock; - } - /* - * Update the inode information. - */ - update: - ncp_update_inode(inode, &finfo); - atomic_set(&NCP_FINFO(inode)->opened, 1); - } - - access = NCP_FINFO(inode)->access; - ncp_vdbg("file open, access=%x\n", access); - if (access == right || access == O_RDWR) { - atomic_inc(&NCP_FINFO(inode)->opened); - error = 0; - } - -out_unlock: - mutex_unlock(&NCP_FINFO(inode)->open_mutex); -out: - return error; -} - -static ssize_t -ncp_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - size_t already_read = 0; - off_t pos = iocb->ki_pos; - size_t bufsize; - int error; - void *freepage; - size_t freelen; - - ncp_dbg(1, "enter %pD2\n", file); - - if (!iov_iter_count(to)) - return 0; - if (pos > inode->i_sb->s_maxbytes) - return 0; - iov_iter_truncate(to, inode->i_sb->s_maxbytes - pos); - - error = ncp_make_open(inode, O_RDONLY); - if (error) { - ncp_dbg(1, "open failed, error=%d\n", error); - return error; - } - - bufsize = NCP_SERVER(inode)->buffer_size; - - error = -EIO; - freelen = ncp_read_bounce_size(bufsize); - freepage = vmalloc(freelen); - if (!freepage) - goto outrel; - error = 0; - /* First read in as much as possible for each bufsize. */ - while (iov_iter_count(to)) { - int read_this_time; - size_t to_read = min_t(size_t, - bufsize - (pos % bufsize), - iov_iter_count(to)); - - error = ncp_read_bounce(NCP_SERVER(inode), - NCP_FINFO(inode)->file_handle, - pos, to_read, to, &read_this_time, - freepage, freelen); - if (error) { - error = -EIO; /* NW errno -> Linux errno */ - break; - } - pos += read_this_time; - already_read += read_this_time; - - if (read_this_time != to_read) - break; - } - vfree(freepage); - - iocb->ki_pos = pos; - - file_accessed(file); - - ncp_dbg(1, "exit %pD2\n", file); -outrel: - ncp_inode_close(inode); - return already_read ? already_read : error; -} - -static ssize_t -ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - size_t already_written = 0; - size_t bufsize; - int errno; - void *bouncebuffer; - off_t pos; - - ncp_dbg(1, "enter %pD2\n", file); - errno = generic_write_checks(iocb, from); - if (errno <= 0) - return errno; - - errno = ncp_make_open(inode, O_WRONLY); - if (errno) { - ncp_dbg(1, "open failed, error=%d\n", errno); - return errno; - } - bufsize = NCP_SERVER(inode)->buffer_size; - - errno = file_update_time(file); - if (errno) - goto outrel; - - bouncebuffer = vmalloc(bufsize); - if (!bouncebuffer) { - errno = -EIO; /* -ENOMEM */ - goto outrel; - } - pos = iocb->ki_pos; - while (iov_iter_count(from)) { - int written_this_time; - size_t to_write = min_t(size_t, - bufsize - (pos % bufsize), - iov_iter_count(from)); - - if (!copy_from_iter_full(bouncebuffer, to_write, from)) { - errno = -EFAULT; - break; - } - if (ncp_write_kernel(NCP_SERVER(inode), - NCP_FINFO(inode)->file_handle, - pos, to_write, bouncebuffer, &written_this_time) != 0) { - errno = -EIO; - break; - } - pos += written_this_time; - already_written += written_this_time; - - if (written_this_time != to_write) - break; - } - vfree(bouncebuffer); - - iocb->ki_pos = pos; - - if (pos > i_size_read(inode)) { - inode_lock(inode); - if (pos > i_size_read(inode)) - i_size_write(inode, pos); - inode_unlock(inode); - } - ncp_dbg(1, "exit %pD2\n", file); -outrel: - ncp_inode_close(inode); - return already_written ? already_written : errno; -} - -static int ncp_release(struct inode *inode, struct file *file) { - if (ncp_make_closed(inode)) { - ncp_dbg(1, "failed to close\n"); - } - return 0; -} - -const struct file_operations ncp_file_operations = -{ - .llseek = generic_file_llseek, - .read_iter = ncp_file_read_iter, - .write_iter = ncp_file_write_iter, - .unlocked_ioctl = ncp_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ncp_compat_ioctl, -#endif - .mmap = ncp_mmap, - .release = ncp_release, - .fsync = ncp_fsync, -}; - -const struct inode_operations ncp_file_inode_operations = -{ - .setattr = ncp_notify_change, -}; diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c deleted file mode 100644 index 5c941bef14c4..000000000000 --- a/fs/ncpfs/getopt.c +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * getopt.c - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/string.h> - -#include <asm/errno.h> - -#include "getopt.h" - -/** - * ncp_getopt - option parser - * @caller: name of the caller, for error messages - * @options: the options string - * @opts: an array of &struct option entries controlling parser operations - * @optopt: output; will contain the current option - * @optarg: output; will contain the value (if one exists) - * @value: output; may be NULL; will be overwritten with the integer value - * of the current argument. - * - * Helper to parse options on the format used by mount ("a=b,c=d,e,f"). - * Returns opts->val if a matching entry in the 'opts' array is found, - * 0 when no more tokens are found, -1 if an error is encountered. - */ -int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, - char **optopt, char **optarg, unsigned long *value) -{ - char *token; - char *val; - - do { - if ((token = strsep(options, ",")) == NULL) - return 0; - } while (*token == '\0'); - if (optopt) - *optopt = token; - - if ((val = strchr (token, '=')) != NULL) { - *val++ = 0; - } - *optarg = val; - for (; opts->name; opts++) { - if (!strcmp(opts->name, token)) { - if (!val) { - if (opts->has_arg & OPT_NOPARAM) { - return opts->val; - } - pr_info("%s: the %s option requires an argument\n", - caller, token); - return -EINVAL; - } - if (opts->has_arg & OPT_INT) { - int rc = kstrtoul(val, 0, value); - - if (rc) { - pr_info("%s: invalid numeric value in %s=%s\n", - caller, token, val); - return rc; - } - return opts->val; - } - if (opts->has_arg & OPT_STRING) { - return opts->val; - } - pr_info("%s: unexpected argument %s to the %s option\n", - caller, val, token); - return -EINVAL; - } - } - pr_info("%s: Unrecognized mount option %s\n", caller, token); - return -EOPNOTSUPP; -} diff --git a/fs/ncpfs/getopt.h b/fs/ncpfs/getopt.h deleted file mode 100644 index 30f0da317670..000000000000 --- a/fs/ncpfs/getopt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_GETOPT_H -#define _LINUX_GETOPT_H - -#define OPT_NOPARAM 1 -#define OPT_INT 2 -#define OPT_STRING 4 -struct ncp_option { - const char *name; - unsigned int has_arg; - int val; -}; - -extern int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, - char **optopt, char **optarg, unsigned long *value); - -#endif /* _LINUX_GETOPT_H */ diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c deleted file mode 100644 index 41de88cdc053..000000000000 --- a/fs/ncpfs/inode.c +++ /dev/null @@ -1,1066 +0,0 @@ -/* - * inode.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Modified for big endian by J.F. Chadima and David S. Miller - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * Modified 1998 Wolfram Pienkoss for NLS - * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/module.h> - -#include <linux/uaccess.h> -#include <asm/byteorder.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/fcntl.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/vfs.h> -#include <linux/mount.h> -#include <linux/seq_file.h> -#include <linux/sched/signal.h> -#include <linux/namei.h> - -#include <net/sock.h> - -#include "ncp_fs.h" -#include "getopt.h" - -#define NCP_DEFAULT_FILE_MODE 0600 -#define NCP_DEFAULT_DIR_MODE 0700 -#define NCP_DEFAULT_TIME_OUT 10 -#define NCP_DEFAULT_RETRY_COUNT 20 - -static void ncp_evict_inode(struct inode *); -static void ncp_put_super(struct super_block *); -static int ncp_statfs(struct dentry *, struct kstatfs *); -static int ncp_show_options(struct seq_file *, struct dentry *); - -static struct kmem_cache * ncp_inode_cachep; - -static struct inode *ncp_alloc_inode(struct super_block *sb) -{ - struct ncp_inode_info *ei; - ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void ncp_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); -} - -static void ncp_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ncp_i_callback); -} - -static void init_once(void *foo) -{ - struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; - - mutex_init(&ei->open_mutex); - inode_init_once(&ei->vfs_inode); -} - -static int init_inodecache(void) -{ - ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", - sizeof(struct ncp_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); - if (ncp_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(ncp_inode_cachep); -} - -static int ncp_remount(struct super_block *sb, int *flags, char* data) -{ - sync_filesystem(sb); - *flags |= SB_NODIRATIME; - return 0; -} - -static const struct super_operations ncp_sops = -{ - .alloc_inode = ncp_alloc_inode, - .destroy_inode = ncp_destroy_inode, - .drop_inode = generic_delete_inode, - .evict_inode = ncp_evict_inode, - .put_super = ncp_put_super, - .statfs = ncp_statfs, - .remount_fs = ncp_remount, - .show_options = ncp_show_options, -}; - -/* - * Fill in the ncpfs-specific information in the inode. - */ -static void ncp_update_dirent(struct inode *inode, struct ncp_entry_info *nwinfo) -{ - NCP_FINFO(inode)->DosDirNum = nwinfo->i.DosDirNum; - NCP_FINFO(inode)->dirEntNum = nwinfo->i.dirEntNum; - NCP_FINFO(inode)->volNumber = nwinfo->volume; -} - -void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) -{ - ncp_update_dirent(inode, nwinfo); - NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; - NCP_FINFO(inode)->access = nwinfo->access; - memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle, - sizeof(nwinfo->file_handle)); - ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n", - nwinfo->i.entryName, NCP_FINFO(inode)->volNumber, - NCP_FINFO(inode)->dirEntNum); -} - -static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi) -{ - /* NFS namespace mode overrides others if it's set. */ - ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode); - if (nwi->nfs.mode) { - /* XXX Security? */ - inode->i_mode = nwi->nfs.mode; - } - - inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT; - - inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate); - inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate); - inode->i_atime.tv_sec = ncp_date_dos2unix(0, nwi->lastAccessDate); - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; -} - -static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo) -{ - struct nw_info_struct *nwi = &nwinfo->i; - struct ncp_server *server = NCP_SERVER(inode); - - if (nwi->attributes & aDIR) { - inode->i_mode = server->m.dir_mode; - /* for directories dataStreamSize seems to be some - Object ID ??? */ - i_size_write(inode, NCP_BLOCK_SIZE); - } else { - u32 size; - - inode->i_mode = server->m.file_mode; - size = le32_to_cpu(nwi->dataStreamSize); - i_size_write(inode, size); -#ifdef CONFIG_NCPFS_EXTRAS - if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) - && (nwi->attributes & aSHARED)) { - switch (nwi->attributes & (aHIDDEN|aSYSTEM)) { - case aHIDDEN: - if (server->m.flags & NCP_MOUNT_SYMLINKS) { - if (/* (size >= NCP_MIN_SYMLINK_SIZE) - && */ (size <= NCP_MAX_SYMLINK_SIZE)) { - inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; - NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK; - break; - } - } - /* FALLTHROUGH */ - case 0: - if (server->m.flags & NCP_MOUNT_EXTRAS) - inode->i_mode |= S_IRUGO; - break; - case aSYSTEM: - if (server->m.flags & NCP_MOUNT_EXTRAS) - inode->i_mode |= (inode->i_mode >> 2) & S_IXUGO; - break; - /* case aSYSTEM|aHIDDEN: */ - default: - /* reserved combination */ - break; - } - } -#endif - } - if (nwi->attributes & aRONLY) inode->i_mode &= ~S_IWUGO; -} - -void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo) -{ - NCP_FINFO(inode)->flags = 0; - if (!atomic_read(&NCP_FINFO(inode)->opened)) { - NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; - ncp_update_attrs(inode, nwinfo); - } - - ncp_update_dates(inode, &nwinfo->i); - ncp_update_dirent(inode, nwinfo); -} - -/* - * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes. - */ -static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) -{ - struct ncp_server *server = NCP_SERVER(inode); - - NCP_FINFO(inode)->flags = 0; - - ncp_update_attrs(inode, nwinfo); - - ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode); - - set_nlink(inode, 1); - inode->i_uid = server->m.uid; - inode->i_gid = server->m.gid; - - ncp_update_dates(inode, &nwinfo->i); - ncp_update_inode(inode, nwinfo); -} - -#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) -static const struct inode_operations ncp_symlink_inode_operations = { - .get_link = page_get_link, - .setattr = ncp_notify_change, -}; -#endif - -/* - * Get a new inode. - */ -struct inode * -ncp_iget(struct super_block *sb, struct ncp_entry_info *info) -{ - struct inode *inode; - - if (info == NULL) { - pr_err("%s: info is NULL\n", __func__); - return NULL; - } - - inode = new_inode(sb); - if (inode) { - atomic_set(&NCP_FINFO(inode)->opened, info->opened); - - inode->i_ino = info->ino; - ncp_set_attr(inode, info); - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ncp_file_inode_operations; - inode->i_fop = &ncp_file_operations; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ncp_dir_inode_operations; - inode->i_fop = &ncp_dir_operations; -#ifdef CONFIG_NCPFS_NFS_NS - } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - init_special_inode(inode, inode->i_mode, - new_decode_dev(info->i.nfs.rdev)); -#endif -#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &ncp_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_data.a_ops = &ncp_symlink_aops; -#endif - } else { - make_bad_inode(inode); - } - insert_inode_hash(inode); - } else - pr_err("%s: iget failed!\n", __func__); - return inode; -} - -static void -ncp_evict_inode(struct inode *inode) -{ - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - if (S_ISDIR(inode->i_mode)) { - ncp_dbg(2, "put directory %ld\n", inode->i_ino); - } - - if (ncp_make_closed(inode) != 0) { - /* We can't do anything but complain. */ - pr_err("%s: could not close\n", __func__); - } -} - -static void ncp_stop_tasks(struct ncp_server *server) { - struct sock* sk = server->ncp_sock->sk; - - lock_sock(sk); - sk->sk_error_report = server->error_report; - sk->sk_data_ready = server->data_ready; - sk->sk_write_space = server->write_space; - release_sock(sk); - del_timer_sync(&server->timeout_tm); - - flush_work(&server->rcv.tq); - if (sk->sk_socket->type == SOCK_STREAM) - flush_work(&server->tx.tq); - else - flush_work(&server->timeout_tq); -} - -static int ncp_show_options(struct seq_file *seq, struct dentry *root) -{ - struct ncp_server *server = NCP_SBP(root->d_sb); - unsigned int tmp; - - if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID)) - seq_printf(seq, ",uid=%u", - from_kuid_munged(&init_user_ns, server->m.uid)); - if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID)) - seq_printf(seq, ",gid=%u", - from_kgid_munged(&init_user_ns, server->m.gid)); - if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID)) - seq_printf(seq, ",owner=%u", - from_kuid_munged(&init_user_ns, server->m.mounted_uid)); - tmp = server->m.file_mode & S_IALLUGO; - if (tmp != NCP_DEFAULT_FILE_MODE) - seq_printf(seq, ",mode=0%o", tmp); - tmp = server->m.dir_mode & S_IALLUGO; - if (tmp != NCP_DEFAULT_DIR_MODE) - seq_printf(seq, ",dirmode=0%o", tmp); - if (server->m.time_out != NCP_DEFAULT_TIME_OUT * HZ / 100) { - tmp = server->m.time_out * 100 / HZ; - seq_printf(seq, ",timeout=%u", tmp); - } - if (server->m.retry_count != NCP_DEFAULT_RETRY_COUNT) - seq_printf(seq, ",retry=%u", server->m.retry_count); - if (server->m.flags != 0) - seq_printf(seq, ",flags=%lu", server->m.flags); - if (server->m.wdog_pid != NULL) - seq_printf(seq, ",wdogpid=%u", pid_vnr(server->m.wdog_pid)); - - return 0; -} - -static const struct ncp_option ncp_opts[] = { - { "uid", OPT_INT, 'u' }, - { "gid", OPT_INT, 'g' }, - { "owner", OPT_INT, 'o' }, - { "mode", OPT_INT, 'm' }, - { "dirmode", OPT_INT, 'd' }, - { "timeout", OPT_INT, 't' }, - { "retry", OPT_INT, 'r' }, - { "flags", OPT_INT, 'f' }, - { "wdogpid", OPT_INT, 'w' }, - { "ncpfd", OPT_INT, 'n' }, - { "infofd", OPT_INT, 'i' }, /* v5 */ - { "version", OPT_INT, 'v' }, - { NULL, 0, 0 } }; - -static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) { - int optval; - char *optarg; - unsigned long optint; - int version = 0; - int ret; - - data->flags = 0; - data->int_flags = 0; - data->mounted_uid = GLOBAL_ROOT_UID; - data->wdog_pid = NULL; - data->ncp_fd = ~0; - data->time_out = NCP_DEFAULT_TIME_OUT; - data->retry_count = NCP_DEFAULT_RETRY_COUNT; - data->uid = GLOBAL_ROOT_UID; - data->gid = GLOBAL_ROOT_GID; - data->file_mode = NCP_DEFAULT_FILE_MODE; - data->dir_mode = NCP_DEFAULT_DIR_MODE; - data->info_fd = -1; - data->mounted_vol[0] = 0; - - while ((optval = ncp_getopt("ncpfs", &options, ncp_opts, NULL, &optarg, &optint)) != 0) { - ret = optval; - if (ret < 0) - goto err; - switch (optval) { - case 'u': - data->uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->uid)) { - ret = -EINVAL; - goto err; - } - break; - case 'g': - data->gid = make_kgid(current_user_ns(), optint); - if (!gid_valid(data->gid)) { - ret = -EINVAL; - goto err; - } - break; - case 'o': - data->mounted_uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->mounted_uid)) { - ret = -EINVAL; - goto err; - } - break; - case 'm': - data->file_mode = optint; - break; - case 'd': - data->dir_mode = optint; - break; - case 't': - data->time_out = optint; - break; - case 'r': - data->retry_count = optint; - break; - case 'f': - data->flags = optint; - break; - case 'w': - data->wdog_pid = find_get_pid(optint); - break; - case 'n': - data->ncp_fd = optint; - break; - case 'i': - data->info_fd = optint; - break; - case 'v': - ret = -ECHRNG; - if (optint < NCP_MOUNT_VERSION_V4) - goto err; - if (optint > NCP_MOUNT_VERSION_V5) - goto err; - version = optint; - break; - - } - } - return 0; -err: - put_pid(data->wdog_pid); - data->wdog_pid = NULL; - return ret; -} - -static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) -{ - struct ncp_mount_data_kernel data; - struct ncp_server *server; - struct inode *root_inode; - struct socket *sock; - int error; - int default_bufsize; -#ifdef CONFIG_NCPFS_PACKET_SIGNING - int options; -#endif - struct ncp_entry_info finfo; - - memset(&data, 0, sizeof(data)); - server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); - if (!server) - return -ENOMEM; - sb->s_fs_info = server; - - error = -EFAULT; - if (raw_data == NULL) - goto out; - switch (*(int*)raw_data) { - case NCP_MOUNT_VERSION: - { - struct ncp_mount_data* md = (struct ncp_mount_data*)raw_data; - - data.flags = md->flags; - data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; - data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); - data.wdog_pid = find_get_pid(md->wdog_pid); - data.ncp_fd = md->ncp_fd; - data.time_out = md->time_out; - data.retry_count = md->retry_count; - data.uid = make_kuid(current_user_ns(), md->uid); - data.gid = make_kgid(current_user_ns(), md->gid); - data.file_mode = md->file_mode; - data.dir_mode = md->dir_mode; - data.info_fd = -1; - memcpy(data.mounted_vol, md->mounted_vol, - NCP_VOLNAME_LEN+1); - } - break; - case NCP_MOUNT_VERSION_V4: - { - struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; - - data.flags = md->flags; - data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); - data.wdog_pid = find_get_pid(md->wdog_pid); - data.ncp_fd = md->ncp_fd; - data.time_out = md->time_out; - data.retry_count = md->retry_count; - data.uid = make_kuid(current_user_ns(), md->uid); - data.gid = make_kgid(current_user_ns(), md->gid); - data.file_mode = md->file_mode; - data.dir_mode = md->dir_mode; - data.info_fd = -1; - } - break; - default: - error = -ECHRNG; - if (memcmp(raw_data, "vers", 4) == 0) { - error = ncp_parse_options(&data, raw_data); - } - if (error) - goto out; - break; - } - error = -EINVAL; - if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || - !gid_valid(data.gid)) - goto out; - sock = sockfd_lookup(data.ncp_fd, &error); - if (!sock) - goto out; - - if (sock->type == SOCK_STREAM) - default_bufsize = 0xF000; - else - default_bufsize = 1024; - - sb->s_flags |= SB_NODIRATIME; /* probably even noatime */ - sb->s_maxbytes = 0xFFFFFFFFU; - sb->s_blocksize = 1024; /* Eh... Is this correct? */ - sb->s_blocksize_bits = 10; - sb->s_magic = NCP_SUPER_MAGIC; - sb->s_op = &ncp_sops; - sb->s_d_op = &ncp_dentry_operations; - - server = NCP_SBP(sb); - memset(server, 0, sizeof(*server)); - - error = super_setup_bdi(sb); - if (error) - goto out_fput; - - server->ncp_sock = sock; - - if (data.info_fd != -1) { - struct socket *info_sock = sockfd_lookup(data.info_fd, &error); - if (!info_sock) - goto out_fput; - server->info_sock = info_sock; - error = -EBADFD; - if (info_sock->type != SOCK_STREAM) - goto out_fput2; - } - -/* server->lock = 0; */ - mutex_init(&server->mutex); - server->packet = NULL; -/* server->buffer_size = 0; */ -/* server->conn_status = 0; */ -/* server->root_dentry = NULL; */ -/* server->root_setuped = 0; */ - mutex_init(&server->root_setup_lock); -#ifdef CONFIG_NCPFS_PACKET_SIGNING -/* server->sign_wanted = 0; */ -/* server->sign_active = 0; */ -#endif - init_rwsem(&server->auth_rwsem); - server->auth.auth_type = NCP_AUTH_NONE; -/* server->auth.object_name_len = 0; */ -/* server->auth.object_name = NULL; */ -/* server->auth.object_type = 0; */ -/* server->priv.len = 0; */ -/* server->priv.data = NULL; */ - - server->m = data; - /* Although anything producing this is buggy, it happens - now because of PATH_MAX changes.. */ - if (server->m.time_out < 1) { - server->m.time_out = 10; - pr_info("You need to recompile your ncpfs utils..\n"); - } - server->m.time_out = server->m.time_out * HZ / 100; - server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG; - server->m.dir_mode = (server->m.dir_mode & S_IRWXUGO) | S_IFDIR; - -#ifdef CONFIG_NCPFS_NLS - /* load the default NLS charsets */ - server->nls_vol = load_nls_default(); - server->nls_io = load_nls_default(); -#endif /* CONFIG_NCPFS_NLS */ - - atomic_set(&server->dentry_ttl, 0); /* no caching */ - - INIT_LIST_HEAD(&server->tx.requests); - mutex_init(&server->rcv.creq_mutex); - server->tx.creq = NULL; - server->rcv.creq = NULL; - - timer_setup(&server->timeout_tm, ncpdgram_timeout_call, 0); -#undef NCP_PACKET_SIZE -#define NCP_PACKET_SIZE 131072 - error = -ENOMEM; - server->packet_size = NCP_PACKET_SIZE; - server->packet = vmalloc(NCP_PACKET_SIZE); - if (server->packet == NULL) - goto out_nls; - server->txbuf = vmalloc(NCP_PACKET_SIZE); - if (server->txbuf == NULL) - goto out_packet; - server->rxbuf = vmalloc(NCP_PACKET_SIZE); - if (server->rxbuf == NULL) - goto out_txbuf; - - lock_sock(sock->sk); - server->data_ready = sock->sk->sk_data_ready; - server->write_space = sock->sk->sk_write_space; - server->error_report = sock->sk->sk_error_report; - sock->sk->sk_user_data = server; - sock->sk->sk_data_ready = ncp_tcp_data_ready; - sock->sk->sk_error_report = ncp_tcp_error_report; - if (sock->type == SOCK_STREAM) { - server->rcv.ptr = (unsigned char*)&server->rcv.buf; - server->rcv.len = 10; - server->rcv.state = 0; - INIT_WORK(&server->rcv.tq, ncp_tcp_rcv_proc); - INIT_WORK(&server->tx.tq, ncp_tcp_tx_proc); - sock->sk->sk_write_space = ncp_tcp_write_space; - } else { - INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc); - INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc); - } - release_sock(sock->sk); - - ncp_lock_server(server); - error = ncp_connect(server); - ncp_unlock_server(server); - if (error < 0) - goto out_rxbuf; - ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb)); - - error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */ -#ifdef CONFIG_NCPFS_PACKET_SIGNING - if (ncp_negotiate_size_and_options(server, default_bufsize, - NCP_DEFAULT_OPTIONS, &(server->buffer_size), &options) == 0) - { - if (options != NCP_DEFAULT_OPTIONS) - { - if (ncp_negotiate_size_and_options(server, - default_bufsize, - options & 2, - &(server->buffer_size), &options) != 0) - - { - goto out_disconnect; - } - } - ncp_lock_server(server); - if (options & 2) - server->sign_wanted = 1; - ncp_unlock_server(server); - } - else -#endif /* CONFIG_NCPFS_PACKET_SIGNING */ - if (ncp_negotiate_buffersize(server, default_bufsize, - &(server->buffer_size)) != 0) - goto out_disconnect; - ncp_dbg(1, "bufsize = %d\n", server->buffer_size); - - memset(&finfo, 0, sizeof(finfo)); - finfo.i.attributes = aDIR; - finfo.i.dataStreamSize = 0; /* ignored */ - finfo.i.dirEntNum = 0; - finfo.i.DosDirNum = 0; -#ifdef CONFIG_NCPFS_SMALLDOS - finfo.i.NSCreator = NW_NS_DOS; -#endif - finfo.volume = NCP_NUMBER_OF_VOLUMES; - /* set dates of mountpoint to Jan 1, 1986; 00:00 */ - finfo.i.creationTime = finfo.i.modifyTime - = cpu_to_le16(0x0000); - finfo.i.creationDate = finfo.i.modifyDate - = finfo.i.lastAccessDate - = cpu_to_le16(0x0C21); - finfo.i.nameLen = 0; - finfo.i.entryName[0] = '\0'; - - finfo.opened = 0; - finfo.ino = 2; /* tradition */ - - server->name_space[finfo.volume] = NW_NS_DOS; - - error = -ENOMEM; - root_inode = ncp_iget(sb, &finfo); - if (!root_inode) - goto out_disconnect; - ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber); - sb->s_root = d_make_root(root_inode); - if (!sb->s_root) - goto out_disconnect; - return 0; - -out_disconnect: - ncp_lock_server(server); - ncp_disconnect(server); - ncp_unlock_server(server); -out_rxbuf: - ncp_stop_tasks(server); - vfree(server->rxbuf); -out_txbuf: - vfree(server->txbuf); -out_packet: - vfree(server->packet); -out_nls: -#ifdef CONFIG_NCPFS_NLS - unload_nls(server->nls_io); - unload_nls(server->nls_vol); -#endif - mutex_destroy(&server->rcv.creq_mutex); - mutex_destroy(&server->root_setup_lock); - mutex_destroy(&server->mutex); -out_fput2: - if (server->info_sock) - sockfd_put(server->info_sock); -out_fput: - sockfd_put(sock); -out: - put_pid(data.wdog_pid); - sb->s_fs_info = NULL; - kfree(server); - return error; -} - -static void delayed_free(struct rcu_head *p) -{ - struct ncp_server *server = container_of(p, struct ncp_server, rcu); -#ifdef CONFIG_NCPFS_NLS - /* unload the NLS charsets */ - unload_nls(server->nls_vol); - unload_nls(server->nls_io); -#endif /* CONFIG_NCPFS_NLS */ - kfree(server); -} - -static void ncp_put_super(struct super_block *sb) -{ - struct ncp_server *server = NCP_SBP(sb); - - ncp_lock_server(server); - ncp_disconnect(server); - ncp_unlock_server(server); - - ncp_stop_tasks(server); - - mutex_destroy(&server->rcv.creq_mutex); - mutex_destroy(&server->root_setup_lock); - mutex_destroy(&server->mutex); - - if (server->info_sock) - sockfd_put(server->info_sock); - sockfd_put(server->ncp_sock); - kill_pid(server->m.wdog_pid, SIGTERM, 1); - put_pid(server->m.wdog_pid); - - kfree(server->priv.data); - kfree(server->auth.object_name); - vfree(server->rxbuf); - vfree(server->txbuf); - vfree(server->packet); - call_rcu(&server->rcu, delayed_free); -} - -static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct dentry* d; - struct inode* i; - struct ncp_inode_info* ni; - struct ncp_server* s; - struct ncp_volume_info vi; - struct super_block *sb = dentry->d_sb; - int err; - __u8 dh; - - d = sb->s_root; - if (!d) { - goto dflt; - } - i = d_inode(d); - if (!i) { - goto dflt; - } - ni = NCP_FINFO(i); - if (!ni) { - goto dflt; - } - s = NCP_SBP(sb); - if (!s) { - goto dflt; - } - if (!s->m.mounted_vol[0]) { - goto dflt; - } - - err = ncp_dirhandle_alloc(s, ni->volNumber, ni->DosDirNum, &dh); - if (err) { - goto dflt; - } - err = ncp_get_directory_info(s, dh, &vi); - ncp_dirhandle_free(s, dh); - if (err) { - goto dflt; - } - buf->f_type = NCP_SUPER_MAGIC; - buf->f_bsize = vi.sectors_per_block * 512; - buf->f_blocks = vi.total_blocks; - buf->f_bfree = vi.free_blocks; - buf->f_bavail = vi.free_blocks; - buf->f_files = vi.total_dir_entries; - buf->f_ffree = vi.available_dir_entries; - buf->f_namelen = 12; - return 0; - - /* We cannot say how much disk space is left on a mounted - NetWare Server, because free space is distributed over - volumes, and the current user might have disk quotas. So - free space is not that simple to determine. Our decision - here is to err conservatively. */ - -dflt:; - buf->f_type = NCP_SUPER_MAGIC; - buf->f_bsize = NCP_BLOCK_SIZE; - buf->f_blocks = 0; - buf->f_bfree = 0; - buf->f_bavail = 0; - buf->f_namelen = 12; - return 0; -} - -int ncp_notify_change(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int result = 0; - __le32 info_mask; - struct nw_modify_dos_info info; - struct ncp_server *server; - - result = -EIO; - - server = NCP_SERVER(inode); - if (!server) /* How this could happen? */ - goto out; - - result = -EPERM; - if (IS_DEADDIR(d_inode(dentry))) - goto out; - - /* ageing the dentry to force validation */ - ncp_age_dentry(server, dentry); - - result = setattr_prepare(dentry, attr); - if (result < 0) - goto out; - - result = -EPERM; - if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid)) - goto out; - - if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid)) - goto out; - - if (((attr->ia_valid & ATTR_MODE) && - (attr->ia_mode & - ~(S_IFREG | S_IFDIR | S_IRWXUGO)))) - goto out; - - info_mask = 0; - memset(&info, 0, sizeof(info)); - -#if 1 - if ((attr->ia_valid & ATTR_MODE) != 0) - { - umode_t newmode = attr->ia_mode; - - info_mask |= DM_ATTRIBUTES; - - if (S_ISDIR(inode->i_mode)) { - newmode &= server->m.dir_mode; - } else { -#ifdef CONFIG_NCPFS_EXTRAS - if (server->m.flags & NCP_MOUNT_EXTRAS) { - /* any non-default execute bit set */ - if (newmode & ~server->m.file_mode & S_IXUGO) - info.attributes |= aSHARED | aSYSTEM; - /* read for group/world and not in default file_mode */ - else if (newmode & ~server->m.file_mode & S_IRUGO) - info.attributes |= aSHARED; - } else -#endif - newmode &= server->m.file_mode; - } - if (newmode & S_IWUGO) - info.attributes &= ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); - else - info.attributes |= (aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); - -#ifdef CONFIG_NCPFS_NFS_NS - if (ncp_is_nfs_extras(server, NCP_FINFO(inode)->volNumber)) { - result = ncp_modify_nfs_info(server, - NCP_FINFO(inode)->volNumber, - NCP_FINFO(inode)->dirEntNum, - attr->ia_mode, 0); - if (result != 0) - goto out; - info.attributes &= ~(aSHARED | aSYSTEM); - { - /* mark partial success */ - struct iattr tmpattr; - - tmpattr.ia_valid = ATTR_MODE; - tmpattr.ia_mode = attr->ia_mode; - - setattr_copy(inode, &tmpattr); - mark_inode_dirty(inode); - } - } -#endif - } -#endif - - /* Do SIZE before attributes, otherwise mtime together with size does not work... - */ - if ((attr->ia_valid & ATTR_SIZE) != 0) { - int written; - - ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size); - - if ((result = ncp_make_open(inode, O_WRONLY)) < 0) { - result = -EACCES; - goto out; - } - ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, - attr->ia_size, 0, "", &written); - - /* According to ndir, the changes only take effect after - closing the file */ - ncp_inode_close(inode); - result = ncp_make_closed(inode); - if (result) - goto out; - - if (attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - mark_inode_dirty(inode); - } - } - if ((attr->ia_valid & ATTR_CTIME) != 0) { - info_mask |= (DM_CREATE_TIME | DM_CREATE_DATE); - ncp_date_unix2dos(attr->ia_ctime.tv_sec, - &info.creationTime, &info.creationDate); - } - if ((attr->ia_valid & ATTR_MTIME) != 0) { - info_mask |= (DM_MODIFY_TIME | DM_MODIFY_DATE); - ncp_date_unix2dos(attr->ia_mtime.tv_sec, - &info.modifyTime, &info.modifyDate); - } - if ((attr->ia_valid & ATTR_ATIME) != 0) { - __le16 dummy; - info_mask |= (DM_LAST_ACCESS_DATE); - ncp_date_unix2dos(attr->ia_atime.tv_sec, - &dummy, &info.lastAccessDate); - } - if (info_mask != 0) { - result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode), - inode, info_mask, &info); - if (result != 0) { - if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) { - /* NetWare seems not to allow this. I - do not know why. So, just tell the - user everything went fine. This is - a terrible hack, but I do not know - how to do this correctly. */ - result = 0; - } else - goto out; - } -#ifdef CONFIG_NCPFS_STRONG - if ((!result) && (info_mask & DM_ATTRIBUTES)) - NCP_FINFO(inode)->nwattr = info.attributes; -#endif - } - if (result) - goto out; - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - -out: - if (result > 0) - result = -EACCES; - return result; -} - -static struct dentry *ncp_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_nodev(fs_type, flags, data, ncp_fill_super); -} - -static struct file_system_type ncp_fs_type = { - .owner = THIS_MODULE, - .name = "ncpfs", - .mount = ncp_mount, - .kill_sb = kill_anon_super, - .fs_flags = FS_BINARY_MOUNTDATA, -}; -MODULE_ALIAS_FS("ncpfs"); - -static int __init init_ncp_fs(void) -{ - int err; - ncp_dbg(1, "called\n"); - - err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&ncp_fs_type); - if (err) - goto out; - return 0; -out: - destroy_inodecache(); -out1: - return err; -} - -static void __exit exit_ncp_fs(void) -{ - ncp_dbg(1, "called\n"); - unregister_filesystem(&ncp_fs_type); - destroy_inodecache(); -} - -module_init(init_ncp_fs) -module_exit(exit_ncp_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c deleted file mode 100644 index d378b98cd7b6..000000000000 --- a/fs/ncpfs/ioctl.c +++ /dev/null @@ -1,923 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ioctl.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * Modified 1998, 1999 Wolfram Pienkoss for NLS - * - */ - -#include <linux/capability.h> -#include <linux/compat.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/ioctl.h> -#include <linux/time.h> -#include <linux/mm.h> -#include <linux/mount.h> -#include <linux/slab.h> -#include <linux/highuid.h> -#include <linux/vmalloc.h> -#include <linux/sched.h> -#include <linux/cred.h> - -#include <linux/uaccess.h> - -#include "ncp_fs.h" - -/* maximum limit for ncp_objectname_ioctl */ -#define NCP_OBJECT_NAME_MAX_LEN 4096 -/* maximum limit for ncp_privatedata_ioctl */ -#define NCP_PRIVATE_DATA_MAX_LEN 8192 -/* maximum negotiable packet size */ -#define NCP_PACKET_SIZE_INTERNAL 65536 - -static int -ncp_get_fs_info(struct ncp_server * server, struct inode *inode, - struct ncp_fs_info __user *arg) -{ - struct ncp_fs_info info; - - if (copy_from_user(&info, arg, sizeof(info))) - return -EFAULT; - - if (info.version != NCP_GET_FS_INFO_VERSION) { - ncp_dbg(1, "info.version invalid: %d\n", info.version); - return -EINVAL; - } - /* TODO: info.addr = server->m.serv_addr; */ - SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); - info.connection = server->connection; - info.buffer_size = server->buffer_size; - info.volume_number = NCP_FINFO(inode)->volNumber; - info.directory_id = NCP_FINFO(inode)->DosDirNum; - - if (copy_to_user(arg, &info, sizeof(info))) - return -EFAULT; - return 0; -} - -static int -ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, - struct ncp_fs_info_v2 __user * arg) -{ - struct ncp_fs_info_v2 info2; - - if (copy_from_user(&info2, arg, sizeof(info2))) - return -EFAULT; - - if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - ncp_dbg(1, "info.version invalid: %d\n", info2.version); - return -EINVAL; - } - info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); - info2.connection = server->connection; - info2.buffer_size = server->buffer_size; - info2.volume_number = NCP_FINFO(inode)->volNumber; - info2.directory_id = NCP_FINFO(inode)->DosDirNum; - info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; - - if (copy_to_user(arg, &info2, sizeof(info2))) - return -EFAULT; - return 0; -} - -#ifdef CONFIG_COMPAT -struct compat_ncp_objectname_ioctl -{ - s32 auth_type; - u32 object_name_len; - compat_caddr_t object_name; /* a userspace data, in most cases user name */ -}; - -struct compat_ncp_fs_info_v2 { - s32 version; - u32 mounted_uid; - u32 connection; - u32 buffer_size; - - u32 volume_number; - u32 directory_id; - - u32 dummy1; - u32 dummy2; - u32 dummy3; -}; - -struct compat_ncp_ioctl_request { - u32 function; - u32 size; - compat_caddr_t data; -}; - -struct compat_ncp_privatedata_ioctl -{ - u32 len; - compat_caddr_t data; /* ~1000 for NDS */ -}; - -#define NCP_IOC_GET_FS_INFO_V2_32 _IOWR('n', 4, struct compat_ncp_fs_info_v2) -#define NCP_IOC_NCPREQUEST_32 _IOR('n', 1, struct compat_ncp_ioctl_request) -#define NCP_IOC_GETOBJECTNAME_32 _IOWR('n', 9, struct compat_ncp_objectname_ioctl) -#define NCP_IOC_SETOBJECTNAME_32 _IOR('n', 9, struct compat_ncp_objectname_ioctl) -#define NCP_IOC_GETPRIVATEDATA_32 _IOWR('n', 10, struct compat_ncp_privatedata_ioctl) -#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl) - -static int -ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, - struct compat_ncp_fs_info_v2 __user * arg) -{ - struct compat_ncp_fs_info_v2 info2; - - if (copy_from_user(&info2, arg, sizeof(info2))) - return -EFAULT; - - if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - ncp_dbg(1, "info.version invalid: %d\n", info2.version); - return -EINVAL; - } - info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); - info2.connection = server->connection; - info2.buffer_size = server->buffer_size; - info2.volume_number = NCP_FINFO(inode)->volNumber; - info2.directory_id = NCP_FINFO(inode)->DosDirNum; - info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; - - if (copy_to_user(arg, &info2, sizeof(info2))) - return -EFAULT; - return 0; -} -#endif - -#define NCP_IOC_GETMOUNTUID16 _IOW('n', 2, u16) -#define NCP_IOC_GETMOUNTUID32 _IOW('n', 2, u32) -#define NCP_IOC_GETMOUNTUID64 _IOW('n', 2, u64) - -#ifdef CONFIG_NCPFS_NLS -/* Here we are select the iocharset and the codepage for NLS. - * Thanks Petr Vandrovec for idea and many hints. - */ -static int -ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) -{ - struct ncp_nls_ioctl user; - struct nls_table *codepage; - struct nls_table *iocharset; - struct nls_table *oldset_io; - struct nls_table *oldset_cp; - int utf8; - int err; - - if (copy_from_user(&user, arg, sizeof(user))) - return -EFAULT; - - codepage = NULL; - user.codepage[NCP_IOCSNAME_LEN] = 0; - if (!user.codepage[0] || !strcmp(user.codepage, "default")) - codepage = load_nls_default(); - else { - codepage = load_nls(user.codepage); - if (!codepage) { - return -EBADRQC; - } - } - - iocharset = NULL; - user.iocharset[NCP_IOCSNAME_LEN] = 0; - if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) { - iocharset = load_nls_default(); - utf8 = 0; - } else if (!strcmp(user.iocharset, "utf8")) { - iocharset = load_nls_default(); - utf8 = 1; - } else { - iocharset = load_nls(user.iocharset); - if (!iocharset) { - unload_nls(codepage); - return -EBADRQC; - } - utf8 = 0; - } - - mutex_lock(&server->root_setup_lock); - if (server->root_setuped) { - oldset_cp = codepage; - oldset_io = iocharset; - err = -EBUSY; - } else { - if (utf8) - NCP_SET_FLAG(server, NCP_FLAG_UTF8); - else - NCP_CLR_FLAG(server, NCP_FLAG_UTF8); - oldset_cp = server->nls_vol; - server->nls_vol = codepage; - oldset_io = server->nls_io; - server->nls_io = iocharset; - err = 0; - } - mutex_unlock(&server->root_setup_lock); - unload_nls(oldset_cp); - unload_nls(oldset_io); - - return err; -} - -static int -ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) -{ - struct ncp_nls_ioctl user; - int len; - - memset(&user, 0, sizeof(user)); - mutex_lock(&server->root_setup_lock); - if (server->nls_vol && server->nls_vol->charset) { - len = strlen(server->nls_vol->charset); - if (len > NCP_IOCSNAME_LEN) - len = NCP_IOCSNAME_LEN; - strncpy(user.codepage, server->nls_vol->charset, len); - user.codepage[len] = 0; - } - - if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) - strcpy(user.iocharset, "utf8"); - else if (server->nls_io && server->nls_io->charset) { - len = strlen(server->nls_io->charset); - if (len > NCP_IOCSNAME_LEN) - len = NCP_IOCSNAME_LEN; - strncpy(user.iocharset, server->nls_io->charset, len); - user.iocharset[len] = 0; - } - mutex_unlock(&server->root_setup_lock); - - if (copy_to_user(arg, &user, sizeof(user))) - return -EFAULT; - return 0; -} -#endif /* CONFIG_NCPFS_NLS */ - -static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg) -{ - struct ncp_server *server = NCP_SERVER(inode); - int result; - struct ncp_ioctl_request request; - char* bouncebuffer; - void __user *argp = (void __user *)arg; - - switch (cmd) { -#ifdef CONFIG_COMPAT - case NCP_IOC_NCPREQUEST_32: -#endif - case NCP_IOC_NCPREQUEST: -#ifdef CONFIG_COMPAT - if (cmd == NCP_IOC_NCPREQUEST_32) { - struct compat_ncp_ioctl_request request32; - if (copy_from_user(&request32, argp, sizeof(request32))) - return -EFAULT; - request.function = request32.function; - request.size = request32.size; - request.data = compat_ptr(request32.data); - } else -#endif - if (copy_from_user(&request, argp, sizeof(request))) - return -EFAULT; - - if ((request.function > 255) - || (request.size > - NCP_PACKET_SIZE - sizeof(struct ncp_request_header))) { - return -EINVAL; - } - bouncebuffer = vmalloc(NCP_PACKET_SIZE_INTERNAL); - if (!bouncebuffer) - return -ENOMEM; - if (copy_from_user(bouncebuffer, request.data, request.size)) { - vfree(bouncebuffer); - return -EFAULT; - } - ncp_lock_server(server); - - /* FIXME: We hack around in the server's structures - here to be able to use ncp_request */ - - server->has_subfunction = 0; - server->current_size = request.size; - memcpy(server->packet, bouncebuffer, request.size); - - result = ncp_request2(server, request.function, - bouncebuffer, NCP_PACKET_SIZE_INTERNAL); - if (result < 0) - result = -EIO; - else - result = server->reply_size; - ncp_unlock_server(server); - ncp_dbg(1, "copy %d bytes\n", result); - if (result >= 0) - if (copy_to_user(request.data, bouncebuffer, result)) - result = -EFAULT; - vfree(bouncebuffer); - return result; - - case NCP_IOC_CONN_LOGGED_IN: - - if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE)) - return -EINVAL; - mutex_lock(&server->root_setup_lock); - if (server->root_setuped) - result = -EBUSY; - else { - result = ncp_conn_logged_in(inode->i_sb); - if (result == 0) - server->root_setuped = 1; - } - mutex_unlock(&server->root_setup_lock); - return result; - - case NCP_IOC_GET_FS_INFO: - return ncp_get_fs_info(server, inode, argp); - - case NCP_IOC_GET_FS_INFO_V2: - return ncp_get_fs_info_v2(server, inode, argp); - -#ifdef CONFIG_COMPAT - case NCP_IOC_GET_FS_INFO_V2_32: - return ncp_get_compat_fs_info_v2(server, inode, argp); -#endif - /* we have too many combinations of CONFIG_COMPAT, - * CONFIG_64BIT and CONFIG_UID16, so just handle - * any of the possible ioctls */ - case NCP_IOC_GETMOUNTUID16: - { - u16 uid; - - SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); - if (put_user(uid, (u16 __user *)argp)) - return -EFAULT; - return 0; - } - case NCP_IOC_GETMOUNTUID32: - { - uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); - if (put_user(uid, (u32 __user *)argp)) - return -EFAULT; - return 0; - } - case NCP_IOC_GETMOUNTUID64: - { - uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); - if (put_user(uid, (u64 __user *)argp)) - return -EFAULT; - return 0; - } - case NCP_IOC_GETROOT: - { - struct ncp_setroot_ioctl sr; - - result = -EACCES; - mutex_lock(&server->root_setup_lock); - if (server->m.mounted_vol[0]) { - struct dentry* dentry = inode->i_sb->s_root; - - if (dentry) { - struct inode* s_inode = d_inode(dentry); - - if (s_inode) { - sr.volNumber = NCP_FINFO(s_inode)->volNumber; - sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; - sr.namespace = server->name_space[sr.volNumber]; - result = 0; - } else - ncp_dbg(1, "d_inode(s_root)==NULL\n"); - } else - ncp_dbg(1, "s_root==NULL\n"); - } else { - sr.volNumber = -1; - sr.namespace = 0; - sr.dirEntNum = 0; - result = 0; - } - mutex_unlock(&server->root_setup_lock); - if (!result && copy_to_user(argp, &sr, sizeof(sr))) - result = -EFAULT; - return result; - } - - case NCP_IOC_SETROOT: - { - struct ncp_setroot_ioctl sr; - __u32 vnum; - __le32 de; - __le32 dosde; - struct dentry* dentry; - - if (copy_from_user(&sr, argp, sizeof(sr))) - return -EFAULT; - mutex_lock(&server->root_setup_lock); - if (server->root_setuped) - result = -EBUSY; - else { - if (sr.volNumber < 0) { - server->m.mounted_vol[0] = 0; - vnum = NCP_NUMBER_OF_VOLUMES; - de = 0; - dosde = 0; - result = 0; - } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) { - result = -EINVAL; - } else if (ncp_mount_subdir(server, sr.volNumber, - sr.namespace, sr.dirEntNum, - &vnum, &de, &dosde)) { - result = -ENOENT; - } else - result = 0; - - if (result == 0) { - dentry = inode->i_sb->s_root; - if (dentry) { - struct inode* s_inode = d_inode(dentry); - - if (s_inode) { - NCP_FINFO(s_inode)->volNumber = vnum; - NCP_FINFO(s_inode)->dirEntNum = de; - NCP_FINFO(s_inode)->DosDirNum = dosde; - server->root_setuped = 1; - } else { - ncp_dbg(1, "d_inode(s_root)==NULL\n"); - result = -EIO; - } - } else { - ncp_dbg(1, "s_root==NULL\n"); - result = -EIO; - } - } - } - mutex_unlock(&server->root_setup_lock); - - return result; - } - -#ifdef CONFIG_NCPFS_PACKET_SIGNING - case NCP_IOC_SIGN_INIT: - { - struct ncp_sign_init sign; - - if (argp) - if (copy_from_user(&sign, argp, sizeof(sign))) - return -EFAULT; - ncp_lock_server(server); - mutex_lock(&server->rcv.creq_mutex); - if (argp) { - if (server->sign_wanted) { - memcpy(server->sign_root,sign.sign_root,8); - memcpy(server->sign_last,sign.sign_last,16); - server->sign_active = 1; - } - /* ignore when signatures not wanted */ - } else { - server->sign_active = 0; - } - mutex_unlock(&server->rcv.creq_mutex); - ncp_unlock_server(server); - return 0; - } - - case NCP_IOC_SIGN_WANTED: - { - int state; - - ncp_lock_server(server); - state = server->sign_wanted; - ncp_unlock_server(server); - if (put_user(state, (int __user *)argp)) - return -EFAULT; - return 0; - } - - case NCP_IOC_SET_SIGN_WANTED: - { - int newstate; - - /* get only low 8 bits... */ - if (get_user(newstate, (unsigned char __user *)argp)) - return -EFAULT; - result = 0; - ncp_lock_server(server); - if (server->sign_active) { - /* cannot turn signatures OFF when active */ - if (!newstate) - result = -EINVAL; - } else { - server->sign_wanted = newstate != 0; - } - ncp_unlock_server(server); - return result; - } - -#endif /* CONFIG_NCPFS_PACKET_SIGNING */ - -#ifdef CONFIG_NCPFS_IOCTL_LOCKING - case NCP_IOC_LOCKUNLOCK: - { - struct ncp_lock_ioctl rqdata; - - if (copy_from_user(&rqdata, argp, sizeof(rqdata))) - return -EFAULT; - if (rqdata.origin != 0) - return -EINVAL; - /* check for cmd */ - switch (rqdata.cmd) { - case NCP_LOCK_EX: - case NCP_LOCK_SH: - if (rqdata.timeout < 0) - return -EINVAL; - if (rqdata.timeout == 0) - rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; - else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT) - rqdata.timeout = NCP_LOCK_MAX_TIMEOUT; - break; - case NCP_LOCK_LOG: - rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; /* has no effect */ - case NCP_LOCK_CLEAR: - break; - default: - return -EINVAL; - } - /* locking needs both read and write access */ - if ((result = ncp_make_open(inode, O_RDWR)) != 0) - { - return result; - } - result = -EISDIR; - if (!S_ISREG(inode->i_mode)) - goto outrel; - if (rqdata.cmd == NCP_LOCK_CLEAR) - { - result = ncp_ClearPhysicalRecord(NCP_SERVER(inode), - NCP_FINFO(inode)->file_handle, - rqdata.offset, - rqdata.length); - if (result > 0) result = 0; /* no such lock */ - } - else - { - int lockcmd; - - switch (rqdata.cmd) - { - case NCP_LOCK_EX: lockcmd=1; break; - case NCP_LOCK_SH: lockcmd=3; break; - default: lockcmd=0; break; - } - result = ncp_LogPhysicalRecord(NCP_SERVER(inode), - NCP_FINFO(inode)->file_handle, - lockcmd, - rqdata.offset, - rqdata.length, - rqdata.timeout); - if (result > 0) result = -EAGAIN; - } -outrel: - ncp_inode_close(inode); - return result; - } -#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ - -#ifdef CONFIG_COMPAT - case NCP_IOC_GETOBJECTNAME_32: - { - struct compat_ncp_objectname_ioctl user; - size_t outl; - - if (copy_from_user(&user, argp, sizeof(user))) - return -EFAULT; - down_read(&server->auth_rwsem); - user.auth_type = server->auth.auth_type; - outl = user.object_name_len; - user.object_name_len = server->auth.object_name_len; - if (outl > user.object_name_len) - outl = user.object_name_len; - result = 0; - if (outl) { - if (copy_to_user(compat_ptr(user.object_name), - server->auth.object_name, - outl)) - result = -EFAULT; - } - up_read(&server->auth_rwsem); - if (!result && copy_to_user(argp, &user, sizeof(user))) - result = -EFAULT; - return result; - } -#endif - - case NCP_IOC_GETOBJECTNAME: - { - struct ncp_objectname_ioctl user; - size_t outl; - - if (copy_from_user(&user, argp, sizeof(user))) - return -EFAULT; - down_read(&server->auth_rwsem); - user.auth_type = server->auth.auth_type; - outl = user.object_name_len; - user.object_name_len = server->auth.object_name_len; - if (outl > user.object_name_len) - outl = user.object_name_len; - result = 0; - if (outl) { - if (copy_to_user(user.object_name, - server->auth.object_name, - outl)) - result = -EFAULT; - } - up_read(&server->auth_rwsem); - if (!result && copy_to_user(argp, &user, sizeof(user))) - result = -EFAULT; - return result; - } - -#ifdef CONFIG_COMPAT - case NCP_IOC_SETOBJECTNAME_32: -#endif - case NCP_IOC_SETOBJECTNAME: - { - struct ncp_objectname_ioctl user; - void* newname; - void* oldname; - size_t oldnamelen; - void* oldprivate; - size_t oldprivatelen; - -#ifdef CONFIG_COMPAT - if (cmd == NCP_IOC_SETOBJECTNAME_32) { - struct compat_ncp_objectname_ioctl user32; - if (copy_from_user(&user32, argp, sizeof(user32))) - return -EFAULT; - user.auth_type = user32.auth_type; - user.object_name_len = user32.object_name_len; - user.object_name = compat_ptr(user32.object_name); - } else -#endif - if (copy_from_user(&user, argp, sizeof(user))) - return -EFAULT; - - if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) - return -ENOMEM; - if (user.object_name_len) { - newname = memdup_user(user.object_name, - user.object_name_len); - if (IS_ERR(newname)) - return PTR_ERR(newname); - } else { - newname = NULL; - } - down_write(&server->auth_rwsem); - oldname = server->auth.object_name; - oldnamelen = server->auth.object_name_len; - oldprivate = server->priv.data; - oldprivatelen = server->priv.len; - server->auth.auth_type = user.auth_type; - server->auth.object_name_len = user.object_name_len; - server->auth.object_name = newname; - server->priv.len = 0; - server->priv.data = NULL; - up_write(&server->auth_rwsem); - kfree(oldprivate); - kfree(oldname); - return 0; - } - -#ifdef CONFIG_COMPAT - case NCP_IOC_GETPRIVATEDATA_32: -#endif - case NCP_IOC_GETPRIVATEDATA: - { - struct ncp_privatedata_ioctl user; - size_t outl; - -#ifdef CONFIG_COMPAT - if (cmd == NCP_IOC_GETPRIVATEDATA_32) { - struct compat_ncp_privatedata_ioctl user32; - if (copy_from_user(&user32, argp, sizeof(user32))) - return -EFAULT; - user.len = user32.len; - user.data = compat_ptr(user32.data); - } else -#endif - if (copy_from_user(&user, argp, sizeof(user))) - return -EFAULT; - - down_read(&server->auth_rwsem); - outl = user.len; - user.len = server->priv.len; - if (outl > user.len) outl = user.len; - result = 0; - if (outl) { - if (copy_to_user(user.data, - server->priv.data, - outl)) - result = -EFAULT; - } - up_read(&server->auth_rwsem); - if (result) - return result; -#ifdef CONFIG_COMPAT - if (cmd == NCP_IOC_GETPRIVATEDATA_32) { - struct compat_ncp_privatedata_ioctl user32; - user32.len = user.len; - user32.data = (unsigned long) user.data; - if (copy_to_user(argp, &user32, sizeof(user32))) - return -EFAULT; - } else -#endif - if (copy_to_user(argp, &user, sizeof(user))) - return -EFAULT; - - return 0; - } - -#ifdef CONFIG_COMPAT - case NCP_IOC_SETPRIVATEDATA_32: -#endif - case NCP_IOC_SETPRIVATEDATA: - { - struct ncp_privatedata_ioctl user; - void* new; - void* old; - size_t oldlen; - -#ifdef CONFIG_COMPAT - if (cmd == NCP_IOC_SETPRIVATEDATA_32) { - struct compat_ncp_privatedata_ioctl user32; - if (copy_from_user(&user32, argp, sizeof(user32))) - return -EFAULT; - user.len = user32.len; - user.data = compat_ptr(user32.data); - } else -#endif - if (copy_from_user(&user, argp, sizeof(user))) - return -EFAULT; - - if (user.len > NCP_PRIVATE_DATA_MAX_LEN) - return -ENOMEM; - if (user.len) { - new = memdup_user(user.data, user.len); - if (IS_ERR(new)) - return PTR_ERR(new); - } else { - new = NULL; - } - down_write(&server->auth_rwsem); - old = server->priv.data; - oldlen = server->priv.len; - server->priv.len = user.len; - server->priv.data = new; - up_write(&server->auth_rwsem); - kfree(old); - return 0; - } - -#ifdef CONFIG_NCPFS_NLS - case NCP_IOC_SETCHARSETS: - return ncp_set_charsets(server, argp); - - case NCP_IOC_GETCHARSETS: - return ncp_get_charsets(server, argp); - -#endif /* CONFIG_NCPFS_NLS */ - - case NCP_IOC_SETDENTRYTTL: - { - u_int32_t user; - - if (copy_from_user(&user, argp, sizeof(user))) - return -EFAULT; - /* 20 secs at most... */ - if (user > 20000) - return -EINVAL; - user = (user * HZ) / 1000; - atomic_set(&server->dentry_ttl, user); - return 0; - } - - case NCP_IOC_GETDENTRYTTL: - { - u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ; - if (copy_to_user(argp, &user, sizeof(user))) - return -EFAULT; - return 0; - } - - } - return -EINVAL; -} - -long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct ncp_server *server = NCP_SERVER(inode); - kuid_t uid = current_uid(); - int need_drop_write = 0; - long ret; - - switch (cmd) { - case NCP_IOC_SETCHARSETS: - case NCP_IOC_CONN_LOGGED_IN: - case NCP_IOC_SETROOT: - if (!capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto out; - } - break; - } - if (!uid_eq(server->m.mounted_uid, uid)) { - switch (cmd) { - /* - * Only mount owner can issue these ioctls. Information - * necessary to authenticate to other NDS servers are - * stored here. - */ - case NCP_IOC_GETOBJECTNAME: - case NCP_IOC_SETOBJECTNAME: - case NCP_IOC_GETPRIVATEDATA: - case NCP_IOC_SETPRIVATEDATA: -#ifdef CONFIG_COMPAT - case NCP_IOC_GETOBJECTNAME_32: - case NCP_IOC_SETOBJECTNAME_32: - case NCP_IOC_GETPRIVATEDATA_32: - case NCP_IOC_SETPRIVATEDATA_32: -#endif - ret = -EACCES; - goto out; - /* - * These require write access on the inode if user id - * does not match. Note that they do not write to the - * file... But old code did mnt_want_write, so I keep - * it as is. Of course not for mountpoint owner, as - * that breaks read-only mounts altogether as ncpmount - * needs working NCP_IOC_NCPREQUEST and - * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl, - * signinit, setsignwanted) should be probably restricted - * to owner only, or even more to CAP_SYS_ADMIN). - */ - case NCP_IOC_GET_FS_INFO: - case NCP_IOC_GET_FS_INFO_V2: - case NCP_IOC_NCPREQUEST: - case NCP_IOC_SETDENTRYTTL: - case NCP_IOC_SIGN_INIT: - case NCP_IOC_LOCKUNLOCK: - case NCP_IOC_SET_SIGN_WANTED: -#ifdef CONFIG_COMPAT - case NCP_IOC_GET_FS_INFO_V2_32: - case NCP_IOC_NCPREQUEST_32: -#endif - ret = mnt_want_write_file(filp); - if (ret) - goto out; - need_drop_write = 1; - ret = inode_permission(inode, MAY_WRITE); - if (ret) - goto outDropWrite; - break; - /* - * Read access required. - */ - case NCP_IOC_GETMOUNTUID16: - case NCP_IOC_GETMOUNTUID32: - case NCP_IOC_GETMOUNTUID64: - case NCP_IOC_GETROOT: - case NCP_IOC_SIGN_WANTED: - ret = inode_permission(inode, MAY_READ); - if (ret) - goto out; - break; - /* - * Anybody can read these. - */ - case NCP_IOC_GETCHARSETS: - case NCP_IOC_GETDENTRYTTL: - default: - /* Three codes below are protected by CAP_SYS_ADMIN above. */ - case NCP_IOC_SETCHARSETS: - case NCP_IOC_CONN_LOGGED_IN: - case NCP_IOC_SETROOT: - break; - } - } - ret = __ncp_ioctl(inode, cmd, arg); -outDropWrite: - if (need_drop_write) - mnt_drop_write_file(filp); -out: - return ret; -} - -#ifdef CONFIG_COMPAT -long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long ret; - - arg = (unsigned long) compat_ptr(arg); - ret = ncp_ioctl(file, cmd, arg); - return ret; -} -#endif diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c deleted file mode 100644 index a5c5cf2ff007..000000000000 --- a/fs/ncpfs/mmap.c +++ /dev/null @@ -1,125 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * mmap.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * - */ - -#include <linux/stat.h> -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/gfp.h> -#include <linux/mm.h> -#include <linux/shm.h> -#include <linux/errno.h> -#include <linux/mman.h> -#include <linux/string.h> -#include <linux/fcntl.h> -#include <linux/memcontrol.h> - -#include <linux/uaccess.h> - -#include "ncp_fs.h" - -/* - * Fill in the supplied page for mmap - * XXX: how are we excluding truncate/invalidate here? Maybe need to lock - * page? - */ -static int ncp_file_mmap_fault(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - char *pg_addr; - unsigned int already_read; - unsigned int count; - int bufsize; - int pos; /* XXX: loff_t ? */ - - /* - * ncpfs has nothing against high pages as long - * as recvmsg and memset works on it - */ - vmf->page = alloc_page(GFP_HIGHUSER); - if (!vmf->page) - return VM_FAULT_OOM; - pg_addr = kmap(vmf->page); - pos = vmf->pgoff << PAGE_SHIFT; - - count = PAGE_SIZE; - /* what we can read in one go */ - bufsize = NCP_SERVER(inode)->buffer_size; - - already_read = 0; - if (ncp_make_open(inode, O_RDONLY) >= 0) { - while (already_read < count) { - int read_this_time; - int to_read; - - to_read = bufsize - (pos % bufsize); - - to_read = min_t(unsigned int, to_read, count - already_read); - - if (ncp_read_kernel(NCP_SERVER(inode), - NCP_FINFO(inode)->file_handle, - pos, to_read, - pg_addr + already_read, - &read_this_time) != 0) { - read_this_time = 0; - } - pos += read_this_time; - already_read += read_this_time; - - if (read_this_time < to_read) { - break; - } - } - ncp_inode_close(inode); - - } - - if (already_read < PAGE_SIZE) - memset(pg_addr + already_read, 0, PAGE_SIZE - already_read); - flush_dcache_page(vmf->page); - kunmap(vmf->page); - - /* - * If I understand ncp_read_kernel() properly, the above always - * fetches from the network, here the analogue of disk. - * -- nyc - */ - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); - return VM_FAULT_MAJOR; -} - -static const struct vm_operations_struct ncp_file_mmap = -{ - .fault = ncp_file_mmap_fault, -}; - - -/* This is used for a general mmap of a ncp file */ -int ncp_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(file); - - ncp_dbg(1, "called\n"); - - if (!ncp_conn_valid(NCP_SERVER(inode))) - return -EIO; - - /* only PAGE_COW or read-only supported now */ - if (vma->vm_flags & VM_SHARED) - return -EINVAL; - /* we do not support files bigger than 4GB... We eventually - supports just 4GB... */ - if (vma_pages(vma) + vma->vm_pgoff - > (1U << (32 - PAGE_SHIFT))) - return -EFBIG; - - vma->vm_ops = &ncp_file_mmap; - file_accessed(file); - return 0; -} diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h deleted file mode 100644 index bdd262b6c198..000000000000 --- a/fs/ncpfs/ncp_fs.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/ncp_fs.h> -#include "ncp_fs_i.h" -#include "ncp_fs_sb.h" - -#undef NCPFS_PARANOIA -#ifdef NCPFS_PARANOIA -#define ncp_vdbg(fmt, ...) \ - pr_debug(fmt, ##__VA_ARGS__) -#else -#define ncp_vdbg(fmt, ...) \ -do { \ - if (0) \ - pr_debug(fmt, ##__VA_ARGS__); \ -} while (0) -#endif - -#ifndef DEBUG_NCP -#define DEBUG_NCP 0 -#endif - -#if DEBUG_NCP > 0 && !defined(DEBUG) -#define DEBUG -#endif - -#define ncp_dbg(level, fmt, ...) \ -do { \ - if (level <= DEBUG_NCP) \ - pr_debug(fmt, ##__VA_ARGS__); \ -} while (0) - -#define NCP_MAX_RPC_TIMEOUT (6*HZ) - - -struct ncp_entry_info { - struct nw_info_struct i; - ino_t ino; - int opened; - int access; - unsigned int volume; - __u8 file_handle[6]; -}; - -static inline struct ncp_server *NCP_SBP(const struct super_block *sb) -{ - return sb->s_fs_info; -} - -#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) -static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) -{ - return container_of(inode, struct ncp_inode_info, vfs_inode); -} - -/* linux/fs/ncpfs/inode.c */ -int ncp_notify_change(struct dentry *, struct iattr *); -struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); -void ncp_update_inode(struct inode *, struct ncp_entry_info *); -void ncp_update_inode2(struct inode *, struct ncp_entry_info *); - -/* linux/fs/ncpfs/dir.c */ -extern const struct inode_operations ncp_dir_inode_operations; -extern const struct file_operations ncp_dir_operations; -extern const struct dentry_operations ncp_dentry_operations; -int ncp_conn_logged_in(struct super_block *); -int ncp_date_dos2unix(__le16 time, __le16 date); -void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); - -/* linux/fs/ncpfs/ioctl.c */ -long ncp_ioctl(struct file *, unsigned int, unsigned long); -long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); - -/* linux/fs/ncpfs/sock.c */ -int ncp_request2(struct ncp_server *server, int function, - void* reply, int max_reply_size); -static inline int ncp_request(struct ncp_server *server, int function) { - return ncp_request2(server, function, server->packet, server->packet_size); -} -int ncp_connect(struct ncp_server *server); -int ncp_disconnect(struct ncp_server *server); -void ncp_lock_server(struct ncp_server *server); -void ncp_unlock_server(struct ncp_server *server); - -/* linux/fs/ncpfs/symlink.c */ -#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) -extern const struct address_space_operations ncp_symlink_aops; -int ncp_symlink(struct inode*, struct dentry*, const char*); -#endif - -/* linux/fs/ncpfs/file.c */ -extern const struct inode_operations ncp_file_inode_operations; -extern const struct file_operations ncp_file_operations; -int ncp_make_open(struct inode *, int); - -/* linux/fs/ncpfs/mmap.c */ -int ncp_mmap(struct file *, struct vm_area_struct *); - -/* linux/fs/ncpfs/ncplib_kernel.c */ -int ncp_make_closed(struct inode *); - -#include "ncplib_kernel.h" diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h deleted file mode 100644 index 3432bafb53a5..000000000000 --- a/fs/ncpfs/ncp_fs_i.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ncp_fs_i.h - * - * Copyright (C) 1995 Volker Lendecke - * - */ - -#ifndef _LINUX_NCP_FS_I -#define _LINUX_NCP_FS_I - -/* - * This is the ncpfs part of the inode structure. This must contain - * all the information we need to work with an inode after creation. - */ -struct ncp_inode_info { - __le32 dirEntNum; - __le32 DosDirNum; - __u8 volNumber; - __le32 nwattr; - struct mutex open_mutex; - atomic_t opened; - int access; - int flags; -#define NCPI_KLUDGE_SYMLINK 0x0001 -#define NCPI_DIR_CACHE 0x0002 - __u8 file_handle[6]; - struct inode vfs_inode; -}; - -#endif /* _LINUX_NCP_FS_I */ diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h deleted file mode 100644 index f06cde4adf71..000000000000 --- a/fs/ncpfs/ncp_fs_sb.h +++ /dev/null @@ -1,174 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ncp_fs_sb.h - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * - */ - -#ifndef _NCP_FS_SB -#define _NCP_FS_SB - -#include <linux/types.h> -#include <linux/ncp_mount.h> -#include <linux/net.h> -#include <linux/mutex.h> -#include <linux/backing-dev.h> -#include <linux/workqueue.h> - -#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ - -struct sock; - -struct ncp_mount_data_kernel { - unsigned long flags; /* NCP_MOUNT_* flags */ - unsigned int int_flags; /* internal flags */ -#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 - kuid_t mounted_uid; /* Who may umount() this filesystem? */ - struct pid *wdog_pid; /* Who cares for our watchdog packets? */ - unsigned int ncp_fd; /* The socket to the ncp port */ - unsigned int time_out; /* How long should I wait after - sending a NCP request? */ - unsigned int retry_count; /* And how often should I retry? */ - unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; - kuid_t uid; - kgid_t gid; - umode_t file_mode; - umode_t dir_mode; - int info_fd; -}; - -struct ncp_server { - struct rcu_head rcu; - struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of - interest for us later, so we store - it completely. */ - - __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; - - struct socket *ncp_sock;/* ncp socket */ - struct socket *info_sock; - - u8 sequence; - u8 task; - u16 connection; /* Remote connection number */ - - u8 completion; /* Status message from server */ - u8 conn_status; /* Bit 4 = 1 ==> Server going down, no - requests allowed anymore. - Bit 0 = 1 ==> Server is down. */ - - int buffer_size; /* Negotiated bufsize */ - - int reply_size; /* Size of last reply */ - - int packet_size; - unsigned char *packet; /* Here we prepare requests and - receive replies */ - unsigned char *txbuf; /* Storage for current request */ - unsigned char *rxbuf; /* Storage for reply to current request */ - - int lock; /* To prevent mismatch in protocols. */ - struct mutex mutex; - - int current_size; /* for packet preparation */ - int has_subfunction; - int ncp_reply_size; - - int root_setuped; - struct mutex root_setup_lock; - - /* info for packet signing */ - int sign_wanted; /* 1=Server needs signed packets */ - int sign_active; /* 0=don't do signing, 1=do */ - char sign_root[8]; /* generated from password and encr. key */ - char sign_last[16]; - - /* Authentication info: NDS or BINDERY, username */ - struct { - int auth_type; - size_t object_name_len; - void* object_name; - int object_type; - } auth; - /* Password info */ - struct { - size_t len; - void* data; - } priv; - struct rw_semaphore auth_rwsem; - - /* nls info: codepage for volume and charset for I/O */ - struct nls_table *nls_vol; - struct nls_table *nls_io; - - /* maximum age in jiffies */ - atomic_t dentry_ttl; - - /* miscellaneous */ - unsigned int flags; - - spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ - - void (*data_ready)(struct sock* sk); - void (*error_report)(struct sock* sk); - void (*write_space)(struct sock* sk); /* STREAM mode only */ - struct { - struct work_struct tq; /* STREAM/DGRAM: data/error ready */ - struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ - struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ - - unsigned int state; /* STREAM only: receiver state */ - struct { - __u32 magic __packed; - __u32 len __packed; - __u16 type __packed; - __u16 p1 __packed; - __u16 p2 __packed; - __u16 p3 __packed; - __u16 type2 __packed; - } buf; /* STREAM only: temporary buffer */ - unsigned char* ptr; /* STREAM only: pointer to data */ - size_t len; /* STREAM only: length of data to receive */ - } rcv; - struct { - struct list_head requests; /* STREAM only: queued requests */ - struct work_struct tq; /* STREAM only: transmitter ready */ - struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ - } tx; - struct timer_list timeout_tm; /* DGRAM only: timeout timer */ - struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ - int timeout_last; /* DGRAM only: current timeout length */ - int timeout_retries; /* DGRAM only: retries left */ - struct { - size_t len; - __u8 data[128]; - } unexpected_packet; -}; - -extern void ncp_tcp_rcv_proc(struct work_struct *work); -extern void ncp_tcp_tx_proc(struct work_struct *work); -extern void ncpdgram_rcv_proc(struct work_struct *work); -extern void ncpdgram_timeout_proc(struct work_struct *work); -extern void ncpdgram_timeout_call(struct timer_list *t); -extern void ncp_tcp_data_ready(struct sock* sk); -extern void ncp_tcp_write_space(struct sock* sk); -extern void ncp_tcp_error_report(struct sock* sk); - -#define NCP_FLAG_UTF8 1 - -#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) -#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) -#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) - -static inline int ncp_conn_valid(struct ncp_server *server) -{ - return ((server->conn_status & 0x11) == 0); -} - -static inline void ncp_invalidate_conn(struct ncp_server *server) -{ - server->conn_status |= 0x01; -} - -#endif diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c deleted file mode 100644 index 804adfebba2f..000000000000 --- a/fs/ncpfs/ncplib_kernel.c +++ /dev/null @@ -1,1322 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ncplib_kernel.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Modified for big endian by J.F. Chadima and David S. Miller - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * Modified 1999 Wolfram Pienkoss for NLS - * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include "ncp_fs.h" - -static inline void assert_server_locked(struct ncp_server *server) -{ - if (server->lock == 0) { - ncp_dbg(1, "server not locked!\n"); - } -} - -static void ncp_add_byte(struct ncp_server *server, __u8 x) -{ - assert_server_locked(server); - *(__u8 *) (&(server->packet[server->current_size])) = x; - server->current_size += 1; - return; -} - -static void ncp_add_word(struct ncp_server *server, __le16 x) -{ - assert_server_locked(server); - put_unaligned(x, (__le16 *) (&(server->packet[server->current_size]))); - server->current_size += 2; - return; -} - -static void ncp_add_be16(struct ncp_server *server, __u16 x) -{ - assert_server_locked(server); - put_unaligned(cpu_to_be16(x), (__be16 *) (&(server->packet[server->current_size]))); - server->current_size += 2; -} - -static void ncp_add_dword(struct ncp_server *server, __le32 x) -{ - assert_server_locked(server); - put_unaligned(x, (__le32 *) (&(server->packet[server->current_size]))); - server->current_size += 4; - return; -} - -static void ncp_add_be32(struct ncp_server *server, __u32 x) -{ - assert_server_locked(server); - put_unaligned(cpu_to_be32(x), (__be32 *)(&(server->packet[server->current_size]))); - server->current_size += 4; -} - -static inline void ncp_add_dword_lh(struct ncp_server *server, __u32 x) { - ncp_add_dword(server, cpu_to_le32(x)); -} - -static void ncp_add_mem(struct ncp_server *server, const void *source, int size) -{ - assert_server_locked(server); - memcpy(&(server->packet[server->current_size]), source, size); - server->current_size += size; - return; -} - -static void ncp_add_pstring(struct ncp_server *server, const char *s) -{ - int len = strlen(s); - assert_server_locked(server); - if (len > 255) { - ncp_dbg(1, "string too long: %s\n", s); - len = 255; - } - ncp_add_byte(server, len); - ncp_add_mem(server, s, len); - return; -} - -static inline void ncp_init_request(struct ncp_server *server) -{ - ncp_lock_server(server); - - server->current_size = sizeof(struct ncp_request_header); - server->has_subfunction = 0; -} - -static inline void ncp_init_request_s(struct ncp_server *server, int subfunction) -{ - ncp_lock_server(server); - - server->current_size = sizeof(struct ncp_request_header) + 2; - ncp_add_byte(server, subfunction); - - server->has_subfunction = 1; -} - -static inline char * -ncp_reply_data(struct ncp_server *server, int offset) -{ - return &(server->packet[sizeof(struct ncp_reply_header) + offset]); -} - -static inline u8 BVAL(const void *data) -{ - return *(const u8 *)data; -} - -static u8 ncp_reply_byte(struct ncp_server *server, int offset) -{ - return *(const u8 *)ncp_reply_data(server, offset); -} - -static inline u16 WVAL_LH(const void *data) -{ - return get_unaligned_le16(data); -} - -static u16 -ncp_reply_le16(struct ncp_server *server, int offset) -{ - return get_unaligned_le16(ncp_reply_data(server, offset)); -} - -static u16 -ncp_reply_be16(struct ncp_server *server, int offset) -{ - return get_unaligned_be16(ncp_reply_data(server, offset)); -} - -static inline u32 DVAL_LH(const void *data) -{ - return get_unaligned_le32(data); -} - -static __le32 -ncp_reply_dword(struct ncp_server *server, int offset) -{ - return get_unaligned((__le32 *)ncp_reply_data(server, offset)); -} - -static inline __u32 ncp_reply_dword_lh(struct ncp_server* server, int offset) { - return le32_to_cpu(ncp_reply_dword(server, offset)); -} - -int -ncp_negotiate_buffersize(struct ncp_server *server, int size, int *target) -{ - int result; - - ncp_init_request(server); - ncp_add_be16(server, size); - - if ((result = ncp_request(server, 33)) != 0) { - ncp_unlock_server(server); - return result; - } - *target = min_t(unsigned int, ncp_reply_be16(server, 0), size); - - ncp_unlock_server(server); - return 0; -} - - -/* options: - * bit 0 ipx checksum - * bit 1 packet signing - */ -int -ncp_negotiate_size_and_options(struct ncp_server *server, - int size, int options, int *ret_size, int *ret_options) { - int result; - - /* there is minimum */ - if (size < NCP_BLOCK_SIZE) size = NCP_BLOCK_SIZE; - - ncp_init_request(server); - ncp_add_be16(server, size); - ncp_add_byte(server, options); - - if ((result = ncp_request(server, 0x61)) != 0) - { - ncp_unlock_server(server); - return result; - } - - /* NCP over UDP returns 0 (!!!) */ - result = ncp_reply_be16(server, 0); - if (result >= NCP_BLOCK_SIZE) - size = min(result, size); - *ret_size = size; - *ret_options = ncp_reply_byte(server, 4); - - ncp_unlock_server(server); - return 0; -} - -int ncp_get_volume_info_with_number(struct ncp_server* server, - int n, struct ncp_volume_info* target) { - int result; - int len; - - ncp_init_request_s(server, 44); - ncp_add_byte(server, n); - - if ((result = ncp_request(server, 22)) != 0) { - goto out; - } - target->total_blocks = ncp_reply_dword_lh(server, 0); - target->free_blocks = ncp_reply_dword_lh(server, 4); - target->purgeable_blocks = ncp_reply_dword_lh(server, 8); - target->not_yet_purgeable_blocks = ncp_reply_dword_lh(server, 12); - target->total_dir_entries = ncp_reply_dword_lh(server, 16); - target->available_dir_entries = ncp_reply_dword_lh(server, 20); - target->sectors_per_block = ncp_reply_byte(server, 28); - - memset(&(target->volume_name), 0, sizeof(target->volume_name)); - - result = -EIO; - len = ncp_reply_byte(server, 29); - if (len > NCP_VOLNAME_LEN) { - ncp_dbg(1, "volume name too long: %d\n", len); - goto out; - } - memcpy(&(target->volume_name), ncp_reply_data(server, 30), len); - result = 0; -out: - ncp_unlock_server(server); - return result; -} - -int ncp_get_directory_info(struct ncp_server* server, __u8 n, - struct ncp_volume_info* target) { - int result; - int len; - - ncp_init_request_s(server, 45); - ncp_add_byte(server, n); - - if ((result = ncp_request(server, 22)) != 0) { - goto out; - } - target->total_blocks = ncp_reply_dword_lh(server, 0); - target->free_blocks = ncp_reply_dword_lh(server, 4); - target->purgeable_blocks = 0; - target->not_yet_purgeable_blocks = 0; - target->total_dir_entries = ncp_reply_dword_lh(server, 8); - target->available_dir_entries = ncp_reply_dword_lh(server, 12); - target->sectors_per_block = ncp_reply_byte(server, 20); - - memset(&(target->volume_name), 0, sizeof(target->volume_name)); - - result = -EIO; - len = ncp_reply_byte(server, 21); - if (len > NCP_VOLNAME_LEN) { - ncp_dbg(1, "volume name too long: %d\n", len); - goto out; - } - memcpy(&(target->volume_name), ncp_reply_data(server, 22), len); - result = 0; -out: - ncp_unlock_server(server); - return result; -} - -int -ncp_close_file(struct ncp_server *server, const char *file_id) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 0); - ncp_add_mem(server, file_id, 6); - - result = ncp_request(server, 66); - ncp_unlock_server(server); - return result; -} - -int -ncp_make_closed(struct inode *inode) -{ - int err; - - err = 0; - mutex_lock(&NCP_FINFO(inode)->open_mutex); - if (atomic_read(&NCP_FINFO(inode)->opened) == 1) { - atomic_set(&NCP_FINFO(inode)->opened, 0); - err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle); - - if (!err) - ncp_vdbg("volnum=%d, dirent=%u, error=%d\n", - NCP_FINFO(inode)->volNumber, - NCP_FINFO(inode)->dirEntNum, err); - } - mutex_unlock(&NCP_FINFO(inode)->open_mutex); - return err; -} - -static void ncp_add_handle_path(struct ncp_server *server, __u8 vol_num, - __le32 dir_base, int have_dir_base, - const char *path) -{ - ncp_add_byte(server, vol_num); - ncp_add_dword(server, dir_base); - if (have_dir_base != 0) { - ncp_add_byte(server, 1); /* dir_base */ - } else { - ncp_add_byte(server, 0xff); /* no handle */ - } - if (path != NULL) { - ncp_add_byte(server, 1); /* 1 component */ - ncp_add_pstring(server, path); - } else { - ncp_add_byte(server, 0); - } -} - -int ncp_dirhandle_alloc(struct ncp_server* server, __u8 volnum, __le32 dirent, - __u8* dirhandle) { - int result; - - ncp_init_request(server); - ncp_add_byte(server, 12); /* subfunction */ - ncp_add_byte(server, NW_NS_DOS); - ncp_add_byte(server, 0); - ncp_add_word(server, 0); - ncp_add_handle_path(server, volnum, dirent, 1, NULL); - if ((result = ncp_request(server, 87)) == 0) { - *dirhandle = ncp_reply_byte(server, 0); - } - ncp_unlock_server(server); - return result; -} - -int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) { - int result; - - ncp_init_request_s(server, 20); - ncp_add_byte(server, dirhandle); - result = ncp_request(server, 22); - ncp_unlock_server(server); - return result; -} - -void ncp_extract_file_info(const void *structure, struct nw_info_struct *target) -{ - const __u8 *name_len; - const int info_struct_size = offsetof(struct nw_info_struct, nameLen); - - memcpy(target, structure, info_struct_size); - name_len = structure + info_struct_size; - target->nameLen = *name_len; - memcpy(target->entryName, name_len + 1, *name_len); - target->entryName[*name_len] = '\0'; - target->volNumber = le32_to_cpu(target->volNumber); - return; -} - -#ifdef CONFIG_NCPFS_NFS_NS -static inline void ncp_extract_nfs_info(const unsigned char *structure, - struct nw_nfs_info *target) -{ - target->mode = DVAL_LH(structure); - target->rdev = DVAL_LH(structure + 8); -} -#endif - -int ncp_obtain_nfs_info(struct ncp_server *server, - struct nw_info_struct *target) - -{ - int result = 0; -#ifdef CONFIG_NCPFS_NFS_NS - __u32 volnum = target->volNumber; - - if (ncp_is_nfs_extras(server, volnum)) { - ncp_init_request(server); - ncp_add_byte(server, 19); /* subfunction */ - ncp_add_byte(server, server->name_space[volnum]); - ncp_add_byte(server, NW_NS_NFS); - ncp_add_byte(server, 0); - ncp_add_byte(server, volnum); - ncp_add_dword(server, target->dirEntNum); - /* We must retrieve both nlinks and rdev, otherwise some server versions - report zeroes instead of valid data */ - ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); - - if ((result = ncp_request(server, 87)) == 0) { - ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs); - ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n", - target->entryName, target->nfs.mode, - target->nfs.rdev); - } else { - target->nfs.mode = 0; - target->nfs.rdev = 0; - } - ncp_unlock_server(server); - - } else -#endif - { - target->nfs.mode = 0; - target->nfs.rdev = 0; - } - return result; -} - -/* - * Returns information for a (one-component) name relative to - * the specified directory. - */ -int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path, - struct nw_info_struct *target) -{ - __u8 volnum = NCP_FINFO(dir)->volNumber; - __le32 dirent = NCP_FINFO(dir)->dirEntNum; - int result; - - if (target == NULL) { - pr_err("%s: invalid call\n", __func__); - return -EINVAL; - } - ncp_init_request(server); - ncp_add_byte(server, 6); /* subfunction */ - ncp_add_byte(server, server->name_space[volnum]); - ncp_add_byte(server, server->name_space[volnum]); /* N.B. twice ?? */ - ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ - ncp_add_dword(server, RIM_ALL); - ncp_add_handle_path(server, volnum, dirent, 1, path); - - if ((result = ncp_request(server, 87)) != 0) - goto out; - ncp_extract_file_info(ncp_reply_data(server, 0), target); - ncp_unlock_server(server); - - result = ncp_obtain_nfs_info(server, target); - return result; - -out: - ncp_unlock_server(server); - return result; -} - -#ifdef CONFIG_NCPFS_NFS_NS -static int -ncp_obtain_DOS_dir_base(struct ncp_server *server, - __u8 ns, __u8 volnum, __le32 dirent, - const char *path, /* At most 1 component */ - __le32 *DOS_dir_base) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 6); /* subfunction */ - ncp_add_byte(server, ns); - ncp_add_byte(server, ns); - ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ - ncp_add_dword(server, RIM_DIRECTORY); - ncp_add_handle_path(server, volnum, dirent, 1, path); - - if ((result = ncp_request(server, 87)) == 0) - { - if (DOS_dir_base) *DOS_dir_base=ncp_reply_dword(server, 0x34); - } - ncp_unlock_server(server); - return result; -} -#endif /* CONFIG_NCPFS_NFS_NS */ - -static inline int -ncp_get_known_namespace(struct ncp_server *server, __u8 volume) -{ -#if defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) - int result; - __u8 *namespace; - __u16 no_namespaces; - - ncp_init_request(server); - ncp_add_byte(server, 24); /* Subfunction: Get Name Spaces Loaded */ - ncp_add_word(server, 0); - ncp_add_byte(server, volume); - - if ((result = ncp_request(server, 87)) != 0) { - ncp_unlock_server(server); - return NW_NS_DOS; /* not result ?? */ - } - - result = NW_NS_DOS; - no_namespaces = ncp_reply_le16(server, 0); - namespace = ncp_reply_data(server, 2); - - while (no_namespaces > 0) { - ncp_dbg(1, "found %d on %d\n", *namespace, volume); - -#ifdef CONFIG_NCPFS_NFS_NS - if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) - { - result = NW_NS_NFS; - break; - } -#endif /* CONFIG_NCPFS_NFS_NS */ -#ifdef CONFIG_NCPFS_OS2_NS - if ((*namespace == NW_NS_OS2) && !(server->m.flags&NCP_MOUNT_NO_OS2)) - { - result = NW_NS_OS2; - } -#endif /* CONFIG_NCPFS_OS2_NS */ - namespace += 1; - no_namespaces -= 1; - } - ncp_unlock_server(server); - return result; -#else /* neither OS2 nor NFS - only DOS */ - return NW_NS_DOS; -#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ -} - -int -ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns) -{ - int ns = ncp_get_known_namespace(server, volume); - - if (ret_ns) - *ret_ns = ns; - - ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]); - - if (server->name_space[volume] == ns) - return 0; - server->name_space[volume] = ns; - return 1; -} - -static int -ncp_ObtainSpecificDirBase(struct ncp_server *server, - __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, - const char *path, /* At most 1 component */ - __le32 *dirEntNum, __le32 *DosDirNum) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 6); /* subfunction */ - ncp_add_byte(server, nsSrc); - ncp_add_byte(server, nsDst); - ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ - ncp_add_dword(server, RIM_ALL); - ncp_add_handle_path(server, vol_num, dir_base, 1, path); - - if ((result = ncp_request(server, 87)) != 0) - { - ncp_unlock_server(server); - return result; - } - - if (dirEntNum) - *dirEntNum = ncp_reply_dword(server, 0x30); - if (DosDirNum) - *DosDirNum = ncp_reply_dword(server, 0x34); - ncp_unlock_server(server); - return 0; -} - -int -ncp_mount_subdir(struct ncp_server *server, - __u8 volNumber, __u8 srcNS, __le32 dirEntNum, - __u32* volume, __le32* newDirEnt, __le32* newDosEnt) -{ - int dstNS; - int result; - - ncp_update_known_namespace(server, volNumber, &dstNS); - if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, - dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) - { - return result; - } - *volume = volNumber; - server->m.mounted_vol[1] = 0; - server->m.mounted_vol[0] = 'X'; - return 0; -} - -int -ncp_get_volume_root(struct ncp_server *server, - const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent) -{ - int result; - - ncp_dbg(1, "looking up vol %s\n", volname); - - ncp_init_request(server); - ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */ - ncp_add_byte(server, 0); /* DOS namespace */ - ncp_add_byte(server, 0); /* reserved */ - ncp_add_byte(server, 0); /* reserved */ - ncp_add_byte(server, 0); /* reserved */ - - ncp_add_byte(server, 0); /* faked volume number */ - ncp_add_dword(server, 0); /* faked dir_base */ - ncp_add_byte(server, 0xff); /* Don't have a dir_base */ - ncp_add_byte(server, 1); /* 1 path component */ - ncp_add_pstring(server, volname); - - if ((result = ncp_request(server, 87)) != 0) { - ncp_unlock_server(server); - return result; - } - *dirent = *dosdirent = ncp_reply_dword(server, 4); - *volume = ncp_reply_byte(server, 8); - ncp_unlock_server(server); - return 0; -} - -int -ncp_lookup_volume(struct ncp_server *server, - const char *volname, struct nw_info_struct *target) -{ - int result; - - memset(target, 0, sizeof(*target)); - result = ncp_get_volume_root(server, volname, - &target->volNumber, &target->dirEntNum, &target->DosDirNum); - if (result) { - return result; - } - ncp_update_known_namespace(server, target->volNumber, NULL); - target->nameLen = strlen(volname); - memcpy(target->entryName, volname, target->nameLen+1); - target->attributes = aDIR; - /* set dates to Jan 1, 1986 00:00 */ - target->creationTime = target->modifyTime = cpu_to_le16(0x0000); - target->creationDate = target->modifyDate = target->lastAccessDate = cpu_to_le16(0x0C21); - target->nfs.mode = 0; - return 0; -} - -int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *server, - struct inode *dir, - const char *path, - __le32 info_mask, - const struct nw_modify_dos_info *info) -{ - __u8 volnum = NCP_FINFO(dir)->volNumber; - __le32 dirent = NCP_FINFO(dir)->dirEntNum; - int result; - - ncp_init_request(server); - ncp_add_byte(server, 7); /* subfunction */ - ncp_add_byte(server, server->name_space[volnum]); - ncp_add_byte(server, 0); /* reserved */ - ncp_add_word(server, cpu_to_le16(0x8006)); /* search attribs: all */ - - ncp_add_dword(server, info_mask); - ncp_add_mem(server, info, sizeof(*info)); - ncp_add_handle_path(server, volnum, dirent, 1, path); - - result = ncp_request(server, 87); - ncp_unlock_server(server); - return result; -} - -int ncp_modify_file_or_subdir_dos_info(struct ncp_server *server, - struct inode *dir, - __le32 info_mask, - const struct nw_modify_dos_info *info) -{ - return ncp_modify_file_or_subdir_dos_info_path(server, dir, NULL, - info_mask, info); -} - -#ifdef CONFIG_NCPFS_NFS_NS -int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent, - __u32 mode, __u32 rdev) - -{ - int result = 0; - - ncp_init_request(server); - if (server->name_space[volnum] == NW_NS_NFS) { - ncp_add_byte(server, 25); /* subfunction */ - ncp_add_byte(server, server->name_space[volnum]); - ncp_add_byte(server, NW_NS_NFS); - ncp_add_byte(server, volnum); - ncp_add_dword(server, dirent); - /* we must always operate on both nlinks and rdev, otherwise - rdev is not set */ - ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); - ncp_add_dword_lh(server, mode); - ncp_add_dword_lh(server, 1); /* nlinks */ - ncp_add_dword_lh(server, rdev); - result = ncp_request(server, 87); - } - ncp_unlock_server(server); - return result; -} -#endif - - -static int -ncp_DeleteNSEntry(struct ncp_server *server, - __u8 have_dir_base, __u8 volnum, __le32 dirent, - const char* name, __u8 ns, __le16 attr) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 8); /* subfunction */ - ncp_add_byte(server, ns); - ncp_add_byte(server, 0); /* reserved */ - ncp_add_word(server, attr); /* search attribs: all */ - ncp_add_handle_path(server, volnum, dirent, have_dir_base, name); - - result = ncp_request(server, 87); - ncp_unlock_server(server); - return result; -} - -int -ncp_del_file_or_subdir2(struct ncp_server *server, - struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - __u8 volnum; - __le32 dirent; - - if (!inode) { - return 0xFF; /* Any error */ - } - volnum = NCP_FINFO(inode)->volNumber; - dirent = NCP_FINFO(inode)->DosDirNum; - return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); -} - -int -ncp_del_file_or_subdir(struct ncp_server *server, - struct inode *dir, const char *name) -{ - __u8 volnum = NCP_FINFO(dir)->volNumber; - __le32 dirent = NCP_FINFO(dir)->dirEntNum; - int name_space; - - name_space = server->name_space[volnum]; -#ifdef CONFIG_NCPFS_NFS_NS - if (name_space == NW_NS_NFS) - { - int result; - - result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent); - if (result) return result; - name = NULL; - name_space = NW_NS_DOS; - } -#endif /* CONFIG_NCPFS_NFS_NS */ - return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006)); -} - -static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) -{ - __le16 *dest = (__le16 *) ret; - dest[1] = cpu_to_le16(v0); - dest[2] = cpu_to_le16(v1); - dest[0] = cpu_to_le16(v0 + 1); - return; -} - -/* If both dir and name are NULL, then in target there's already a - looked-up entry that wants to be opened. */ -int ncp_open_create_file_or_subdir(struct ncp_server *server, - struct inode *dir, const char *name, - int open_create_mode, - __le32 create_attributes, - __le16 desired_acc_rights, - struct ncp_entry_info *target) -{ - __le16 search_attribs = cpu_to_le16(0x0006); - __u8 volnum; - __le32 dirent; - int result; - - volnum = NCP_FINFO(dir)->volNumber; - dirent = NCP_FINFO(dir)->dirEntNum; - - if ((create_attributes & aDIR) != 0) { - search_attribs |= cpu_to_le16(0x8000); - } - ncp_init_request(server); - ncp_add_byte(server, 1); /* subfunction */ - ncp_add_byte(server, server->name_space[volnum]); - ncp_add_byte(server, open_create_mode); - ncp_add_word(server, search_attribs); - ncp_add_dword(server, RIM_ALL); - ncp_add_dword(server, create_attributes); - /* The desired acc rights seem to be the inherited rights mask - for directories */ - ncp_add_word(server, desired_acc_rights); - ncp_add_handle_path(server, volnum, dirent, 1, name); - - if ((result = ncp_request(server, 87)) != 0) - goto out; - if (!(create_attributes & aDIR)) - target->opened = 1; - - /* in target there's a new finfo to fill */ - ncp_extract_file_info(ncp_reply_data(server, 6), &(target->i)); - target->volume = target->i.volNumber; - ConvertToNWfromDWORD(ncp_reply_le16(server, 0), - ncp_reply_le16(server, 2), - target->file_handle); - - ncp_unlock_server(server); - - (void)ncp_obtain_nfs_info(server, &(target->i)); - return 0; - -out: - ncp_unlock_server(server); - return result; -} - -int -ncp_initialize_search(struct ncp_server *server, struct inode *dir, - struct nw_search_sequence *target) -{ - __u8 volnum = NCP_FINFO(dir)->volNumber; - __le32 dirent = NCP_FINFO(dir)->dirEntNum; - int result; - - ncp_init_request(server); - ncp_add_byte(server, 2); /* subfunction */ - ncp_add_byte(server, server->name_space[volnum]); - ncp_add_byte(server, 0); /* reserved */ - ncp_add_handle_path(server, volnum, dirent, 1, NULL); - - result = ncp_request(server, 87); - if (result) - goto out; - memcpy(target, ncp_reply_data(server, 0), sizeof(*target)); - -out: - ncp_unlock_server(server); - return result; -} - -int ncp_search_for_fileset(struct ncp_server *server, - struct nw_search_sequence *seq, - int* more, - int* cnt, - char* buffer, - size_t bufsize, - char** rbuf, - size_t* rsize) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 20); - ncp_add_byte(server, server->name_space[seq->volNumber]); - ncp_add_byte(server, 0); /* datastream */ - ncp_add_word(server, cpu_to_le16(0x8006)); - ncp_add_dword(server, RIM_ALL); - ncp_add_word(server, cpu_to_le16(32767)); /* max returned items */ - ncp_add_mem(server, seq, 9); -#ifdef CONFIG_NCPFS_NFS_NS - if (server->name_space[seq->volNumber] == NW_NS_NFS) { - ncp_add_byte(server, 0); /* 0 byte pattern */ - } else -#endif - { - ncp_add_byte(server, 2); /* 2 byte pattern */ - ncp_add_byte(server, 0xff); /* following is a wildcard */ - ncp_add_byte(server, '*'); - } - result = ncp_request2(server, 87, buffer, bufsize); - if (result) { - ncp_unlock_server(server); - return result; - } - if (server->ncp_reply_size < 12) { - ncp_unlock_server(server); - return 0xFF; - } - *rsize = server->ncp_reply_size - 12; - ncp_unlock_server(server); - buffer = buffer + sizeof(struct ncp_reply_header); - *rbuf = buffer + 12; - *cnt = WVAL_LH(buffer + 10); - *more = BVAL(buffer + 9); - memcpy(seq, buffer, 9); - return 0; -} - -static int -ncp_RenameNSEntry(struct ncp_server *server, - struct inode *old_dir, const char *old_name, __le16 old_type, - struct inode *new_dir, const char *new_name) -{ - int result = -EINVAL; - - if ((old_dir == NULL) || (old_name == NULL) || - (new_dir == NULL) || (new_name == NULL)) - goto out; - - ncp_init_request(server); - ncp_add_byte(server, 4); /* subfunction */ - ncp_add_byte(server, server->name_space[NCP_FINFO(old_dir)->volNumber]); - ncp_add_byte(server, 1); /* rename flag */ - ncp_add_word(server, old_type); /* search attributes */ - - /* source Handle Path */ - ncp_add_byte(server, NCP_FINFO(old_dir)->volNumber); - ncp_add_dword(server, NCP_FINFO(old_dir)->dirEntNum); - ncp_add_byte(server, 1); - ncp_add_byte(server, 1); /* 1 source component */ - - /* dest Handle Path */ - ncp_add_byte(server, NCP_FINFO(new_dir)->volNumber); - ncp_add_dword(server, NCP_FINFO(new_dir)->dirEntNum); - ncp_add_byte(server, 1); - ncp_add_byte(server, 1); /* 1 destination component */ - - /* source path string */ - ncp_add_pstring(server, old_name); - /* dest path string */ - ncp_add_pstring(server, new_name); - - result = ncp_request(server, 87); - ncp_unlock_server(server); -out: - return result; -} - -int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, - struct inode *old_dir, const char *old_name, - struct inode *new_dir, const char *new_name) -{ - int result; - __le16 old_type = cpu_to_le16(0x06); - -/* If somebody can do it atomic, call me... vandrove@vc.cvut.cz */ - result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, - new_dir, new_name); - if (result == 0xFF) /* File Not Found, try directory */ - { - old_type = cpu_to_le16(0x16); - result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, - new_dir, new_name); - } - if (result != 0x92) return result; /* All except NO_FILES_RENAMED */ - result = ncp_del_file_or_subdir(server, new_dir, new_name); - if (result != 0) return -EACCES; - result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, - new_dir, new_name); - return result; -} - - -/* We have to transfer to/from user space */ -int -ncp_read_kernel(struct ncp_server *server, const char *file_id, - __u32 offset, __u16 to_read, char *target, int *bytes_read) -{ - const char *source; - int result; - - ncp_init_request(server); - ncp_add_byte(server, 0); - ncp_add_mem(server, file_id, 6); - ncp_add_be32(server, offset); - ncp_add_be16(server, to_read); - - if ((result = ncp_request(server, 72)) != 0) { - goto out; - } - *bytes_read = ncp_reply_be16(server, 0); - source = ncp_reply_data(server, 2 + (offset & 1)); - - memcpy(target, source, *bytes_read); -out: - ncp_unlock_server(server); - return result; -} - -/* There is a problem... egrep and some other silly tools do: - x = mmap(NULL, MAP_PRIVATE, PROT_READ|PROT_WRITE, <ncpfs fd>, 32768); - read(<ncpfs fd>, x, 32768); - Now copying read result by copy_to_user causes pagefault. This pagefault - could not be handled because of server was locked due to read. So we have - to use temporary buffer. So ncp_unlock_server must be done before - copy_to_user (and for write, copy_from_user must be done before - ncp_init_request... same applies for send raw packet ioctl). Because of - file is normally read in bigger chunks, caller provides kmalloced - (vmalloced) chunk of memory with size >= to_read... - */ -int -ncp_read_bounce(struct ncp_server *server, const char *file_id, - __u32 offset, __u16 to_read, struct iov_iter *to, - int *bytes_read, void *bounce, __u32 bufsize) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 0); - ncp_add_mem(server, file_id, 6); - ncp_add_be32(server, offset); - ncp_add_be16(server, to_read); - result = ncp_request2(server, 72, bounce, bufsize); - ncp_unlock_server(server); - if (!result) { - int len = get_unaligned_be16((char *)bounce + - sizeof(struct ncp_reply_header)); - result = -EIO; - if (len <= to_read) { - char* source; - - source = (char*)bounce + - sizeof(struct ncp_reply_header) + 2 + - (offset & 1); - *bytes_read = len; - result = 0; - if (copy_to_iter(source, len, to) != len) - result = -EFAULT; - } - } - return result; -} - -int -ncp_write_kernel(struct ncp_server *server, const char *file_id, - __u32 offset, __u16 to_write, - const char *source, int *bytes_written) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 0); - ncp_add_mem(server, file_id, 6); - ncp_add_be32(server, offset); - ncp_add_be16(server, to_write); - ncp_add_mem(server, source, to_write); - - if ((result = ncp_request(server, 73)) == 0) - *bytes_written = to_write; - ncp_unlock_server(server); - return result; -} - -#ifdef CONFIG_NCPFS_IOCTL_LOCKING -int -ncp_LogPhysicalRecord(struct ncp_server *server, const char *file_id, - __u8 locktype, __u32 offset, __u32 length, __u16 timeout) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, locktype); - ncp_add_mem(server, file_id, 6); - ncp_add_be32(server, offset); - ncp_add_be32(server, length); - ncp_add_be16(server, timeout); - - if ((result = ncp_request(server, 0x1A)) != 0) - { - ncp_unlock_server(server); - return result; - } - ncp_unlock_server(server); - return 0; -} - -int -ncp_ClearPhysicalRecord(struct ncp_server *server, const char *file_id, - __u32 offset, __u32 length) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 0); /* who knows... lanalyzer says that */ - ncp_add_mem(server, file_id, 6); - ncp_add_be32(server, offset); - ncp_add_be32(server, length); - - if ((result = ncp_request(server, 0x1E)) != 0) - { - ncp_unlock_server(server); - return result; - } - ncp_unlock_server(server); - return 0; -} -#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ - -#ifdef CONFIG_NCPFS_NLS -/* This are the NLS conversion routines with inspirations and code parts - * from the vfat file system and hints from Petr Vandrovec. - */ - -int -ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, - const unsigned char *iname, unsigned int ilen, int cc) -{ - struct nls_table *in = server->nls_io; - struct nls_table *out = server->nls_vol; - unsigned char *vname_start; - unsigned char *vname_end; - const unsigned char *iname_end; - - iname_end = iname + ilen; - vname_start = vname; - vname_end = vname + *vlen - 1; - - while (iname < iname_end) { - int chl; - wchar_t ec; - - if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { - int k; - unicode_t u; - - k = utf8_to_utf32(iname, iname_end - iname, &u); - if (k < 0 || u > MAX_WCHAR_T) - return -EINVAL; - iname += k; - ec = u; - } else { - if (*iname == NCP_ESC) { - int k; - - if (iname_end - iname < 5) - goto nospec; - - ec = 0; - for (k = 1; k < 5; k++) { - unsigned char nc; - - nc = iname[k] - '0'; - if (nc >= 10) { - nc -= 'A' - '0' - 10; - if ((nc < 10) || (nc > 15)) { - goto nospec; - } - } - ec = (ec << 4) | nc; - } - iname += 5; - } else { -nospec:; - if ( (chl = in->char2uni(iname, iname_end - iname, &ec)) < 0) - return chl; - iname += chl; - } - } - - /* unitoupper should be here! */ - - chl = out->uni2char(ec, vname, vname_end - vname); - if (chl < 0) - return chl; - - /* this is wrong... */ - if (cc) { - int chi; - - for (chi = 0; chi < chl; chi++){ - vname[chi] = ncp_toupper(out, vname[chi]); - } - } - vname += chl; - } - - *vname = 0; - *vlen = vname - vname_start; - return 0; -} - -int -ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, - const unsigned char *vname, unsigned int vlen, int cc) -{ - struct nls_table *in = server->nls_vol; - struct nls_table *out = server->nls_io; - const unsigned char *vname_end; - unsigned char *iname_start; - unsigned char *iname_end; - unsigned char *vname_cc; - int err; - - vname_cc = NULL; - - if (cc) { - int i; - - /* this is wrong! */ - vname_cc = kmalloc(vlen, GFP_KERNEL); - if (!vname_cc) - return -ENOMEM; - for (i = 0; i < vlen; i++) - vname_cc[i] = ncp_tolower(in, vname[i]); - vname = vname_cc; - } - - iname_start = iname; - iname_end = iname + *ilen - 1; - vname_end = vname + vlen; - - while (vname < vname_end) { - wchar_t ec; - int chl; - - if ( (chl = in->char2uni(vname, vname_end - vname, &ec)) < 0) { - err = chl; - goto quit; - } - vname += chl; - - /* unitolower should be here! */ - - if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { - int k; - - k = utf32_to_utf8(ec, iname, iname_end - iname); - if (k < 0) { - err = -ENAMETOOLONG; - goto quit; - } - iname += k; - } else { - if ( (chl = out->uni2char(ec, iname, iname_end - iname)) >= 0) { - iname += chl; - } else { - int k; - - if (iname_end - iname < 5) { - err = -ENAMETOOLONG; - goto quit; - } - *iname = NCP_ESC; - for (k = 4; k > 0; k--) { - unsigned char v; - - v = (ec & 0xF) + '0'; - if (v > '9') { - v += 'A' - '9' - 1; - } - iname[k] = v; - ec >>= 4; - } - iname += 5; - } - } - } - - *iname = 0; - *ilen = iname - iname_start; - err = 0; -quit:; - if (cc) - kfree(vname_cc); - return err; -} - -#else - -int -ncp__io2vol(unsigned char *vname, unsigned int *vlen, - const unsigned char *iname, unsigned int ilen, int cc) -{ - int i; - - if (*vlen <= ilen) - return -ENAMETOOLONG; - - if (cc) - for (i = 0; i < ilen; i++) { - *vname = toupper(*iname); - vname++; - iname++; - } - else { - memmove(vname, iname, ilen); - vname += ilen; - } - - *vlen = ilen; - *vname = 0; - return 0; -} - -int -ncp__vol2io(unsigned char *iname, unsigned int *ilen, - const unsigned char *vname, unsigned int vlen, int cc) -{ - int i; - - if (*ilen <= vlen) - return -ENAMETOOLONG; - - if (cc) - for (i = 0; i < vlen; i++) { - *iname = tolower(*vname); - iname++; - vname++; - } - else { - memmove(iname, vname, vlen); - iname += vlen; - } - - *ilen = vlen; - *iname = 0; - return 0; -} - -#endif diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h deleted file mode 100644 index aaae8aa9bf7d..000000000000 --- a/fs/ncpfs/ncplib_kernel.h +++ /dev/null @@ -1,215 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ncplib_kernel.h - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Modified for big endian by J.F. Chadima and David S. Miller - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * Modified 1998, 1999 Wolfram Pienkoss for NLS - * Modified 1999 Wolfram Pienkoss for directory caching - * - */ - -#ifndef _NCPLIB_H -#define _NCPLIB_H - - -#include <linux/fs.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/pagemap.h> - -#include <linux/uaccess.h> -#include <asm/byteorder.h> -#include <asm/unaligned.h> -#include <asm/string.h> - -#ifdef CONFIG_NCPFS_NLS -#include <linux/nls.h> -#else -#include <linux/ctype.h> -#endif /* CONFIG_NCPFS_NLS */ - -#define NCP_MIN_SYMLINK_SIZE 8 -#define NCP_MAX_SYMLINK_SIZE 512 - -#define NCP_BLOCK_SHIFT 9 -#define NCP_BLOCK_SIZE (1 << (NCP_BLOCK_SHIFT)) - -int ncp_negotiate_buffersize(struct ncp_server *, int, int *); -int ncp_negotiate_size_and_options(struct ncp_server *server, int size, - int options, int *ret_size, int *ret_options); - -int ncp_get_volume_info_with_number(struct ncp_server* server, int n, - struct ncp_volume_info *target); - -int ncp_get_directory_info(struct ncp_server* server, __u8 dirhandle, - struct ncp_volume_info* target); - -int ncp_close_file(struct ncp_server *, const char *); -static inline int ncp_read_bounce_size(__u32 size) { - return sizeof(struct ncp_reply_header) + 2 + 2 + size + 8; -}; -int ncp_read_bounce(struct ncp_server *, const char *, __u32, __u16, - struct iov_iter *, int *, void *bounce, __u32 bouncelen); -int ncp_read_kernel(struct ncp_server *, const char *, __u32, __u16, - char *, int *); -int ncp_write_kernel(struct ncp_server *, const char *, __u32, __u16, - const char *, int *); - -static inline void ncp_inode_close(struct inode *inode) { - atomic_dec(&NCP_FINFO(inode)->opened); -} - -void ncp_extract_file_info(const void* src, struct nw_info_struct* target); -int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *, - struct nw_info_struct *target); -int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); -int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns); -int ncp_get_volume_root(struct ncp_server *server, const char *volname, - __u32 *volume, __le32 *dirent, __le32 *dosdirent); -int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); -int ncp_modify_file_or_subdir_dos_info(struct ncp_server *, struct inode *, - __le32, const struct nw_modify_dos_info *info); -int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *, struct inode *, - const char* path, __le32, const struct nw_modify_dos_info *info); -int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent, - __u32 mode, __u32 rdev); - -int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); -int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *); -int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *, - int, __le32, __le16, struct ncp_entry_info *); - -int ncp_initialize_search(struct ncp_server *, struct inode *, - struct nw_search_sequence *target); -int ncp_search_for_fileset(struct ncp_server *server, - struct nw_search_sequence *seq, - int* more, int* cnt, - char* buffer, size_t bufsize, - char** rbuf, size_t* rsize); - -int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, - struct inode *, const char *, struct inode *, const char *); - - -int -ncp_LogPhysicalRecord(struct ncp_server *server, - const char *file_id, __u8 locktype, - __u32 offset, __u32 length, __u16 timeout); - -#ifdef CONFIG_NCPFS_IOCTL_LOCKING -int -ncp_ClearPhysicalRecord(struct ncp_server *server, - const char *file_id, - __u32 offset, __u32 length); -#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ - -int -ncp_mount_subdir(struct ncp_server *, __u8, __u8, __le32, - __u32* volume, __le32* dirent, __le32* dosdirent); -int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirhandle); -int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle); - -int ncp_create_new(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, __le32 attributes); - -static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) { -#ifdef CONFIG_NCPFS_NFS_NS - return (server->m.flags & NCP_MOUNT_NFS_EXTRAS) && - (server->name_space[volnum] == NW_NS_NFS); -#else - return 0; -#endif -} - -#ifdef CONFIG_NCPFS_NLS - -int ncp__io2vol(struct ncp_server *, unsigned char *, unsigned int *, - const unsigned char *, unsigned int, int); -int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *, - const unsigned char *, unsigned int, int); - -#define NCP_ESC ':' -#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io) -#define ncp_tolower(t, c) nls_tolower(t, c) -#define ncp_toupper(t, c) nls_toupper(t, c) -#define ncp_strnicmp(t, s1, s2, len) \ - nls_strnicmp(t, s1, s2, len) -#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(S,m,i,n,k,U) -#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(S,m,i,n,k,U) - -#else - -int ncp__io2vol(unsigned char *, unsigned int *, - const unsigned char *, unsigned int, int); -int ncp__vol2io(unsigned char *, unsigned int *, - const unsigned char *, unsigned int, int); - -#define NCP_IO_TABLE(sb) NULL -#define ncp_tolower(t, c) tolower(c) -#define ncp_toupper(t, c) toupper(c) -#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) -#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) - - -static inline int ncp_strnicmp(const struct nls_table *t, - const unsigned char *s1, const unsigned char *s2, int len) -{ - while (len--) { - if (tolower(*s1++) != tolower(*s2++)) - return 1; - } - - return 0; -} - -#endif /* CONFIG_NCPFS_NLS */ - -#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) -#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl) -#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) - -static inline void -ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) -{ - dentry->d_time = jiffies - NCP_MAX_AGE(server); -} - -static inline void -ncp_new_dentry(struct dentry* dentry) -{ - dentry->d_time = jiffies; -} - -struct ncp_cache_head { - time_t mtime; - unsigned long time; /* cache age */ - unsigned long end; /* last valid fpos in cache */ - int eof; -}; - -#define NCP_DIRCACHE_SIZE ((int)(PAGE_SIZE/sizeof(struct dentry *))) -union ncp_dir_cache { - struct ncp_cache_head head; - struct dentry *dentry[NCP_DIRCACHE_SIZE]; -}; - -#define NCP_FIRSTCACHE_SIZE ((int)((NCP_DIRCACHE_SIZE * \ - sizeof(struct dentry *) - sizeof(struct ncp_cache_head)) / \ - sizeof(struct dentry *))) - -#define NCP_DIRCACHE_START (NCP_DIRCACHE_SIZE - NCP_FIRSTCACHE_SIZE) - -struct ncp_cache_control { - struct ncp_cache_head head; - struct page *page; - union ncp_dir_cache *cache; - unsigned long fpos, ofs; - int filled, valid, idx; -}; - -#endif /* _NCPLIB_H */ diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c deleted file mode 100644 index 8085b1a3ba47..000000000000 --- a/fs/ncpfs/ncpsign_kernel.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ncpsign_kernel.c - * - * Arne de Bruijn (arne@knoware.nl), 1997 - * - */ - - -#ifdef CONFIG_NCPFS_PACKET_SIGNING - -#include <linux/string.h> -#include <linux/ncp.h> -#include <linux/bitops.h> -#include "ncp_fs.h" -#include "ncpsign_kernel.h" - -/* i386: 32-bit, little endian, handles mis-alignment */ -#ifdef __i386__ -#define GET_LE32(p) (*(const int *)(p)) -#define PUT_LE32(p,v) { *(int *)(p)=v; } -#else -/* from include/ncplib.h */ -#define BVAL(buf,pos) (((const __u8 *)(buf))[pos]) -#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) -#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val)) - -static inline __u16 -WVAL_LH(const __u8 * buf, int pos) -{ - return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; -} -static inline __u32 -DVAL_LH(const __u8 * buf, int pos) -{ - return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; -} -static inline void -WSET_LH(__u8 * buf, int pos, __u16 val) -{ - BSET(buf, pos, val & 0xff); - BSET(buf, pos + 1, val >> 8); -} -static inline void -DSET_LH(__u8 * buf, int pos, __u32 val) -{ - WSET_LH(buf, pos, val & 0xffff); - WSET_LH(buf, pos + 2, val >> 16); -} - -#define GET_LE32(p) DVAL_LH(p,0) -#define PUT_LE32(p,v) DSET_LH(p,0,v) -#endif - -static void nwsign(char *r_data1, char *r_data2, char *outdata) { - int i; - unsigned int w0,w1,w2,w3; - static int rbit[4]={0, 2, 1, 3}; -#ifdef __i386__ - unsigned int *data2=(unsigned int *)r_data2; -#else - unsigned int data2[16]; - for (i=0;i<16;i++) - data2[i]=GET_LE32(r_data2+(i<<2)); -#endif - w0=GET_LE32(r_data1); - w1=GET_LE32(r_data1+4); - w2=GET_LE32(r_data1+8); - w3=GET_LE32(r_data1+12); - for (i=0;i<16;i+=4) { - w0=rol32(w0 + ((w1 & w2) | ((~w1) & w3)) + data2[i+0],3); - w3=rol32(w3 + ((w0 & w1) | ((~w0) & w2)) + data2[i+1],7); - w2=rol32(w2 + ((w3 & w0) | ((~w3) & w1)) + data2[i+2],11); - w1=rol32(w1 + ((w2 & w3) | ((~w2) & w0)) + data2[i+3],19); - } - for (i=0;i<4;i++) { - w0=rol32(w0 + (((w2 | w3) & w1) | (w2 & w3)) + 0x5a827999 + data2[i+0],3); - w3=rol32(w3 + (((w1 | w2) & w0) | (w1 & w2)) + 0x5a827999 + data2[i+4],5); - w2=rol32(w2 + (((w0 | w1) & w3) | (w0 & w1)) + 0x5a827999 + data2[i+8],9); - w1=rol32(w1 + (((w3 | w0) & w2) | (w3 & w0)) + 0x5a827999 + data2[i+12],13); - } - for (i=0;i<4;i++) { - w0=rol32(w0 + ((w1 ^ w2) ^ w3) + 0x6ed9eba1 + data2[rbit[i]+0],3); - w3=rol32(w3 + ((w0 ^ w1) ^ w2) + 0x6ed9eba1 + data2[rbit[i]+8],9); - w2=rol32(w2 + ((w3 ^ w0) ^ w1) + 0x6ed9eba1 + data2[rbit[i]+4],11); - w1=rol32(w1 + ((w2 ^ w3) ^ w0) + 0x6ed9eba1 + data2[rbit[i]+12],15); - } - PUT_LE32(outdata,(w0+GET_LE32(r_data1)) & 0xffffffff); - PUT_LE32(outdata+4,(w1+GET_LE32(r_data1+4)) & 0xffffffff); - PUT_LE32(outdata+8,(w2+GET_LE32(r_data1+8)) & 0xffffffff); - PUT_LE32(outdata+12,(w3+GET_LE32(r_data1+12)) & 0xffffffff); -} - -/* Make a signature for the current packet and add it at the end of the */ -/* packet. */ -void __sign_packet(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, void *sign_buff) { - unsigned char data[64]; - - memcpy(data, server->sign_root, 8); - *(__u32*)(data + 8) = totalsize; - if (size < 52) { - memcpy(data + 12, packet, size); - memset(data + 12 + size, 0, 52 - size); - } else { - memcpy(data + 12, packet, 52); - } - nwsign(server->sign_last, data, server->sign_last); - memcpy(sign_buff, server->sign_last, 8); -} - -int sign_verify_reply(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, const void *sign_buff) { - unsigned char data[64]; - unsigned char hash[16]; - - memcpy(data, server->sign_root, 8); - *(__u32*)(data + 8) = totalsize; - if (size < 52) { - memcpy(data + 12, packet, size); - memset(data + 12 + size, 0, 52 - size); - } else { - memcpy(data + 12, packet, 52); - } - nwsign(server->sign_last, data, hash); - return memcmp(sign_buff, hash, 8); -} - -#endif /* CONFIG_NCPFS_PACKET_SIGNING */ - diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h deleted file mode 100644 index 57ff0a0650b8..000000000000 --- a/fs/ncpfs/ncpsign_kernel.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * ncpsign_kernel.h - * - * Arne de Bruijn (arne@knoware.nl), 1997 - * - */ - -#ifndef _NCPSIGN_KERNEL_H -#define _NCPSIGN_KERNEL_H - -#ifdef CONFIG_NCPFS_PACKET_SIGNING -void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); -int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); -#endif - -static inline size_t sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff) { -#ifdef CONFIG_NCPFS_PACKET_SIGNING - if (server->sign_active) { - __sign_packet(server, data, size, totalsize, sign_buff); - return 8; - } -#endif - return 0; -} - -#endif diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c deleted file mode 100644 index efb176b1751a..000000000000 --- a/fs/ncpfs/sock.c +++ /dev/null @@ -1,854 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/ncpfs/sock.c - * - * Copyright (C) 1992, 1993 Rick Sladkey - * - * Modified 1995, 1996 by Volker Lendecke to be usable for ncp - * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/socket.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/sched/signal.h> -#include <linux/uaccess.h> -#include <linux/in.h> -#include <linux/net.h> -#include <linux/mm.h> -#include <linux/netdevice.h> -#include <linux/signal.h> -#include <linux/slab.h> -#include <net/scm.h> -#include <net/sock.h> -#include <linux/ipx.h> -#include <linux/poll.h> -#include <linux/file.h> - -#include "ncp_fs.h" - -#include "ncpsign_kernel.h" - -static int _recv(struct socket *sock, void *buf, int size, unsigned flags) -{ - struct msghdr msg = {NULL, }; - struct kvec iov = {buf, size}; - return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); -} - -static int _send(struct socket *sock, const void *buff, int len) -{ - struct msghdr msg = { .msg_flags = 0 }; - struct kvec vec = {.iov_base = (void *)buff, .iov_len = len}; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &vec, 1, len); - return sock_sendmsg(sock, &msg); -} - -struct ncp_request_reply { - struct list_head req; - wait_queue_head_t wq; - atomic_t refs; - unsigned char* reply_buf; - size_t datalen; - int result; - enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE, RQ_ABANDONED } status; - struct iov_iter from; - struct kvec tx_iov[3]; - u_int16_t tx_type; - u_int32_t sign[6]; -}; - -static inline struct ncp_request_reply* ncp_alloc_req(void) -{ - struct ncp_request_reply *req; - - req = kmalloc(sizeof(struct ncp_request_reply), GFP_KERNEL); - if (!req) - return NULL; - - init_waitqueue_head(&req->wq); - atomic_set(&req->refs, (1)); - req->status = RQ_IDLE; - - return req; -} - -static void ncp_req_get(struct ncp_request_reply *req) -{ - atomic_inc(&req->refs); -} - -static void ncp_req_put(struct ncp_request_reply *req) -{ - if (atomic_dec_and_test(&req->refs)) - kfree(req); -} - -void ncp_tcp_data_ready(struct sock *sk) -{ - struct ncp_server *server = sk->sk_user_data; - - server->data_ready(sk); - schedule_work(&server->rcv.tq); -} - -void ncp_tcp_error_report(struct sock *sk) -{ - struct ncp_server *server = sk->sk_user_data; - - server->error_report(sk); - schedule_work(&server->rcv.tq); -} - -void ncp_tcp_write_space(struct sock *sk) -{ - struct ncp_server *server = sk->sk_user_data; - - /* We do not need any locking: we first set tx.creq, and then we do sendmsg, - not vice versa... */ - server->write_space(sk); - if (server->tx.creq) - schedule_work(&server->tx.tq); -} - -void ncpdgram_timeout_call(struct timer_list *t) -{ - struct ncp_server *server = from_timer(server, t, timeout_tm); - - schedule_work(&server->timeout_tq); -} - -static inline void ncp_finish_request(struct ncp_server *server, struct ncp_request_reply *req, int result) -{ - req->result = result; - if (req->status != RQ_ABANDONED) - memcpy(req->reply_buf, server->rxbuf, req->datalen); - req->status = RQ_DONE; - wake_up_all(&req->wq); - ncp_req_put(req); -} - -static void __abort_ncp_connection(struct ncp_server *server) -{ - struct ncp_request_reply *req; - - ncp_invalidate_conn(server); - del_timer(&server->timeout_tm); - while (!list_empty(&server->tx.requests)) { - req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); - - list_del_init(&req->req); - ncp_finish_request(server, req, -EIO); - } - req = server->rcv.creq; - if (req) { - server->rcv.creq = NULL; - ncp_finish_request(server, req, -EIO); - server->rcv.ptr = NULL; - server->rcv.state = 0; - } - req = server->tx.creq; - if (req) { - server->tx.creq = NULL; - ncp_finish_request(server, req, -EIO); - } -} - -static inline int get_conn_number(struct ncp_reply_header *rp) -{ - return rp->conn_low | (rp->conn_high << 8); -} - -static inline void __ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) -{ - /* If req is done, we got signal, but we also received answer... */ - switch (req->status) { - case RQ_IDLE: - case RQ_DONE: - break; - case RQ_QUEUED: - list_del_init(&req->req); - ncp_finish_request(server, req, err); - break; - case RQ_INPROGRESS: - req->status = RQ_ABANDONED; - break; - case RQ_ABANDONED: - break; - } -} - -static inline void ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) -{ - mutex_lock(&server->rcv.creq_mutex); - __ncp_abort_request(server, req, err); - mutex_unlock(&server->rcv.creq_mutex); -} - -static inline void __ncptcp_abort(struct ncp_server *server) -{ - __abort_ncp_connection(server); -} - -static int ncpdgram_send(struct socket *sock, struct ncp_request_reply *req) -{ - struct msghdr msg = { .msg_iter = req->from, .msg_flags = MSG_DONTWAIT }; - return sock_sendmsg(sock, &msg); -} - -static void __ncptcp_try_send(struct ncp_server *server) -{ - struct ncp_request_reply *rq; - struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT }; - int result; - - rq = server->tx.creq; - if (!rq) - return; - - msg.msg_iter = rq->from; - result = sock_sendmsg(server->ncp_sock, &msg); - - if (result == -EAGAIN) - return; - - if (result < 0) { - pr_err("tcp: Send failed: %d\n", result); - __ncp_abort_request(server, rq, result); - return; - } - if (!msg_data_left(&msg)) { - server->rcv.creq = rq; - server->tx.creq = NULL; - return; - } - rq->from = msg.msg_iter; -} - -static inline void ncp_init_header(struct ncp_server *server, struct ncp_request_reply *req, struct ncp_request_header *h) -{ - req->status = RQ_INPROGRESS; - h->conn_low = server->connection; - h->conn_high = server->connection >> 8; - h->sequence = ++server->sequence; -} - -static void ncpdgram_start_request(struct ncp_server *server, struct ncp_request_reply *req) -{ - size_t signlen, len = req->tx_iov[1].iov_len; - struct ncp_request_header *h = req->tx_iov[1].iov_base; - - ncp_init_header(server, req, h); - signlen = sign_packet(server, - req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, - len - sizeof(struct ncp_request_header) + 1, - cpu_to_le32(len), req->sign); - if (signlen) { - /* NCP over UDP appends signature */ - req->tx_iov[2].iov_base = req->sign; - req->tx_iov[2].iov_len = signlen; - } - iov_iter_kvec(&req->from, WRITE | ITER_KVEC, - req->tx_iov + 1, signlen ? 2 : 1, len + signlen); - server->rcv.creq = req; - server->timeout_last = server->m.time_out; - server->timeout_retries = server->m.retry_count; - ncpdgram_send(server->ncp_sock, req); - mod_timer(&server->timeout_tm, jiffies + server->m.time_out); -} - -#define NCP_TCP_XMIT_MAGIC (0x446D6454) -#define NCP_TCP_XMIT_VERSION (1) -#define NCP_TCP_RCVD_MAGIC (0x744E6350) - -static void ncptcp_start_request(struct ncp_server *server, struct ncp_request_reply *req) -{ - size_t signlen, len = req->tx_iov[1].iov_len; - struct ncp_request_header *h = req->tx_iov[1].iov_base; - - ncp_init_header(server, req, h); - signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, - len - sizeof(struct ncp_request_header) + 1, - cpu_to_be32(len + 24), req->sign + 4) + 16; - - req->sign[0] = htonl(NCP_TCP_XMIT_MAGIC); - req->sign[1] = htonl(len + signlen); - req->sign[2] = htonl(NCP_TCP_XMIT_VERSION); - req->sign[3] = htonl(req->datalen + 8); - /* NCP over TCP prepends signature */ - req->tx_iov[0].iov_base = req->sign; - req->tx_iov[0].iov_len = signlen; - iov_iter_kvec(&req->from, WRITE | ITER_KVEC, - req->tx_iov, 2, len + signlen); - - server->tx.creq = req; - __ncptcp_try_send(server); -} - -static inline void __ncp_start_request(struct ncp_server *server, struct ncp_request_reply *req) -{ - /* we copy the data so that we do not depend on the caller - staying alive */ - memcpy(server->txbuf, req->tx_iov[1].iov_base, req->tx_iov[1].iov_len); - req->tx_iov[1].iov_base = server->txbuf; - - if (server->ncp_sock->type == SOCK_STREAM) - ncptcp_start_request(server, req); - else - ncpdgram_start_request(server, req); -} - -static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *req) -{ - mutex_lock(&server->rcv.creq_mutex); - if (!ncp_conn_valid(server)) { - mutex_unlock(&server->rcv.creq_mutex); - pr_err("tcp: Server died\n"); - return -EIO; - } - ncp_req_get(req); - if (server->tx.creq || server->rcv.creq) { - req->status = RQ_QUEUED; - list_add_tail(&req->req, &server->tx.requests); - mutex_unlock(&server->rcv.creq_mutex); - return 0; - } - __ncp_start_request(server, req); - mutex_unlock(&server->rcv.creq_mutex); - return 0; -} - -static void __ncp_next_request(struct ncp_server *server) -{ - struct ncp_request_reply *req; - - server->rcv.creq = NULL; - if (list_empty(&server->tx.requests)) { - return; - } - req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); - list_del_init(&req->req); - __ncp_start_request(server, req); -} - -static void info_server(struct ncp_server *server, unsigned int id, const void * data, size_t len) -{ - if (server->info_sock) { - struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; - __be32 hdr[2] = {cpu_to_be32(len + 8), cpu_to_be32(id)}; - struct kvec iov[2] = { - {.iov_base = hdr, .iov_len = 8}, - {.iov_base = (void *)data, .iov_len = len}, - }; - - iov_iter_kvec(&msg.msg_iter, ITER_KVEC | WRITE, - iov, 2, len + 8); - - sock_sendmsg(server->info_sock, &msg); - } -} - -void ncpdgram_rcv_proc(struct work_struct *work) -{ - struct ncp_server *server = - container_of(work, struct ncp_server, rcv.tq); - struct socket* sock; - - sock = server->ncp_sock; - - while (1) { - struct ncp_reply_header reply; - int result; - - result = _recv(sock, &reply, sizeof(reply), MSG_PEEK | MSG_DONTWAIT); - if (result < 0) { - break; - } - if (result >= sizeof(reply)) { - struct ncp_request_reply *req; - - if (reply.type == NCP_WATCHDOG) { - unsigned char buf[10]; - - if (server->connection != get_conn_number(&reply)) { - goto drop; - } - result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT); - if (result < 0) { - ncp_dbg(1, "recv failed with %d\n", result); - continue; - } - if (result < 10) { - ncp_dbg(1, "too short (%u) watchdog packet\n", result); - continue; - } - if (buf[9] != '?') { - ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]); - continue; - } - buf[9] = 'Y'; - _send(sock, buf, sizeof(buf)); - continue; - } - if (reply.type != NCP_POSITIVE_ACK && reply.type != NCP_REPLY) { - result = _recv(sock, server->unexpected_packet.data, sizeof(server->unexpected_packet.data), MSG_DONTWAIT); - if (result < 0) { - continue; - } - info_server(server, 0, server->unexpected_packet.data, result); - continue; - } - mutex_lock(&server->rcv.creq_mutex); - req = server->rcv.creq; - if (req && (req->tx_type == NCP_ALLOC_SLOT_REQUEST || (server->sequence == reply.sequence && - server->connection == get_conn_number(&reply)))) { - if (reply.type == NCP_POSITIVE_ACK) { - server->timeout_retries = server->m.retry_count; - server->timeout_last = NCP_MAX_RPC_TIMEOUT; - mod_timer(&server->timeout_tm, jiffies + NCP_MAX_RPC_TIMEOUT); - } else if (reply.type == NCP_REPLY) { - result = _recv(sock, server->rxbuf, req->datalen, MSG_DONTWAIT); -#ifdef CONFIG_NCPFS_PACKET_SIGNING - if (result >= 0 && server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { - if (result < 8 + 8) { - result = -EIO; - } else { - unsigned int hdrl; - - result -= 8; - hdrl = sock->sk->sk_family == AF_INET ? 8 : 6; - if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) { - pr_info("Signature violation\n"); - result = -EIO; - } - } - } -#endif - del_timer(&server->timeout_tm); - server->rcv.creq = NULL; - ncp_finish_request(server, req, result); - __ncp_next_request(server); - mutex_unlock(&server->rcv.creq_mutex); - continue; - } - } - mutex_unlock(&server->rcv.creq_mutex); - } -drop:; - _recv(sock, &reply, sizeof(reply), MSG_DONTWAIT); - } -} - -static void __ncpdgram_timeout_proc(struct ncp_server *server) -{ - /* If timer is pending, we are processing another request... */ - if (!timer_pending(&server->timeout_tm)) { - struct ncp_request_reply* req; - - req = server->rcv.creq; - if (req) { - int timeout; - - if (server->m.flags & NCP_MOUNT_SOFT) { - if (server->timeout_retries-- == 0) { - __ncp_abort_request(server, req, -ETIMEDOUT); - return; - } - } - /* Ignore errors */ - ncpdgram_send(server->ncp_sock, req); - timeout = server->timeout_last << 1; - if (timeout > NCP_MAX_RPC_TIMEOUT) { - timeout = NCP_MAX_RPC_TIMEOUT; - } - server->timeout_last = timeout; - mod_timer(&server->timeout_tm, jiffies + timeout); - } - } -} - -void ncpdgram_timeout_proc(struct work_struct *work) -{ - struct ncp_server *server = - container_of(work, struct ncp_server, timeout_tq); - mutex_lock(&server->rcv.creq_mutex); - __ncpdgram_timeout_proc(server); - mutex_unlock(&server->rcv.creq_mutex); -} - -static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) -{ - int result; - - if (buffer) { - result = _recv(server->ncp_sock, buffer, len, MSG_DONTWAIT); - } else { - static unsigned char dummy[1024]; - - if (len > sizeof(dummy)) { - len = sizeof(dummy); - } - result = _recv(server->ncp_sock, dummy, len, MSG_DONTWAIT); - } - if (result < 0) { - return result; - } - if (result > len) { - pr_err("tcp: bug in recvmsg (%u > %zu)\n", result, len); - return -EIO; - } - return result; -} - -static int __ncptcp_rcv_proc(struct ncp_server *server) -{ - /* We have to check the result, so store the complete header */ - while (1) { - int result; - struct ncp_request_reply *req; - int datalen; - int type; - - while (server->rcv.len) { - result = do_tcp_rcv(server, server->rcv.ptr, server->rcv.len); - if (result == -EAGAIN) { - return 0; - } - if (result <= 0) { - req = server->rcv.creq; - if (req) { - __ncp_abort_request(server, req, -EIO); - } else { - __ncptcp_abort(server); - } - if (result < 0) { - pr_err("tcp: error in recvmsg: %d\n", result); - } else { - ncp_dbg(1, "tcp: EOF\n"); - } - return -EIO; - } - if (server->rcv.ptr) { - server->rcv.ptr += result; - } - server->rcv.len -= result; - } - switch (server->rcv.state) { - case 0: - if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) { - pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); - __ncptcp_abort(server); - return -EIO; - } - datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF; - if (datalen < 10) { - pr_err("tcp: Unexpected reply len %d\n", datalen); - __ncptcp_abort(server); - return -EIO; - } -#ifdef CONFIG_NCPFS_PACKET_SIGNING - if (server->sign_active) { - if (datalen < 18) { - pr_err("tcp: Unexpected reply len %d\n", datalen); - __ncptcp_abort(server); - return -EIO; - } - server->rcv.buf.len = datalen - 8; - server->rcv.ptr = (unsigned char*)&server->rcv.buf.p1; - server->rcv.len = 8; - server->rcv.state = 4; - break; - } -#endif - type = ntohs(server->rcv.buf.type); -#ifdef CONFIG_NCPFS_PACKET_SIGNING -cont:; -#endif - if (type != NCP_REPLY) { - if (datalen - 8 <= sizeof(server->unexpected_packet.data)) { - *(__u16*)(server->unexpected_packet.data) = htons(type); - server->unexpected_packet.len = datalen - 8; - - server->rcv.state = 5; - server->rcv.ptr = server->unexpected_packet.data + 2; - server->rcv.len = datalen - 10; - break; - } - ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type); -skipdata2:; - server->rcv.state = 2; -skipdata:; - server->rcv.ptr = NULL; - server->rcv.len = datalen - 10; - break; - } - req = server->rcv.creq; - if (!req) { - ncp_dbg(1, "Reply without appropriate request\n"); - goto skipdata2; - } - if (datalen > req->datalen + 8) { - pr_err("tcp: Unexpected reply len %d (expected at most %zd)\n", datalen, req->datalen + 8); - server->rcv.state = 3; - goto skipdata; - } - req->datalen = datalen - 8; - ((struct ncp_reply_header*)server->rxbuf)->type = NCP_REPLY; - server->rcv.ptr = server->rxbuf + 2; - server->rcv.len = datalen - 10; - server->rcv.state = 1; - break; -#ifdef CONFIG_NCPFS_PACKET_SIGNING - case 4: - datalen = server->rcv.buf.len; - type = ntohs(server->rcv.buf.type2); - goto cont; -#endif - case 1: - req = server->rcv.creq; - if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) { - if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) { - pr_err("tcp: Bad sequence number\n"); - __ncp_abort_request(server, req, -EIO); - return -EIO; - } - if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) { - pr_err("tcp: Connection number mismatch\n"); - __ncp_abort_request(server, req, -EIO); - return -EIO; - } - } -#ifdef CONFIG_NCPFS_PACKET_SIGNING - if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { - if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) { - pr_err("tcp: Signature violation\n"); - __ncp_abort_request(server, req, -EIO); - return -EIO; - } - } -#endif - ncp_finish_request(server, req, req->datalen); - nextreq:; - __ncp_next_request(server); - case 2: - next:; - server->rcv.ptr = (unsigned char*)&server->rcv.buf; - server->rcv.len = 10; - server->rcv.state = 0; - break; - case 3: - ncp_finish_request(server, server->rcv.creq, -EIO); - goto nextreq; - case 5: - info_server(server, 0, server->unexpected_packet.data, server->unexpected_packet.len); - goto next; - } - } -} - -void ncp_tcp_rcv_proc(struct work_struct *work) -{ - struct ncp_server *server = - container_of(work, struct ncp_server, rcv.tq); - - mutex_lock(&server->rcv.creq_mutex); - __ncptcp_rcv_proc(server); - mutex_unlock(&server->rcv.creq_mutex); -} - -void ncp_tcp_tx_proc(struct work_struct *work) -{ - struct ncp_server *server = - container_of(work, struct ncp_server, tx.tq); - - mutex_lock(&server->rcv.creq_mutex); - __ncptcp_try_send(server); - mutex_unlock(&server->rcv.creq_mutex); -} - -static int do_ncp_rpc_call(struct ncp_server *server, int size, - unsigned char* reply_buf, int max_reply_size) -{ - int result; - struct ncp_request_reply *req; - - req = ncp_alloc_req(); - if (!req) - return -ENOMEM; - - req->reply_buf = reply_buf; - req->datalen = max_reply_size; - req->tx_iov[1].iov_base = server->packet; - req->tx_iov[1].iov_len = size; - req->tx_type = *(u_int16_t*)server->packet; - - result = ncp_add_request(server, req); - if (result < 0) - goto out; - - if (wait_event_interruptible(req->wq, req->status == RQ_DONE)) { - ncp_abort_request(server, req, -EINTR); - result = -EINTR; - goto out; - } - - result = req->result; - -out: - ncp_req_put(req); - - return result; -} - -/* - * We need the server to be locked here, so check! - */ - -static int ncp_do_request(struct ncp_server *server, int size, - void* reply, int max_reply_size) -{ - int result; - - if (server->lock == 0) { - pr_err("Server not locked!\n"); - return -EIO; - } - if (!ncp_conn_valid(server)) { - return -EIO; - } - { - sigset_t old_set; - unsigned long mask, flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - old_set = current->blocked; - if (current->flags & PF_EXITING) - mask = 0; - else - mask = sigmask(SIGKILL); - if (server->m.flags & NCP_MOUNT_INTR) { - /* FIXME: This doesn't seem right at all. So, like, - we can't handle SIGINT and get whatever to stop? - What if we've blocked it ourselves? What about - alarms? Why, in fact, are we mucking with the - sigmask at all? -- r~ */ - if (current->sighand->action[SIGINT - 1].sa.sa_handler == SIG_DFL) - mask |= sigmask(SIGINT); - if (current->sighand->action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) - mask |= sigmask(SIGQUIT); - } - siginitsetinv(¤t->blocked, mask); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - - result = do_ncp_rpc_call(server, size, reply, max_reply_size); - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->blocked = old_set; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - } - - ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result); - - return result; -} - -/* ncp_do_request assures that at least a complete reply header is - * received. It assumes that server->current_size contains the ncp - * request size - */ -int ncp_request2(struct ncp_server *server, int function, - void* rpl, int size) -{ - struct ncp_request_header *h; - struct ncp_reply_header* reply = rpl; - int result; - - h = (struct ncp_request_header *) (server->packet); - if (server->has_subfunction != 0) { - *(__u16 *) & (h->data[0]) = htons(server->current_size - sizeof(*h) - 2); - } - h->type = NCP_REQUEST; - /* - * The server shouldn't know or care what task is making a - * request, so we always use the same task number. - */ - h->task = 2; /* (current->pid) & 0xff; */ - h->function = function; - - result = ncp_do_request(server, server->current_size, reply, size); - if (result < 0) { - ncp_dbg(1, "ncp_request_error: %d\n", result); - goto out; - } - server->completion = reply->completion_code; - server->conn_status = reply->connection_state; - server->reply_size = result; - server->ncp_reply_size = result - sizeof(struct ncp_reply_header); - - result = reply->completion_code; - - if (result != 0) - ncp_vdbg("completion code=%x\n", result); -out: - return result; -} - -int ncp_connect(struct ncp_server *server) -{ - struct ncp_request_header *h; - int result; - - server->connection = 0xFFFF; - server->sequence = 255; - - h = (struct ncp_request_header *) (server->packet); - h->type = NCP_ALLOC_SLOT_REQUEST; - h->task = 2; /* see above */ - h->function = 0; - - result = ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); - if (result < 0) - goto out; - server->connection = h->conn_low + (h->conn_high * 256); - result = 0; -out: - return result; -} - -int ncp_disconnect(struct ncp_server *server) -{ - struct ncp_request_header *h; - - h = (struct ncp_request_header *) (server->packet); - h->type = NCP_DEALLOC_SLOT_REQUEST; - h->task = 2; /* see above */ - h->function = 0; - - return ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); -} - -void ncp_lock_server(struct ncp_server *server) -{ - mutex_lock(&server->mutex); - if (server->lock) - pr_warn("%s: was locked!\n", __func__); - server->lock = 1; -} - -void ncp_unlock_server(struct ncp_server *server) -{ - if (!server->lock) { - pr_warn("%s: was not locked!\n", __func__); - return; - } - server->lock = 0; - mutex_unlock(&server->mutex); -} diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c deleted file mode 100644 index b6e16da4837a..000000000000 --- a/fs/ncpfs/symlink.c +++ /dev/null @@ -1,182 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/ncpfs/symlink.c - * - * Code for allowing symbolic links on NCPFS (i.e. NetWare) - * Symbolic links are not supported on native NetWare, so we use an - * infrequently-used flag (Sh) and store a two-word magic header in - * the file to make sure we don't accidentally use a non-link file - * as a link. - * - * When using the NFS namespace, we set the mode to indicate a symlink and - * don't bother with the magic numbers. - * - * from linux/fs/ext2/symlink.c - * - * Copyright (C) 1998-99, Frank A. Vorstenbosch - * - * ncpfs symlink handling code - * NLS support (c) 1999 Petr Vandrovec - * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info - * - */ - - -#include <linux/uaccess.h> - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/stat.h> -#include "ncp_fs.h" - -/* these magic numbers must appear in the symlink file -- this makes it a bit - more resilient against the magic attributes being set on random files. */ - -#define NCP_SYMLINK_MAGIC0 cpu_to_le32(0x6c6d7973) /* "symlnk->" */ -#define NCP_SYMLINK_MAGIC1 cpu_to_le32(0x3e2d6b6e) - -/* ----- read a symbolic link ------------------------------------------ */ - -static int ncp_symlink_readpage(struct file *file, struct page *page) -{ - struct inode *inode = page->mapping->host; - int error, length, len; - char *link, *rawlink; - char *buf = kmap(page); - - error = -ENOMEM; - rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); - if (!rawlink) - goto fail; - - if (ncp_make_open(inode,O_RDONLY)) - goto failEIO; - - error=ncp_read_kernel(NCP_SERVER(inode),NCP_FINFO(inode)->file_handle, - 0,NCP_MAX_SYMLINK_SIZE,rawlink,&length); - - ncp_inode_close(inode); - /* Close file handle if no other users... */ - ncp_make_closed(inode); - if (error) - goto failEIO; - - if (NCP_FINFO(inode)->flags & NCPI_KLUDGE_SYMLINK) { - if (length<NCP_MIN_SYMLINK_SIZE || - ((__le32 *)rawlink)[0]!=NCP_SYMLINK_MAGIC0 || - ((__le32 *)rawlink)[1]!=NCP_SYMLINK_MAGIC1) - goto failEIO; - link = rawlink + 8; - length -= 8; - } else { - link = rawlink; - } - - len = NCP_MAX_SYMLINK_SIZE; - error = ncp_vol2io(NCP_SERVER(inode), buf, &len, link, length, 0); - kfree(rawlink); - if (error) - goto fail; - SetPageUptodate(page); - kunmap(page); - unlock_page(page); - return 0; - -failEIO: - error = -EIO; - kfree(rawlink); -fail: - SetPageError(page); - kunmap(page); - unlock_page(page); - return error; -} - -/* - * symlinks can't do much... - */ -const struct address_space_operations ncp_symlink_aops = { - .readpage = ncp_symlink_readpage, -}; - -/* ----- create a new symbolic link -------------------------------------- */ - -int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct inode *inode; - char *rawlink; - int length, err, i, outlen; - int kludge; - umode_t mode; - __le32 attr; - unsigned int hdr; - - ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname); - - if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) - kludge = 0; - else -#ifdef CONFIG_NCPFS_EXTRAS - if (NCP_SERVER(dir)->m.flags & NCP_MOUNT_SYMLINKS) - kludge = 1; - else -#endif - /* EPERM is returned by VFS if symlink procedure does not exist */ - return -EPERM; - - rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); - if (!rawlink) - return -ENOMEM; - - if (kludge) { - mode = 0; - attr = aSHARED | aHIDDEN; - ((__le32 *)rawlink)[0]=NCP_SYMLINK_MAGIC0; - ((__le32 *)rawlink)[1]=NCP_SYMLINK_MAGIC1; - hdr = 8; - } else { - mode = S_IFLNK | S_IRWXUGO; - attr = 0; - hdr = 0; - } - - length = strlen(symname); - /* map to/from server charset, do not touch upper/lower case as - symlink can point out of ncp filesystem */ - outlen = NCP_MAX_SYMLINK_SIZE - hdr; - err = ncp_io2vol(NCP_SERVER(dir), rawlink + hdr, &outlen, symname, length, 0); - if (err) - goto failfree; - - outlen += hdr; - - err = -EIO; - if (ncp_create_new(dir,dentry,mode,0,attr)) { - goto failfree; - } - - inode=d_inode(dentry); - - if (ncp_make_open(inode, O_WRONLY)) - goto failfree; - - if (ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, - 0, outlen, rawlink, &i) || i!=outlen) { - goto fail; - } - - ncp_inode_close(inode); - ncp_make_closed(inode); - kfree(rawlink); - return 0; -fail:; - ncp_inode_close(inode); - ncp_make_closed(inode); -failfree:; - kfree(rawlink); - return err; -} - -/* ----- EOF ----- */ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 995d707537da..7cb5c38c19e4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -137,6 +137,11 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, return bio; } +static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) +{ + return offset >= map->start && offset < map->start + map->len; +} + static struct bio * do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, @@ -156,8 +161,8 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, /* translate to physical disk offset */ disk_addr = (u64)isect << SECTOR_SHIFT; - if (disk_addr < map->start || disk_addr >= map->start + map->len) { - if (!dev->map(dev, disk_addr, map)) + if (!offset_in_map(disk_addr, map)) { + if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map)) return ERR_PTR(-EIO); bio = bl_submit_bio(bio); } @@ -184,6 +189,29 @@ retry: return bio; } +static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw) +{ + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + size_t bytes_left = header->args.count; + sector_t isect, extent_length = 0; + struct pnfs_block_extent be; + + isect = header->args.offset >> SECTOR_SHIFT; + bytes_left += header->args.offset - (isect << SECTOR_SHIFT); + + while (bytes_left > 0) { + if (!ext_tree_lookup(bl, isect, &be, rw)) + return; + extent_length = be.be_length - (isect - be.be_f_offset); + nfs4_mark_deviceid_unavailable(be.be_device); + isect += extent_length; + if (bytes_left > extent_length << SECTOR_SHIFT) + bytes_left -= extent_length << SECTOR_SHIFT; + else + bytes_left = 0; + } +} + static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; @@ -194,6 +222,7 @@ static void bl_end_io_read(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, false); } bio_put(bio); @@ -323,6 +352,7 @@ static void bl_end_io_write(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, true); } bio_put(bio); put_parallel(par); @@ -552,6 +582,31 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) return 0; } +static struct nfs4_deviceid_node * +bl_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node; + unsigned long start, end; + +retry: + node = nfs4_find_get_deviceid(server, id, cred, gfp_mask); + if (!node) + return ERR_PTR(-ENODEV); + + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) + return node; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (!time_in_range(node->timestamp_unavailable, start, end)) { + nfs4_delete_deviceid(node->ld, node->nfs_client, id); + goto retry; + } + return ERR_PTR(-ENODEV); +} + static int bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, struct layout_verification *lv, struct list_head *extents, @@ -573,16 +628,18 @@ bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, memcpy(&id, p, NFS4_DEVICEID4_SIZE); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - error = -EIO; - be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, lo->plh_lc_cred, gfp_mask); - if (!be->be_device) + if (IS_ERR(be->be_device)) { + error = PTR_ERR(be->be_device); goto out_free_be; + } /* * The next three values are read in as bytes, but stored in the * extent structure in 512-byte granularity. */ + error = -EIO; if (decode_sector_number(&p, &be->be_f_offset) < 0) goto out_put_deviceid; if (decode_sector_number(&p, &be->be_length) < 0) @@ -692,11 +749,16 @@ out_free_scratch: __free_page(scratch); out: dprintk("%s returns %d\n", __func__, status); - if (status) { + switch (status) { + case -ENODEV: + /* Our extent block devices are unavailable */ + set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags); + case 0: + return lseg; + default: kfree(lseg); return ERR_PTR(status); } - return lseg; } static void @@ -798,6 +860,13 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } pnfs_generic_pg_init_read(pgio, req); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_read_mds(pgio); + } } /* @@ -853,6 +922,14 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); pnfs_generic_pg_init_write(pgio, req, wb_size); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_write_mds(pgio); + } } /* @@ -887,6 +964,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -910,6 +988,7 @@ static struct pnfs_layoutdriver_type scsilayout_type = { .name = "LAYOUT_SCSI", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -967,6 +1046,7 @@ static void __exit nfs4blocklayout_exit(void) } MODULE_ALIAS("nfs-layouttype4-3"); +MODULE_ALIAS("nfs-layouttype4-5"); module_init(nfs4blocklayout_init); module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index efc007f00742..716bc75e9ed2 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -92,10 +92,9 @@ struct pnfs_block_volume { }; struct pnfs_block_dev_map { - sector_t start; - sector_t len; - - sector_t disk_offset; + u64 start; + u64 len; + u64 disk_offset; struct block_device *bdev; }; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 95f74bd2c067..a7efd83779d2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -533,14 +533,11 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_free_volumes; ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); - if (ret) { - bl_free_device(top); - kfree(top); - goto out_free_volumes; - } node = &top->node; nfs4_init_deviceid_node(node, server, &pdev->dev_id); + if (ret) + nfs4_mark_deviceid_unavailable(node); out_free_volumes: kfree(volumes); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ade44ca0c66c..d8b47624fee2 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -12,6 +12,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/iversion.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> @@ -347,7 +348,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; delegation->pagemod_limit = res->pagemod_limit; - delegation->change_attr = inode->i_version; + delegation->change_attr = inode_peek_iversion_raw(inode); delegation->cred = get_rpccred(cred); delegation->inode = inode; delegation->flags = 1<<NFS_DELEGATION_REFERENCED; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d2972d537469..8c10b0562e75 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -775,10 +775,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { - dreq->flags = 0; + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - } if (dreq->error == 0) { nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 83fd09fc8f77..ab5de3246c5c 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -48,10 +48,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) *max_len = len; return FILEID_INVALID; } - if (IS_AUTOMOUNT(inode)) { - *max_len = FILEID_INVALID; - goto out; - } p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32; p[FILEID_LOW_OFF] = NFS_FILEID(inode); @@ -59,7 +55,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) p[len - 1] = 0; /* Padding */ nfs_copy_fh(clnt_fh, server_fh); *max_len = len; -out: dprintk("%s: result fh fileid %llu mode %u size %d\n", __func__, NFS_FILEID(inode), inode->i_mode, *max_len); return *max_len; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 4e54d8b5413a..d175724ff566 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -895,9 +895,7 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (!lseg) - lseg = ERR_PTR(-ENOMEM); - if (IS_ERR(lseg)) + if (IS_ERR_OR_NULL(lseg)) goto out; lo = NFS_I(ino)->layout; diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 3025fe8584a0..0ee4b93d36ea 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -16,6 +16,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_fs_sb.h> #include <linux/in6.h> +#include <linux/iversion.h> #include "internal.h" #include "fscache.h" @@ -211,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->vfs_inode.i_version; + auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); if (bufmax > sizeof(auxdata)) bufmax = sizeof(auxdata); @@ -243,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->vfs_inode.i_version; + auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b992d2382ffa..ceeaf0fb6657 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -38,8 +38,8 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/freezer.h> - #include <linux/uaccess.h> +#include <linux/iversion.h> #include "nfs4_fs.h" #include "callback.h" @@ -483,7 +483,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - inode->i_version = 0; + inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); inode->i_uid = make_kuid(&init_user_ns, -2); @@ -508,7 +508,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st else if (nfs_server_capable(inode, NFS_CAP_CTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) - inode->i_version = fattr->change_attr; + inode_set_iversion_raw(inode, fattr->change_attr); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE); @@ -735,12 +735,20 @@ int nfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + struct nfs_server *server = NFS_SERVER(inode); + unsigned long cache_validity; int err = 0; + bool force_sync = query_flags & AT_STATX_FORCE_SYNC; + bool do_update = false; trace_nfs_getattr_enter(inode); + + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) + goto out_no_update; + /* Flush out writes to the server in order to update c/mtime. */ - if (S_ISREG(inode->i_mode)) { + if ((request_mask & (STATX_CTIME|STATX_MTIME)) && + S_ISREG(inode->i_mode)) { err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; @@ -757,24 +765,42 @@ int nfs_getattr(const struct path *path, struct kstat *stat, */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) - need_atime = 0; - - if (need_atime || nfs_need_revalidate_inode(inode)) { - struct nfs_server *server = NFS_SERVER(inode); - + request_mask &= ~STATX_ATIME; + + /* Is the user requesting attributes that might need revalidation? */ + if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| + STATX_MTIME|STATX_UID|STATX_GID| + STATX_SIZE|STATX_BLOCKS))) + goto out_no_revalidate; + + /* Check whether the cached attributes are stale */ + do_update |= force_sync || nfs_attribute_cache_expired(inode); + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + do_update |= cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL); + if (request_mask & STATX_ATIME) + do_update |= cache_validity & NFS_INO_INVALID_ATIME; + if (request_mask & (STATX_CTIME|STATX_MTIME)) + do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE; + if (do_update) { + /* Update the attribute cache */ if (!(server->flags & NFS_MOUNT_NOAC)) nfs_readdirplus_parent_cache_miss(path->dentry); else nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); + if (err) + goto out; } else nfs_readdirplus_parent_cache_hit(path->dentry); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); - if (S_ISDIR(inode->i_mode)) - stat->blksize = NFS_SERVER(inode)->dtsize; - } +out_no_revalidate: + /* Only return attributes that were revalidated. */ + stat->result_mask &= request_mask; +out_no_update: + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; out: trace_nfs_getattr_exit(inode, err); return err; @@ -1144,7 +1170,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { - unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; @@ -1289,8 +1314,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && inode->i_version == fattr->pre_change_attr) { - inode->i_version = fattr->change_attr; + && !inode_cmp_iversion_raw(inode, fattr->pre_change_attr)) { + inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; @@ -1348,7 +1373,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode_cmp_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) @@ -1642,7 +1667,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { - fattr->pre_change_attr = inode->i_version; + fattr->pre_change_attr = inode_peek_iversion_raw(inode); fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && @@ -1778,7 +1803,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (inode->i_version != fattr->change_attr) { + if (inode_cmp_iversion_raw(inode, fattr->change_attr)) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ @@ -1790,7 +1815,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); } - inode->i_version = fattr->change_attr; + inode_set_iversion_raw(inode, fattr->change_attr); } } else { nfsi->cache_validity |= save_cache_validity; diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 20fef85d2bb1..9034b4926909 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -99,7 +99,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) { if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { set_bit(NFS_INO_ODIRECT, &nfsi->flags); - nfs_wb_all(inode); + nfs_sync_mapping(inode->i_mapping); } } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 65a7e5da508c..04612c24d394 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -861,6 +861,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); + server->port = rpc_get_port(addr); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); @@ -1123,19 +1124,36 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - /* Get a client representation. - * Note: NFSv4 always uses TCP, */ + /* Get a client representation */ +#ifdef CONFIG_SUNRPC_XPRT_RDMA + rpc_set_port(data->addr, NFS_RDMA_PORT); error = nfs4_set_client(server, data->hostname, data->addr, data->addrlen, parent_client->cl_ipaddr, - rpc_protocol(parent_server->client), + XPRT_TRANSPORT_RDMA, + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version, + parent_client->cl_net); + if (!error) + goto init_server; +#endif /* CONFIG_SUNRPC_XPRT_RDMA */ + + rpc_set_port(data->addr, NFS_PORT); + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + XPRT_TRANSPORT_TCP, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_net); if (error < 0) goto error; +#ifdef CONFIG_SUNRPC_XPRT_RDMA +init_server: +#endif error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); if (error < 0) goto error; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 626d1382002e..6b3b372b59b9 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -8,7 +8,6 @@ #include <linux/file.h> #include <linux/falloc.h> #include <linux/nfs_fs.h> -#include <uapi/linux/btrfs.h> /* BTRFS_IOC_CLONE/BTRFS_IOC_CLONE_RANGE */ #include "delegation.h" #include "internal.h" #include "iostat.h" diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 30426c1a1bbd..22dc30a679a0 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -568,9 +568,13 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; struct key *key = cons->key; - int ret = -ENOMEM; + int ret = -ENOKEY; + + if (!aux) + goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ + ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out1; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 8c3f327d858d..24f06dcc2b08 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -270,8 +270,6 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (mountdata->addrlen == 0) continue; - rpc_set_port(mountdata->addr, NFS_PORT); - memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; mountdata->hostname = page2; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 56fa5a16e097..47f3c273245e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -54,6 +54,7 @@ #include <linux/xattr.h> #include <linux/utsname.h> #include <linux/freezer.h> +#include <linux/iversion.h> #include "nfs4_fs.h" #include "delegation.h" @@ -1045,16 +1046,16 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - if (cinfo->atomic && cinfo->before == dir->i_version) { + if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) { nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; } else { nfs_force_lookup_revalidate(dir); - if (cinfo->before != dir->i_version) + if (cinfo->before != inode_peek_iversion_raw(dir)) nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; } - dir->i_version = cinfo->after; + inode_set_iversion_raw(dir, cinfo->after); nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); @@ -2019,7 +2020,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err) { switch (err) { default: @@ -2066,7 +2067,11 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -ENOMEM: case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + if (fl) { + struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } return 0; } return err; @@ -2102,7 +2107,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, err = nfs4_open_recover_helper(opendata, FMODE_READ); } nfs4_opendata_put(opendata); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err); } static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) @@ -2454,7 +2459,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) data->file_created = true; else if (o_res->cinfo.before != o_res->cinfo.after) data->file_created = true; - if (data->file_created || dir->i_version != o_res->cinfo.after) + if (data->file_created || + inode_peek_iversion_raw(dir) != o_res->cinfo.after) update_changeattr(dir, &o_res->cinfo, o_res->f_attr->time_start); } @@ -3148,6 +3154,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); nfs4_stateid *res_stateid = NULL; + struct nfs4_exception exception = { + .state = state, + .inode = calldata->inode, + .stateid = &calldata->arg.stateid, + }; dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3213,7 +3224,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_BAD_STATEID: break; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, @@ -5757,6 +5770,10 @@ struct nfs4_delegreturndata { static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; + struct nfs4_exception exception = { + .inode = data->inode, + .stateid = &data->stateid, + }; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -5818,10 +5835,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } /* Fallthrough */ default: - if (nfs4_async_handle_error(task, data->res.server, - NULL, NULL) == -EAGAIN) { + task->tk_status = nfs4_async_handle_exception(task, + data->res.server, task->tk_status, + &exception); + if (exception.retry) goto out_restart; - } } data->rpc_status = task->tk_status; return; @@ -6059,6 +6077,10 @@ static void nfs4_locku_release_calldata(void *data) static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + struct nfs4_exception exception = { + .inode = calldata->lsp->ls_state->inode, + .stateid = &calldata->arg.stateid, + }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; @@ -6082,8 +6104,10 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) rpc_restart_call_prepare(task); break; default: - if (nfs4_async_handle_error(task, calldata->server, - NULL, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + calldata->server, task->tk_status, + &exception); + if (exception.retry) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); @@ -6739,7 +6763,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, if (err != 0) return err; err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } struct nfs_release_lockowner_data { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e4f4a09ed9f4..91a4d4eeb235 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1482,6 +1482,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; + struct nfs4_lock_state *lsp; int status = 0; struct file_lock_context *flctx = inode->i_flctx; struct list_head *list; @@ -1522,7 +1523,9 @@ restart: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); status = 0; } spin_lock(&flctx->flc_lock); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0d91d84e5822..c394e4447100 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -32,7 +32,7 @@ static struct ctl_table nfs4_cb_sysctls[] = { .data = &nfs_idmap_cache_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { } }; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 77c6729e57f0..65c9c4175145 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7678,6 +7678,22 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#if defined(CONFIG_NFS_V4_1) +#define PROC41(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC41(proc, argtype, restype) \ + STUB(proc) +#endif + +#if defined(CONFIG_NFS_V4_2) +#define PROC42(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC42(proc, argtype, restype) \ + STUB(proc) +#endif + const struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7698,7 +7714,6 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(ACCESS, enc_access, dec_access), PROC(GETATTR, enc_getattr, dec_getattr), PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), PROC(REMOVE, enc_remove, dec_remove), PROC(RENAME, enc_rename, dec_rename), @@ -7717,33 +7732,30 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), -#if defined(CONFIG_NFS_V4_1) - PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), - PROC(CREATE_SESSION, enc_create_session, dec_create_session), - PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), - PROC(SEQUENCE, enc_sequence, dec_sequence), - PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), - PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), - PROC(LAYOUTGET, enc_layoutget, dec_layoutget), - PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), - PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), - PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), - PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), - PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC41(CREATE_SESSION, enc_create_session, dec_create_session), + PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC41(SEQUENCE, enc_sequence, dec_sequence), + PROC41(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete), + PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC41(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid), STUB(GETDEVICELIST), - PROC(BIND_CONN_TO_SESSION, + PROC41(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), - PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), -#endif /* CONFIG_NFS_V4_1 */ -#ifdef CONFIG_NFS_V4_2 - PROC(SEEK, enc_seek, dec_seek), - PROC(ALLOCATE, enc_allocate, dec_allocate), - PROC(DEALLOCATE, enc_deallocate, dec_deallocate), - PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), - PROC(CLONE, enc_clone, dec_clone), - PROC(COPY, enc_copy, dec_copy), -#endif /* CONFIG_NFS_V4_2 */ + PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid), + PROC42(SEEK, enc_seek, dec_seek), + PROC42(ALLOCATE, enc_allocate, dec_allocate), + PROC42(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), + PROC42(CLONE, enc_clone, dec_clone), + PROC42(COPY, enc_copy, dec_copy), + PROC(LOOKUPP, enc_lookupp, dec_lookupp), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 093290c42d7c..bd60f8d1e181 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -9,6 +9,7 @@ #define _TRACE_NFS_H #include <linux/tracepoint.h> +#include <linux/iversion.h> #define nfs_show_file_type(ftype) \ __print_symbolic(ftype, \ @@ -61,7 +62,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); - __entry->version = inode->i_version; + __entry->version = inode_peek_iversion_raw(inode); ), TP_printk( @@ -100,7 +101,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->type = nfs_umode_to_dtype(inode->i_mode); - __entry->version = inode->i_version; + __entry->version = inode_peek_iversion_raw(inode); __entry->size = i_size_read(inode); __entry->nfsi_flags = nfsi->flags; __entry->cache_validity = nfsi->cache_validity; @@ -796,15 +797,15 @@ TRACE_EVENT(nfs_readpage_done, ) ); -/* - * XXX: I tried using NFS_UNSTABLE and friends in this table, but they - * all evaluate to 0 for some reason, even if I include linux/nfs.h. - */ +TRACE_DEFINE_ENUM(NFS_UNSTABLE); +TRACE_DEFINE_ENUM(NFS_DATA_SYNC); +TRACE_DEFINE_ENUM(NFS_FILE_SYNC); + #define nfs_show_stable(stable) \ __print_symbolic(stable, \ - { 0, " (UNSTABLE)" }, \ - { 1, " (DATA_SYNC)" }, \ - { 2, " (FILE_SYNC)" }) + { NFS_UNSTABLE, "UNSTABLE" }, \ + { NFS_DATA_SYNC, "DATA_SYNC" }, \ + { NFS_FILE_SYNC, "FILE_SYNC" }) TRACE_EVENT(nfs_initiate_write, TP_PROTO( @@ -837,12 +838,12 @@ TRACE_EVENT(nfs_initiate_write, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%lu stable=%d%s", + "offset=%lld count=%lu stable=%s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, - __entry->stable, nfs_show_stable(__entry->stable) + nfs_show_stable(__entry->stable) ) ); @@ -881,13 +882,13 @@ TRACE_EVENT(nfs_writeback_done, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld status=%d stable=%d%s " + "offset=%lld status=%d stable=%s " "verifier 0x%016llx", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->status, - __entry->stable, nfs_show_stable(__entry->stable), + nfs_show_stable(__entry->stable), __entry->verifier ) ); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d0543e19098a..18a7626ac638 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -537,7 +537,7 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * @cinfo: Commit information for the call (writes only) */ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, - unsigned int count, unsigned int offset, + unsigned int count, int how, struct nfs_commit_info *cinfo) { struct nfs_page *req = hdr->req; @@ -546,10 +546,10 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, * NB: take care not to mess about with hdr->commit et al. */ hdr->args.fh = NFS_FH(hdr->inode); - hdr->args.offset = req_offset(req) + offset; + hdr->args.offset = req_offset(req); /* pnfs_set_layoutcommit needs this */ hdr->mds_offset = hdr->args.offset; - hdr->args.pgbase = req->wb_pgbase + offset; + hdr->args.pgbase = req->wb_pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; hdr->args.context = get_nfs_open_context(req->wb_context); @@ -789,7 +789,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d602fe9e1ac8..c13e826614b5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -655,7 +655,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { - dprintk("%s: freeing lseg %p iomode %d seq %u" + dprintk("%s: freeing lseg %p iomode %d seq %u " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); @@ -2255,7 +2255,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } static enum pnfs_try_status @@ -2378,7 +2378,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 8d507c361d98..daf6cbf5c15f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -40,6 +40,7 @@ enum { NFS_LSEG_ROC, /* roc bit received from server */ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ + NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */ }; /* Individual ip address */ @@ -86,6 +87,7 @@ enum pnfs_try_status { */ #define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) /* error codes for internal use */ #define NFS4ERR_RESET_TO_MDS 12001 @@ -524,8 +526,10 @@ static inline int pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *nfss = NFS_SERVER(ino); - if (pnfs_enabled_sb(nfss) && nfsi->layout) + if (pnfs_enabled_sb(nfss) && nfsi->layout) { + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags); return _pnfs_return_layout(ino); + } return 0; } diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 2961fcd7a2df..e8a07b3f9aaa 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -43,7 +43,6 @@ #define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) #define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) -#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; static DEFINE_SPINLOCK(nfs4_deviceid_lock); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4a379d7918f2..7428a669d7a7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -23,6 +23,7 @@ #include <linux/export.h> #include <linux/freezer.h> #include <linux/wait.h> +#include <linux/iversion.h> #include <linux/uaccess.h> @@ -753,11 +754,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ spin_lock(&mapping->private_lock); if (!nfs_have_writebacks(inode) && - NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) { - spin_lock(&inode->i_lock); - inode->i_version++; - spin_unlock(&inode->i_lock); - } + NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + inode_inc_iversion_raw(inode); if (likely(!PageSwapCache(req->wb_page))) { set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); @@ -1837,6 +1835,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: nfs_unlock_and_release_request(req); + /* Latency breaker */ + cond_resched(); } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index f650e475d8f0..fdf2aad73470 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,10 +60,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; - - /* Each thread allocates its own gi, no race */ - groups_sort(gi); } + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } else { gi = get_group_info(rqgi); } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 43f31cf49bae..b8444189223b 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -11,6 +11,7 @@ #include <linux/crc32.h> #include <linux/sunrpc/svc.h> #include <uapi/linux/nfsd/nfsfh.h> +#include <linux/iversion.h> static inline __u32 ino_t_to_u32(ino_t ino) { @@ -259,7 +260,7 @@ static inline u64 nfsd4_change_attribute(struct inode *inode) chattr = inode->i_ctime.tv_sec; chattr <<= 30; chattr += inode->i_ctime.tv_nsec; - chattr += inode->i_version; + chattr += inode_query_iversion(inode); return chattr; } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6c5009cc4e6f..68cb9e4740b4 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -130,7 +130,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, - time_t ctime, __u64 cno) + time64_t ctime, __u64 cno) { int err; diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 7bbccc099709..10e16935fff6 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -46,7 +46,7 @@ struct nilfs_segsum_info { unsigned long nfileblk; u64 seg_seq; __u64 cno; - time_t ctime; + time64_t ctime; sector_t next; }; @@ -120,7 +120,7 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t, +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time64_t, __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f3ffba41533..0953635e7d48 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2040,7 +2040,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto out; /* Update time stamp */ - sci->sc_seg_ctime = get_seconds(); + sci->sc_seg_ctime = ktime_get_real_seconds(); err = nilfs_segctor_collect(sci, nilfs, mode); if (unlikely(err)) diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 84084a4d9b3e..04634e3e3d58 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -157,7 +157,7 @@ struct nilfs_sc_info { unsigned long sc_blk_cnt; unsigned long sc_datablk_cnt; unsigned long sc_nblk_this_inc; - time_t sc_seg_ctime; + time64_t sc_seg_ctime; __u64 sc_cno; unsigned long sc_flags; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1341a41e7b43..c7fa139d50e8 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -526,7 +526,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) * @modtime: modification time (option) */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime) + unsigned long nblocks, time64_t modtime) { struct buffer_head *bh; struct nilfs_segment_usage *su; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 158a9190c8ec..673a891350f4 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -35,7 +35,7 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); int nilfs_sufile_alloc(struct inode *, __u64 *); int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime); + unsigned long nblocks, time64_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int, size_t); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3073b646e1ba..6ffeca84d7c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -283,10 +283,10 @@ int nilfs_commit_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; - time_t t; + time64_t t; /* nilfs->ns_sem must be locked by the caller. */ - t = get_seconds(); + t = ktime_get_real_seconds(); nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 490303e3d517..4b25837e7724 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -31,7 +31,7 @@ static struct kset *nilfs_kset; #define NILFS_SHOW_TIME(time_t_val, buf) ({ \ struct tm res; \ int count = 0; \ - time_to_tm(time_t_val, 0, &res); \ + time64_to_tm(time_t_val, 0, &res); \ res.tm_year += 1900; \ res.tm_mon += 1; \ count = scnprintf(buf, PAGE_SIZE, \ @@ -579,7 +579,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; @@ -593,13 +593,13 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", ctime); } static ssize_t @@ -607,7 +607,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; @@ -621,14 +621,13 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)nongc_ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime); } static ssize_t @@ -728,7 +727,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; @@ -742,13 +741,13 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); + return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime); } static ssize_t diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 883d732b0259..36da1779f976 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -116,7 +116,7 @@ struct the_nilfs { */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; - time_t ns_sbwtime; + time64_t ns_sbwtime; unsigned int ns_sbwcount; unsigned int ns_sbsize; unsigned int ns_mount_state; @@ -131,8 +131,8 @@ struct the_nilfs { __u64 ns_nextnum; unsigned long ns_pseg_offset; __u64 ns_cno; - time_t ns_ctime; - time_t ns_nongc_ctime; + time64_t ns_ctime; + time64_t ns_nongc_ctime; atomic_t ns_ndirtyblks; /* @@ -267,7 +267,7 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { - u64 t = get_seconds(); + u64 t = ktime_get_real_seconds(); return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d0d4bc4c4b70..ef08d64c84b8 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -239,10 +239,10 @@ out_close_fd: } /* intofiy userspace file descriptor functions */ -static unsigned int fanotify_poll(struct file *file, poll_table *wait) +static __poll_t fanotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; - int ret = 0; + __poll_t ret = 0; poll_wait(file, &group->notification_waitq, wait); spin_lock(&group->notification_lock); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index d3c20e0bb046..5c29bf16814f 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -107,10 +107,10 @@ static inline u32 inotify_mask_to_arg(__u32 mask) } /* intofiy userspace file descriptor functions */ -static unsigned int inotify_poll(struct file *file, poll_table *wait) +static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; - int ret = 0; + __poll_t ret = 0; poll_wait(file, &group->notification_waitq, wait); spin_lock(&group->notification_lock); diff --git a/fs/nsfs.c b/fs/nsfs.c index 7c6f76d29f56..36b0772701a0 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -103,14 +103,14 @@ slow: goto got_it; } -void *ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops) +void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, + void *private_data) { struct ns_common *ns; void *ret; again: - ns = ns_ops->get(task); + ns = ns_get_cb(private_data); if (!ns) return ERR_PTR(-ENOENT); @@ -120,6 +120,29 @@ again: return ret; } +struct ns_get_path_task_args { + const struct proc_ns_operations *ns_ops; + struct task_struct *task; +}; + +static struct ns_common *ns_get_path_task(void *private_data) +{ + struct ns_get_path_task_args *args = private_data; + + return args->ns_ops->get(args->task); +} + +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_get_path_task_args args = { + .ns_ops = ns_ops, + .task = task, + }; + + return ns_get_path_cb(path, ns_get_path_task, &args); +} + int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 7c410f879412..1c1ee489284b 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -560,13 +560,6 @@ static int ntfs_read_locked_inode(struct inode *vi) ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); /* Setup the generic vfs inode parts now. */ - - /* - * This is for checking whether an inode has changed w.r.t. a file so - * that the file can be updated if necessary (compare with f_version). - */ - vi->i_version = 1; - vi->i_uid = vol->uid; vi->i_gid = vol->gid; vi->i_mode = 0; @@ -1240,7 +1233,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) base_ni = NTFS_I(base_vi); /* Just mirror the values from the base inode. */ - vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); @@ -1507,7 +1499,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) ni = NTFS_I(vi); base_ni = NTFS_I(base_vi); /* Just mirror the values from the base inode. */ - vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index ee8392aee9f6..2831f495a674 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2641,12 +2641,6 @@ mft_rec_already_initialized: goto undo_mftbmp_alloc; } vi->i_ino = bit; - /* - * This is for checking whether an inode has changed w.r.t. a - * file so that the file can be updated if necessary (compare - * with f_version). - */ - vi->i_version = 1; /* The owner and group come from the ntfs volume. */ vi->i_uid = vol->uid; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 40b5cc97f7b0..917fadca8a7b 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -311,7 +311,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) if (had_lock < 0) return ERR_PTR(had_lock); + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, type, di_bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -330,7 +332,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return 0; + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); @@ -361,8 +365,10 @@ int ocfs2_init_acl(handle_t *handle, if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, dir_bh); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ab5105f9767e..9a876bb07cac 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); + +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given); +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); + static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!obj) obj = (void *)bh->b_data; et->et_object = obj; + et->et_dealloc = NULL; et->et_ops->eo_fill_root_el(et); if (!et->et_ops->eo_fill_max_leaf_clusters) @@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { - int status, new_blocks, i; + int status, new_blocks, i, block_given = 0; u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; @@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle, goto bail; } - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, - meta_ac, new_eb_bhs); - if (status < 0) { - mlog_errno(status); - goto bail; + /* Firstyly, try to reuse dealloc since we have already estimated how + * many extent blocks we may use. + */ + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + new_eb_bhs, new_blocks, + &block_given); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + BUG_ON(block_given > new_blocks); + + if (block_given < new_blocks) { + BUG_ON(!meta_ac); + status = ocfs2_create_new_meta_bhs(handle, et, + new_blocks - block_given, + meta_ac, + &new_eb_bhs[block_given]); + if (status < 0) { + mlog_errno(status); + goto bail; + } } /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be @@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { - int status, i; + int status, i, block_given = 0; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, - &new_eb_bh); + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + &new_eb_bh, 1, + &block_given); + } else if (meta_ac) { + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + + } else { + BUG(); + } + if (status < 0) { mlog_errno(status); goto bail; @@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, int depth = le16_to_cpu(el->l_tree_depth); struct buffer_head *bh = NULL; - BUG_ON(meta_ac == NULL); + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et)); shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { @@ -2598,11 +2636,8 @@ static void ocfs2_unlink_subtree(handle_t *handle, int i; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; - struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; - el = path_leaf_el(left_path); - eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) @@ -3938,7 +3973,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { - int ret, i, next_free; + int i, next_free; struct buffer_head *bh; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; @@ -3955,7 +3990,6 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - ret = -EIO; return; } @@ -5057,7 +5091,6 @@ int ocfs2_split_extent(handle_t *handle, struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; struct ocfs2_merge_ctxt ctxt; - struct ocfs2_extent_list *rightmost_el; if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < @@ -5093,9 +5126,7 @@ int ocfs2_split_extent(handle_t *handle, } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - rightmost_el = &eb->h_list; - } else - rightmost_el = path_root_el(path); + } if (rec->e_cpos == split_rec->e_cpos && rec->e_leaf_clusters == split_rec->e_leaf_clusters) @@ -6585,6 +6616,154 @@ ocfs2_find_per_slot_free_list(int type, return fl; } +static struct ocfs2_per_slot_free_list * +ocfs2_find_preferred_free_list(int type, + int preferred_slot, + int *real_slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) { + *real_slot = fl->f_slot; + return fl; + } + + fl = fl->f_next_suballocator; + } + + /* If we can't find any free list matching preferred slot, just use + * the first one. + */ + fl = ctxt->c_first_suballocator; + *real_slot = fl->f_slot; + + return fl; +} + +/* Return Value 1 indicates empty */ +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) +{ + struct ocfs2_per_slot_free_list *fl = NULL; + + if (!et->et_dealloc) + return 1; + + fl = et->et_dealloc->c_first_suballocator; + if (!fl) + return 1; + + if (!fl->f_first) + return 1; + + return 0; +} + +/* If extent was deleted from tree due to extent rotation and merging, and + * no metadata is reserved ahead of time. Try to reuse some extents + * just deleted. This is only used to reuse extent blocks. + * It is supposed to find enough extent blocks in dealloc if our estimation + * on metadata is accurate. + */ +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given) +{ + int i, status = 0, real_slot; + struct ocfs2_cached_dealloc_ctxt *dealloc; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *bf; + struct ocfs2_extent_block *eb; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + *blk_given = 0; + + /* If extent tree doesn't have a dealloc, this is not faulty. Just + * tell upper caller dealloc can't provide any block and it should + * ask for alloc to claim more space. + */ + dealloc = et->et_dealloc; + if (!dealloc) + goto bail; + + for (i = 0; i < blk_wanted; i++) { + /* Prefer to use local slot */ + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num, &real_slot, + dealloc); + /* If no more block can be reused, we should claim more + * from alloc. Just return here normally. + */ + if (!fl) { + status = 0; + break; + } + + bf = fl->f_first; + fl->f_first = bf->free_next; + + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); + if (new_eb_bh[i] == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + mlog(0, "Reusing block(%llu) from " + "dealloc(local slot:%d, real slot:%d)\n", + bf->free_blk, osb->slot_num, real_slot); + + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + new_eb_bh[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; + + /* We can't guarantee that buffer head is still cached, so + * polutlate the extent block again. + */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(bf->free_blk); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = cpu_to_le16(real_slot); + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. + */ + ocfs2_journal_dirty(handle, new_eb_bh[i]); + + if (!fl->f_first) { + dealloc->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + kfree(bf); + } + + *blk_given = i; + +bail: + if (unlikely(status < 0)) { + for (i = 0; i < blk_wanted; i++) + brelse(new_eb_bh[i]); + } + + return status; +} + int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, int type, int slot, u64 suballoc, u64 blkno, unsigned int bit) @@ -7382,6 +7561,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; + struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7419,6 +7599,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) trace_ocfs2_trim_fs(start, len, minlen); + ocfs2_trim_fs_lock_res_init(osb); + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == start && info.tf_len == len && + info.tf_minlen == minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out_trimunlock; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = start; + info.tf_len = len; + info.tf_minlen = minlen; + /* Determine first and last group to examine based on start and len */ first_group = ocfs2_which_cluster_group(main_bm_inode, start); if (first_group == osb->first_cluster_group_blkno) @@ -7463,6 +7679,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); } range->len = trimmed * sb->s_blocksize; + + info.tf_trimlen = range->len; + info.tf_success = (ret ? 0 : 1); + pinfo = &info; +out_trimunlock: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 27b75cf32cfa..250bcacdf9e9 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -61,6 +61,7 @@ struct ocfs2_extent_tree { ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; + struct ocfs2_cached_dealloc_ctxt *et_dealloc; }; /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d1516327b787..e8e205bf2e41 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt { struct ocfs2_cached_dealloc_ctxt w_dealloc; struct list_head w_unwritten_list; + unsigned int w_unwritten_count; }; void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) @@ -1386,6 +1387,7 @@ retry: desc->c_clear_unwritten = 0; list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); list_add_tail(&new->ue_node, &wc->w_unwritten_list); + wc->w_unwritten_count++; new = NULL; unlock: spin_unlock(&oi->ip_lock); @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, ue->ue_phys = desc->c_phys; list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); - dwc->dw_zero_count++; + dwc->dw_zero_count += wc->w_unwritten_count; } ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode, ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + /* Attach dealloc with extent tree in case that we may reuse extents + * which are already unlinked from current extent tree due to extent + * rotation and merging. + */ + et.et_dealloc = &dealloc; + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, &data_ac, &meta_ac); if (ret) { diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 62e8ec619b4c..af2e7473956e 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -314,12 +314,13 @@ void o2quo_conn_err(u8 node) node, qs->qs_connected); clear_bit(node, qs->qs_conn_bm); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); } mlog(0, "node %u, %d total\n", node, qs->qs_connected); - if (test_bit(node, qs->qs_hb_bm)) - o2quo_set_hold(qs, node); spin_unlock(&qs->qs_lock); } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index bebe59feca58..eac5140aac47 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -918,7 +918,8 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len); + return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b95e7df5b76a..0276f7f8d5e6 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -196,7 +196,7 @@ struct o2net_msg_handler { u32 nh_msg_type; u32 nh_key; o2net_msg_handler_func *nh_func; - o2net_msg_handler_func *nh_func_data; + void *nh_func_data; o2net_post_msg_handler_func *nh_post_func; struct kref nh_kref; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index febe6312ceff..b7520e20a770 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -42,6 +42,7 @@ #include <linux/highmem.h> #include <linux/quotaops.h> #include <linux/sort.h> +#include <linux/iversion.h> #include <cluster/masklog.h> @@ -1174,7 +1175,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, le16_add_cpu(&pde->rec_len, le16_to_cpu(de->rec_len)); de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); ocfs2_journal_dirty(handle, bh); goto bail; } @@ -1729,7 +1730,7 @@ int __ocfs2_add_entry(handle_t *handle, if (ocfs2_dir_indexed(dir)) ocfs2_recalc_free_list(dir, handle, lookup); - dir->i_version++; + inode_inc_iversion(dir); ocfs2_journal_dirty(handle, insert_bh); retval = 0; goto bail; @@ -1775,7 +1776,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (*f_version != inode->i_version) { + if (inode_cmp_iversion(inode, *f_version)) { for (i = 0; i < i_size_read(inode) && i < offset; ) { de = (struct ocfs2_dir_entry *) (data->id_data + i); @@ -1791,7 +1792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, i += le16_to_cpu(de->rec_len); } ctx->pos = offset = i; - *f_version = inode->i_version; + *f_version = inode_query_iversion(inode); } de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); @@ -1869,7 +1870,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (*f_version != inode->i_version) { + if (inode_cmp_iversion(inode, *f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ocfs2_dir_entry *) (bh->b_data + i); /* It's too expensive to do a full @@ -1886,7 +1887,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - *f_version = inode->i_version; + *f_version = inode_query_iversion(inode); } while (ctx->pos < i_size_read(inode) @@ -1940,7 +1941,7 @@ static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, */ int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx) { - u64 version = inode->i_version; + u64 version = inode_query_iversion(inode); ocfs2_dir_foreach_blk(inode, &version, ctx, true); return 0; } @@ -1957,7 +1958,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9c3e0f13ca87..a7df226f9449 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1122,13 +1122,6 @@ recheck: /* sleep if we haven't finished voting yet */ if (sleep) { unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); - - /* - if (kref_read(&mle->mle_refs) < 2) - mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, - kref_read(&mle->mle_refs), - res->lockname.len, res->lockname.name); - */ atomic_set(&mle->woken, 0); (void)wait_event_timeout(mle->wq, (atomic_read(&mle->woken) == 1), diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 9c7c18c0e129..385fcefa8bc5 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -220,9 +220,9 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) return 0; } -static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) +static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait) { - int event = 0; + __poll_t event = 0; struct inode *inode = file_inode(file); struct dlmfs_inode_private *ip = DLMFS_I(inode); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4689940a953c..9479f99c2145 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -259,6 +259,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; @@ -676,6 +680,24 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, + &ocfs2_trim_fs_lops, osb); +} + +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_simple_drop_lockres(osb, lockres); + ocfs2_lock_res_free(lockres); +} + static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, struct ocfs2_super *osb) { @@ -1742,6 +1764,27 @@ int ocfs2_rw_lock(struct inode *inode, int write) return status; } +int ocfs2_try_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu try to take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); + return status; +} + void ocfs2_rw_unlock(struct inode *inode, int write) { int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; @@ -2486,6 +2529,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); + /* + * If we can't get inode lock immediately, we should not return + * directly here, since this will lead to a softlockup problem. + * The method is to get a blocking lock and immediately unlock + * before returning, this can avoid CPU resource waste due to + * lots of retries, and benefits fairness in getting lock. + */ + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } @@ -2494,13 +2546,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode, int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level) + int *level, int wait) { int ret; - ret = ocfs2_inode_lock(inode, NULL, 0); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, 0); + else + ret = ocfs2_try_inode_lock(inode, NULL, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } @@ -2512,9 +2569,14 @@ int ocfs2_inode_lock_atime(struct inode *inode, struct buffer_head *bh = NULL; ocfs2_inode_unlock(inode, 0); - ret = ocfs2_inode_lock(inode, &bh, 1); + if (wait) + ret = ocfs2_inode_lock(inode, &bh, 1); + else + ret = ocfs2_try_inode_lock(inode, &bh, 1); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } *level = 1; @@ -2745,6 +2807,70 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) ex ? LKM_EXMODE : LKM_PRMODE); } +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock) +{ + int status; + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (info) + info->tf_valid = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, + trylock ? DLM_LKF_NOQUEUE : 0, 0); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + return status; + } + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) { + info->tf_valid = 1; + info->tf_success = lvb->lvb_success; + info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum); + info->tf_start = be64_to_cpu(lvb->lvb_start); + info->tf_len = be64_to_cpu(lvb->lvb_len); + info->tf_minlen = be64_to_cpu(lvb->lvb_minlen); + info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen); + } + } + + return status; +} + +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info) +{ + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (ocfs2_mount_local(osb)) + return; + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION; + lvb->lvb_success = info->tf_success; + lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum); + lvb->lvb_start = cpu_to_be64(info->tf_start); + lvb->lvb_len = cpu_to_be64(info->tf_len); + lvb->lvb_minlen = cpu_to_be64(info->tf_minlen); + lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen); + } + + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); +} + int ocfs2_dentry_lock(struct dentry *dentry, int ex) { int ret; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index a7fc18ba0dc1..256e0a9067b8 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,29 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +#define OCFS2_TRIMFS_LVB_VERSION 1 + +struct ocfs2_trim_fs_lvb { + __u8 lvb_version; + __u8 lvb_success; + __u8 lvb_reserved[2]; + __be32 lvb_nodenum; + __be64 lvb_start; + __be64 lvb_len; + __be64 lvb_minlen; + __be64 lvb_trimlen; +}; + +struct ocfs2_trim_fs_info { + u8 tf_valid; /* lvb is valid, or not */ + u8 tf_success; /* trim is successful, or not */ + u32 tf_nodenum; /* osb node number */ + u64 tf_start; /* trim start offset in clusters */ + u64 tf_len; /* trim end offset in clusters */ + u64 tf_minlen; /* trim minimum contiguous free clusters */ + u64 tf_trimlen; /* trimmed length in bytes */ +}; + struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; @@ -116,13 +139,14 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); int ocfs2_rw_lock(struct inode *inode, int write); +int ocfs2_try_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); int ocfs2_open_lock(struct inode *inode); int ocfs2_try_open_lock(struct inode *inode, int write); void ocfs2_open_unlock(struct inode *inode); int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level); + int *level, int wait); int ocfs2_inode_lock_full_nested(struct inode *inode, struct buffer_head **ret_bh, int ex, @@ -140,6 +164,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode, /* 99% of the time we don't want to supply any additional flags -- * those are for very specific cases only. */ #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +#define ocfs2_try_inode_lock(i, b, e)\ + ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\ + OI_LS_NORMAL) void ocfs2_inode_unlock(struct inode *inode, int ex); int ocfs2_super_lock(struct ocfs2_super *osb, @@ -153,6 +180,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb); +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb); +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock); +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e4719e0a3f99..06cb96462bf9 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -38,6 +38,7 @@ #include "inode.h" #include "super.h" #include "symlink.h" +#include "aops.h" #include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -832,6 +833,50 @@ out: return ret; } +/* Is IO overwriting allocated blocks? */ +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len) +{ + int ret = 0, is_last; + u32 mapping_end, cpos; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len)) + return ret; + else + return -EAGAIN; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + is_last = 0; + while (cpos < mapping_end && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + NULL, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) + break; + + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + break; + + cpos = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + } + + if (cpos < mapping_end) + ret = -EAGAIN; +out: + return ret; +} + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) { struct inode *inode = file->f_mapping->host; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 67ea57d2fd59..1057586ec19f 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len); + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a1d051055472..5d1784a365a3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -140,6 +140,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) spin_unlock(&oi->ip_lock); } + file->f_mode |= FMODE_NOWAIT; + leave: return status; } @@ -2132,12 +2134,12 @@ out: } static int ocfs2_prepare_inode_for_write(struct file *file, - loff_t pos, - size_t count) + loff_t pos, size_t count, int wait) { - int ret = 0, meta_level = 0; + int ret = 0, meta_level = 0, overwrite_io = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); + struct buffer_head *di_bh = NULL; loff_t end; /* @@ -2145,13 +2147,40 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * if we need to make modifications here. */ for(;;) { - ret = ocfs2_inode_lock(inode, NULL, meta_level); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, meta_level); + else + ret = ocfs2_try_inode_lock(inode, + overwrite_io ? NULL : &di_bh, meta_level); if (ret < 0) { meta_level = -1; - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } + /* + * Check if IO will overwrite allocated blocks in case + * IOCB_NOWAIT flag is set. + */ + if (!wait && !overwrite_io) { + overwrite_io = 1; + if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = ocfs2_overwrite_io(inode, di_bh, pos, count); + brelse(di_bh); + di_bh = NULL; + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out_unlock; + } + } + /* Clear suid / sgid if necessary. We do this here * instead of later in the write path because * remove_suid() calls ->setattr without any hint that @@ -2199,7 +2228,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - pos, count); + pos, count, wait); + + brelse(di_bh); if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2211,7 +2242,7 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, rw_level; + int rw_level; ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from); @@ -2223,6 +2254,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, void *saved_ki_complete = NULL; int append_write = ((iocb->ki_pos + count) >= i_size_read(inode) ? 1 : 0); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2230,12 +2263,17 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ + if (!direct_io && nowait) + return -EOPNOTSUPP; + if (count == 0) return 0; - direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - - inode_lock(inode); + if (nowait) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else + inode_lock(inode); /* * Concurrent O_DIRECT writes are allowed with @@ -2244,9 +2282,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, */ rw_level = (!direct_io || full_coherency || append_write); - ret = ocfs2_rw_lock(inode, rw_level); + if (nowait) + ret = ocfs2_try_rw_lock(inode, rw_level); + else + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_mutex; } @@ -2260,9 +2302,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, * other nodes to drop their caches. Buffered I/O * already does this in write_begin(). */ - ret = ocfs2_inode_lock(inode, NULL, 1); + if (nowait) + ret = ocfs2_try_inode_lock(inode, NULL, 1); + else + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2277,9 +2323,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } count = ret; - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2355,6 +2402,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, int ret = 0, rw_level = -1, lock_level = 0; struct file *filp = iocb->ki_filp; struct inode *inode = file_inode(filp); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2369,14 +2418,22 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, goto bail; } + if (!direct_io && nowait) + return -EOPNOTSUPP; + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. */ - if (iocb->ki_flags & IOCB_DIRECT) { - ret = ocfs2_rw_lock(inode, 0); + if (direct_io) { + if (nowait) + ret = ocfs2_try_rw_lock(inode, 0); + else + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } rw_level = 0; @@ -2393,9 +2450,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, + !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } ocfs2_inode_unlock(inode, lock_level); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 1a1e0078ab38..d51b80edd972 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -28,6 +28,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/quotaops.h> +#include <linux/iversion.h> #include <asm/byteorder.h> @@ -302,7 +303,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); - inode->i_version = 1; + inode_set_iversion(inode, 1); inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_mode = le16_to_cpu(fe->i_mode); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 36304434eacf..e5dcea6cee5f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -666,23 +666,24 @@ static int __ocfs2_journal_access(handle_t *handle, /* we can safely remove this assertion after testing. */ if (!buffer_uptodate(bh)) { mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); - mlog(ML_ERROR, "b_blocknr=%llu\n", - (unsigned long long)bh->b_blocknr); + mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n", + (unsigned long long)bh->b_blocknr, bh->b_state); lock_buffer(bh); /* - * A previous attempt to write this buffer head failed. - * Nothing we can do but to retry the write and hope for - * the best. + * A previous transaction with a couple of buffer heads fail + * to checkpoint, so all the bhs are marked as BH_Write_EIO. + * For current transaction, the bh is just among those error + * bhs which previous transaction handle. We can't just clear + * its BH_Write_EIO and reuse directly, since other bhs are + * not written to disk yet and that will cause metadata + * inconsistency. So we should set fs read-only to avoid + * further damage. */ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - - if (!buffer_uptodate(bh)) { unlock_buffer(bh); - return -EIO; + return ocfs2_error(osb->sb, "A previous attempt to " + "write this buffer head failed\n"); } unlock_buffer(bh); } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 098f5c712569..fb9a20e3d608 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -184,7 +184,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), - file->f_path.mnt, &lock_level); + file->f_path.mnt, &lock_level, 1); if (ret < 0) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3b0a10d9b36f..c801eddc4bf3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -41,6 +41,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/quotaops.h> +#include <linux/iversion.h> #include <cluster/masklog.h> @@ -1520,7 +1521,7 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } - new_dir->i_version++; + inode_inc_iversion(new_dir); if (S_ISDIR(new_inode->i_mode)) ocfs2_set_links_count(newfe, 0); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9a50f222ac97..6867eef2e06b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -404,6 +404,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; + struct ocfs2_lock_res osb_trim_fs_lockres; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index d277aabf5dfb..7051b994c776 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -50,6 +50,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_LOCK_TYPE_TRIM_FS, OCFS2_NUM_LOCK_TYPES }; @@ -93,6 +94,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_REFCOUNT: c = 'T'; break; + case OCFS2_LOCK_TYPE_TRIM_FS: + c = 'I'; + break; default: c = '\0'; } @@ -115,6 +119,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", + [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index a0b5d00ef0a9..e2a11aaece10 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1449,20 +1449,22 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - unsigned long count), - TP_ARGS(ino, saved_pos, count), + unsigned long count, int wait), + TP_ARGS(ino, saved_pos, count, wait), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) __field(unsigned long, count) + __field(int, wait) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; __entry->count = count; + __entry->wait = wait; ), - TP_printk("%llu %llu %lu", __entry->ino, - __entry->saved_pos, __entry->count) + TP_printk("%llu %llu %lu %d", __entry->ino, + __entry->saved_pos, __entry->count, __entry->wait) ); DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b39d14cbfa34..7a922190a8c7 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -12,6 +12,7 @@ #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/llist.h> +#include <linux/iversion.h> #include <cluster/masklog.h> @@ -289,7 +290,7 @@ out: mlog_errno(err); return err; } - gqinode->i_version++; + inode_inc_iversion(gqinode); ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); return len; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 9f0b95abc09f..d8f5f6ce99dc 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2438,6 +2438,8 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + if (undo_fn) + jbd_unlock_bh_state(group_bh); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), @@ -2563,16 +2565,16 @@ static int _ocfs2_free_clusters(handle_t *handle, int status; u16 bg_start_bit; u64 bg_blkno; - struct ocfs2_dinode *fe; /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry * about looping on them. * This is expensive. We can safely remove once this stuff has * gotten tested really well. */ - BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, + ocfs2_blocks_to_clusters(bitmap_inode->i_sb, + start_blk))); - fe = (struct ocfs2_dinode *) bitmap_bh->b_data; ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80efa5699fb0..ffa4952d432b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -474,9 +474,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog_errno(status); - /* FIXME: Should ERROR_RO_FS */ mlog(ML_ERROR, "Unable to load system inode %d, " "possibly corrupt fs?", i); goto bail; @@ -505,7 +504,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", status, i, osb->slot_num); goto bail; @@ -1208,14 +1207,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) read_super_error: brelse(bh); + if (status) + mlog_errno(status); + if (osb) { atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); ocfs2_dismount_volume(sb, 1); } - if (status) - mlog_errno(status); return status; } @@ -1843,6 +1843,9 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); + if (status == -EBADR && ocfs2_userspace_stack(osb)) + mlog(ML_ERROR, "couldn't mount because cluster name on" + " disk does not match the running cluster name.\n"); goto leave; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c5898c59d411..c261c1dfd374 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -638,14 +638,17 @@ int ocfs2_calc_xattr_init(struct inode *dir, si->value_len); if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, "", NULL, 0); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (acl_len > 0) { a_size = ocfs2_xattr_entry_real_size(0, acl_len); if (S_ISDIR(mode)) a_size <<= 1; } else if (acl_len != 0 && acl_len != -ENODATA) { + ret = acl_len; mlog_errno(ret); return ret; } @@ -6415,7 +6418,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, * and then insert the extents one by one. */ if (xv->xr_list.l_tree_depth) { - memcpy(new_xv, &def_xv, sizeof(def_xv)); + memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE); vb->vb_xv = new_xv; vb->vb_bh = value_bh; ocfs2_init_xattr_value_extent_tree(&data_et, diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index ded456f17de6..f073cd9e6687 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -162,7 +162,7 @@ static ssize_t orangefs_devreq_read(struct file *file, struct orangefs_kernel_op_s *op, *temp; __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION; static __s32 magic = ORANGEFS_DEVREQ_MAGIC; - struct orangefs_kernel_op_s *cur_op = NULL; + struct orangefs_kernel_op_s *cur_op; unsigned long ret; /* We do not support blocking IO. */ @@ -186,6 +186,7 @@ static ssize_t orangefs_devreq_read(struct file *file, return -EAGAIN; restart: + cur_op = NULL; /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); list_for_each_entry_safe(op, temp, &orangefs_request_list, list) { @@ -814,15 +815,15 @@ void orangefs_dev_cleanup(void) ORANGEFS_REQDEVICE_NAME); } -static unsigned int orangefs_devreq_poll(struct file *file, +static __poll_t orangefs_devreq_poll(struct file *file, struct poll_table_struct *poll_table) { - int poll_revent_mask = 0; + __poll_t poll_revent_mask = 0; poll_wait(file, &orangefs_request_list_waitq, poll_table); if (!list_empty(&orangefs_request_list)) - poll_revent_mask |= POLL_IN; + poll_revent_mask |= POLLIN; return poll_revent_mask; } diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 1668fd645c45..0d228cd087e6 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -452,7 +452,7 @@ ssize_t orangefs_inode_read(struct inode *inode, static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - loff_t pos = *(&iocb->ki_pos); + loff_t pos = iocb->ki_pos; ssize_t rc = 0; BUG_ON(iocb->private); @@ -492,9 +492,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite } } - if (file->f_pos > i_size_read(file->f_mapping->host)) - orangefs_i_size_write(file->f_mapping->host, file->f_pos); - rc = generic_write_checks(iocb, iter); if (rc <= 0) { @@ -508,7 +505,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite * pos to the end of the file, so we will wait till now to set * pos... */ - pos = *(&iocb->ki_pos); + pos = iocb->ki_pos; rc = do_readv_writev(ORANGEFS_IO_WRITE, file, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 97adf7d100b5..2595453fe737 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -533,17 +533,6 @@ do { \ sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \ } while (0) -static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) -{ -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_lock(inode); -#endif - i_size_write(inode, i_size); -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_unlock(inode); -#endif -} - static inline void orangefs_set_timeout(struct dentry *dentry) { unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 36f1390b5ed7..62d49e53061c 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -610,11 +610,16 @@ void orangefs_kill_sb(struct super_block *sb) int orangefs_inode_cache_initialize(void) { - orangefs_inode_cache = kmem_cache_create("orangefs_inode_cache", - sizeof(struct orangefs_inode_s), - 0, - ORANGEFS_CACHE_CREATE_FLAGS, - orangefs_inode_cache_ctor); + orangefs_inode_cache = kmem_cache_create_usercopy( + "orangefs_inode_cache", + sizeof(struct orangefs_inode_s), + 0, + ORANGEFS_CACHE_CREATE_FLAGS, + offsetof(struct orangefs_inode_s, + link_target), + sizeof_field(struct orangefs_inode_s, + link_target), + orangefs_inode_cache_ctor); if (!orangefs_inode_cache) { gossip_err("Cannot create orangefs_inode_cache\n"); diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 835c6e148afc..0577d6dba8c8 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -29,10 +29,10 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s */ void purge_waiting_ops(void) { - struct orangefs_kernel_op_s *op; + struct orangefs_kernel_op_s *op, *tmp; spin_lock(&orangefs_request_list_lock); - list_for_each_entry(op, &orangefs_request_list, list) { + list_for_each_entry_safe(op, tmp, &orangefs_request_list, list) { gossip_debug(GOSSIP_WAIT_DEBUG, "pvfs2-client-core: purging op tag %llu %s\n", llu(op->tag), diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 5ac415466861..406e72de88f6 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -47,9 +47,28 @@ config OVERLAY_FS_INDEX The inodes index feature prevents breaking of lower hardlinks on copy up. - Note, that the inodes index feature is read-only backward compatible. - That is, mounting an overlay which has an index dir on a kernel that - doesn't support this feature read-only, will not have any negative - outcomes. However, mounting the same overlay with an old kernel - read-write and then mounting it again with a new kernel, will have - unexpected results. + Note, that the inodes index feature is not backward compatible. + That is, mounting an overlay which has an inodes index on a kernel + that doesn't support this feature will have unexpected results. + +config OVERLAY_FS_NFS_EXPORT + bool "Overlayfs: turn on NFS export feature by default" + depends on OVERLAY_FS + depends on OVERLAY_FS_INDEX + help + If this config option is enabled then overlay filesystems will use + the inodes index dir to decode overlay NFS file handles by default. + In this case, it is still possible to turn off NFS export support + globally with the "nfs_export=off" module option or on a filesystem + instance basis with the "nfs_export=off" mount option. + + The NFS export feature creates an index on copy up of every file and + directory. This full index is used to detect overlay filesystems + inconsistencies on lookup, like redirect from multiple upper dirs to + the same lower dir. The full index may incur some overhead on mount + time, especially when verifying that directory file handles are not + stale. + + Note, that the NFS export feature is not backward compatible. + That is, mounting an overlay which has a full index on a kernel + that doesn't support this feature will have unexpected results. diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 99373bbc1478..30802347a020 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -4,4 +4,5 @@ obj-$(CONFIG_OVERLAY_FS) += overlay.o -overlay-objs := super.o namei.o util.o inode.o dir.o readdir.o copy_up.o +overlay-objs := super.o namei.o util.o inode.o dir.o readdir.o copy_up.o \ + export.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eb3b8d39fb61..d855f508fa20 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -232,13 +232,13 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } -struct ovl_fh *ovl_encode_fh(struct dentry *lower, bool is_upper) +struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; int fh_type, fh_len, dwords; void *buf; int buflen = MAX_HANDLE_SZ; - uuid_t *uuid = &lower->d_sb->s_uuid; + uuid_t *uuid = &real->d_sb->s_uuid; buf = kmalloc(buflen, GFP_KERNEL); if (!buf) @@ -250,7 +250,7 @@ struct ovl_fh *ovl_encode_fh(struct dentry *lower, bool is_upper) * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(lower, buf, &dwords, 0); + fh_type = exportfs_encode_fh(real, buf, &dwords, 0); buflen = (dwords << 2); fh = ERR_PTR(-EIO); @@ -288,8 +288,8 @@ out: return fh; } -static int ovl_set_origin(struct dentry *dentry, struct dentry *lower, - struct dentry *upper) +int ovl_set_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper) { const struct ovl_fh *fh = NULL; int err; @@ -315,6 +315,94 @@ static int ovl_set_origin(struct dentry *dentry, struct dentry *lower, return err; } +/* Store file handle of @upper dir in @index dir entry */ +static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) +{ + const struct ovl_fh *fh; + int err; + + fh = ovl_encode_fh(upper, true); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0); + + kfree(fh); + return err; +} + +/* + * Create and install index entry. + * + * Caller must hold i_mutex on indexdir. + */ +static int ovl_create_index(struct dentry *dentry, struct dentry *origin, + struct dentry *upper) +{ + struct dentry *indexdir = ovl_indexdir(dentry->d_sb); + struct inode *dir = d_inode(indexdir); + struct dentry *index = NULL; + struct dentry *temp = NULL; + struct qstr name = { }; + int err; + + /* + * For now this is only used for creating index entry for directories, + * because non-dir are copied up directly to index and then hardlinked + * to upper dir. + * + * TODO: implement create index for non-dir, so we can call it when + * encoding file handle for non-dir in case index does not exist. + */ + if (WARN_ON(!d_is_dir(dentry))) + return -EIO; + + /* Directory not expected to be indexed before copy up */ + if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry)))) + return -EIO; + + err = ovl_get_index_name(origin, &name); + if (err) + return err; + + temp = ovl_lookup_temp(indexdir); + if (IS_ERR(temp)) + goto temp_err; + + err = ovl_do_mkdir(dir, temp, S_IFDIR, true); + if (err) + goto out; + + err = ovl_set_upper_fh(upper, temp); + if (err) + goto out_cleanup; + + index = lookup_one_len(name.name, indexdir, name.len); + if (IS_ERR(index)) { + err = PTR_ERR(index); + } else { + err = ovl_do_rename(dir, temp, dir, index, 0); + dput(index); + } + + if (err) + goto out_cleanup; + +out: + dput(temp); + kfree(name.name); + return err; + +temp_err: + err = PTR_ERR(temp); + temp = NULL; + goto out; + +out_cleanup: + ovl_cleanup(dir, temp); + goto out; +} + struct ovl_copy_up_ctx { struct dentry *parent; struct dentry *dentry; @@ -327,6 +415,7 @@ struct ovl_copy_up_ctx { struct dentry *workdir; bool tmpfile; bool origin; + bool indexed; }; static int ovl_link_up(struct ovl_copy_up_ctx *c) @@ -361,7 +450,10 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) } } inode_unlock(udir); - ovl_set_nlink_upper(c->dentry); + if (err) + return err; + + err = ovl_set_nlink_upper(c->dentry); return err; } @@ -498,6 +590,12 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) if (err) goto out_cleanup; + if (S_ISDIR(c->stat.mode) && c->indexed) { + err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); + if (err) + goto out_cleanup; + } + if (c->tmpfile) { inode_lock_nested(udir, I_MUTEX_PARENT); err = ovl_install_temp(c, temp, &newdentry); @@ -536,20 +634,33 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) { int err; struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info; - bool indexed = false; + bool to_index = false; - if (ovl_indexdir(c->dentry->d_sb) && !S_ISDIR(c->stat.mode) && - c->stat.nlink > 1) - indexed = true; + /* + * Indexed non-dir is copied up directly to the index entry and then + * hardlinked to upper dir. Indexed dir is copied up to indexdir, + * then index entry is created and then copied up dir installed. + * Copying dir up to indexdir instead of workdir simplifies locking. + */ + if (ovl_need_index(c->dentry)) { + c->indexed = true; + if (S_ISDIR(c->stat.mode)) + c->workdir = ovl_indexdir(c->dentry->d_sb); + else + to_index = true; + } - if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || indexed) + if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) c->origin = true; - if (indexed) { + if (to_index) { c->destdir = ovl_indexdir(c->dentry->d_sb); err = ovl_get_index_name(c->lowerpath.dentry, &c->destname); if (err) return err; + } else if (WARN_ON(!c->parent)) { + /* Disconnected dentry must be copied up to index dir */ + return -EIO; } else { /* * Mark parent "impure" because it may now contain non-pure @@ -572,11 +683,17 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) } } - if (indexed) { - if (!err) - ovl_set_flag(OVL_INDEX, d_inode(c->dentry)); - kfree(c->destname.name); - } else if (!err) { + + if (err) + goto out; + + if (c->indexed) + ovl_set_flag(OVL_INDEX, d_inode(c->dentry)); + + if (to_index) { + /* Initialize nlink for copy up of disconnected dentry */ + err = ovl_set_nlink_upper(c->dentry); + } else { struct inode *udir = d_inode(c->destdir); /* Restore timestamps on parent (best effort) */ @@ -587,6 +704,9 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); } +out: + if (to_index) + kfree(c->destname.name); return err; } @@ -611,14 +731,17 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, if (err) return err; - ovl_path_upper(parent, &parentpath); - ctx.destdir = parentpath.dentry; - ctx.destname = dentry->d_name; + if (parent) { + ovl_path_upper(parent, &parentpath); + ctx.destdir = parentpath.dentry; + ctx.destname = dentry->d_name; - err = vfs_getattr(&parentpath, &ctx.pstat, - STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT); - if (err) - return err; + err = vfs_getattr(&parentpath, &ctx.pstat, + STATX_ATIME | STATX_MTIME, + AT_STATX_SYNC_AS_STAT); + if (err) + return err; + } /* maybe truncate regular file. this has no effect on dirs */ if (flags & O_TRUNC) @@ -639,7 +762,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, } else { if (!ovl_dentry_upper(dentry)) err = ovl_do_copy_up(&ctx); - if (!err && !ovl_dentry_has_upper_alias(dentry)) + if (!err && parent && !ovl_dentry_has_upper_alias(dentry)) err = ovl_link_up(&ctx); ovl_copy_up_end(dentry); } @@ -652,10 +775,19 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags) { int err = 0; const struct cred *old_cred = ovl_override_creds(dentry->d_sb); + bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED); + + /* + * With NFS export, copy up can get called for a disconnected non-dir. + * In this case, we will copy up lower inode to index dir without + * linking it to upper dir. + */ + if (WARN_ON(disconnected && d_is_dir(dentry))) + return -EIO; while (!err) { struct dentry *next; - struct dentry *parent; + struct dentry *parent = NULL; /* * Check if copy-up has happened as well as for upper alias (in @@ -671,12 +803,12 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags) * with rename. */ if (ovl_dentry_upper(dentry) && - ovl_dentry_has_upper_alias(dentry)) + (ovl_dentry_has_upper_alias(dentry) || disconnected)) break; next = dget(dentry); /* find the topmost dentry not yet copied up */ - for (;;) { + for (; !disconnected;) { parent = dget_parent(next); if (ovl_dentry_upper(parent)) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f9788bc116a8..839709c7803a 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -63,8 +63,7 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir) } /* caller holds i_mutex on workdir */ -static struct dentry *ovl_whiteout(struct dentry *workdir, - struct dentry *dentry) +static struct dentry *ovl_whiteout(struct dentry *workdir) { int err; struct dentry *whiteout; @@ -83,6 +82,38 @@ static struct dentry *ovl_whiteout(struct dentry *workdir, return whiteout; } +/* Caller must hold i_mutex on both workdir and dir */ +int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, + struct dentry *dentry) +{ + struct inode *wdir = workdir->d_inode; + struct dentry *whiteout; + int err; + int flags = 0; + + whiteout = ovl_whiteout(workdir); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + return err; + + if (d_is_dir(dentry)) + flags = RENAME_EXCHANGE; + + err = ovl_do_rename(wdir, whiteout, dir, dentry, flags); + if (err) + goto kill_whiteout; + if (flags) + ovl_cleanup(wdir, dentry); + +out: + dput(whiteout); + return err; + +kill_whiteout: + ovl_cleanup(wdir, whiteout); + goto out; +} + int ovl_create_real(struct inode *dir, struct dentry *newdentry, struct cattr *attr, struct dentry *hardlink, bool debug) { @@ -181,11 +212,6 @@ static bool ovl_type_origin(struct dentry *dentry) return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); } -static bool ovl_may_have_whiteouts(struct dentry *dentry) -{ - return ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); -} - static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct cattr *attr, struct dentry *hardlink) { @@ -301,37 +327,6 @@ out: return ERR_PTR(err); } -static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) -{ - int err; - struct dentry *ret = NULL; - LIST_HEAD(list); - - err = ovl_check_empty_dir(dentry, &list); - if (err) { - ret = ERR_PTR(err); - goto out_free; - } - - /* - * When removing an empty opaque directory, then it makes no sense to - * replace it with an exact replica of itself. - * - * If upperdentry has whiteouts, clear them. - * - * Can race with copy-up, since we don't hold the upperdir mutex. - * Doesn't matter, since copy-up can't create a non-empty directory - * from an empty one. - */ - if (!list_empty(&list)) - ret = ovl_clear_empty(dentry, &list); - -out_free: - ovl_cache_free(&list); - - return ret; -} - static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, const struct posix_acl *acl) { @@ -623,23 +618,20 @@ static bool ovl_matches_upper(struct dentry *dentry, struct dentry *upper) return d_inode(ovl_dentry_upper(dentry)) == d_inode(upper); } -static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) +static int ovl_remove_and_whiteout(struct dentry *dentry, + struct list_head *list) { struct dentry *workdir = ovl_workdir(dentry); - struct inode *wdir = workdir->d_inode; struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); - struct inode *udir = upperdir->d_inode; - struct dentry *whiteout; struct dentry *upper; struct dentry *opaquedir = NULL; int err; - int flags = 0; if (WARN_ON(!workdir)) return -EROFS; - if (is_dir) { - opaquedir = ovl_check_empty_and_clear(dentry); + if (!list_empty(list)) { + opaquedir = ovl_clear_empty(dentry, list); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out; @@ -662,24 +654,13 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) goto out_dput_upper; } - whiteout = ovl_whiteout(workdir, dentry); - err = PTR_ERR(whiteout); - if (IS_ERR(whiteout)) - goto out_dput_upper; - - if (d_is_dir(upper)) - flags = RENAME_EXCHANGE; - - err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + err = ovl_cleanup_and_whiteout(workdir, d_inode(upperdir), upper); if (err) - goto kill_whiteout; - if (flags) - ovl_cleanup(wdir, upper); + goto out_d_drop; ovl_dentry_version_inc(dentry->d_parent, true); out_d_drop: d_drop(dentry); - dput(whiteout); out_dput_upper: dput(upper); out_unlock: @@ -688,13 +669,10 @@ out_dput: dput(opaquedir); out: return err; - -kill_whiteout: - ovl_cleanup(wdir, whiteout); - goto out_d_drop; } -static int ovl_remove_upper(struct dentry *dentry, bool is_dir) +static int ovl_remove_upper(struct dentry *dentry, bool is_dir, + struct list_head *list) { struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *dir = upperdir->d_inode; @@ -702,10 +680,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - /* Redirect/origin dir can be !ovl_lower_positive && not clean */ - if (is_dir && (ovl_dentry_get_redirect(dentry) || - ovl_may_have_whiteouts(dentry))) { - opaquedir = ovl_check_empty_and_clear(dentry); + if (!list_empty(list)) { + opaquedir = ovl_clear_empty(dentry, list); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out; @@ -746,11 +722,26 @@ out: return err; } +static bool ovl_pure_upper(struct dentry *dentry) +{ + return !ovl_dentry_lower(dentry) && + !ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); +} + static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; bool locked = false; const struct cred *old_cred; + bool lower_positive = ovl_lower_positive(dentry); + LIST_HEAD(list); + + /* No need to clean pure upper removed by vfs_rmdir() */ + if (is_dir && (lower_positive || !ovl_pure_upper(dentry))) { + err = ovl_check_empty_dir(dentry, &list); + if (err) + goto out; + } err = ovl_want_write(dentry); if (err) @@ -765,10 +756,10 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out_drop_write; old_cred = ovl_override_creds(dentry->d_sb); - if (!ovl_lower_positive(dentry)) - err = ovl_remove_upper(dentry, is_dir); + if (!lower_positive) + err = ovl_remove_upper(dentry, is_dir, &list); else - err = ovl_remove_and_whiteout(dentry, is_dir); + err = ovl_remove_and_whiteout(dentry, &list); revert_creds(old_cred); if (!err) { if (is_dir) @@ -780,6 +771,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) out_drop_write: ovl_drop_write(dentry); out: + ovl_cache_free(&list); return err; } @@ -915,6 +907,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, bool samedir = olddir == newdir; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; + LIST_HEAD(list); err = -EINVAL; if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) @@ -929,6 +922,27 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (!overwrite && !ovl_can_move(new)) goto out; + if (overwrite && new_is_dir && !ovl_pure_upper(new)) { + err = ovl_check_empty_dir(new, &list); + if (err) + goto out; + } + + if (overwrite) { + if (ovl_lower_positive(old)) { + if (!ovl_dentry_is_whiteout(new)) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && ovl_dentry_is_whiteout(new)) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + err = ovl_want_write(old); if (err) goto out; @@ -952,9 +966,8 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, old_cred = ovl_override_creds(old->d_sb); - if (overwrite && new_is_dir && (ovl_type_merge_or_lower(new) || - ovl_may_have_whiteouts(new))) { - opaquedir = ovl_check_empty_and_clear(new); + if (!list_empty(&list)) { + opaquedir = ovl_clear_empty(new, &list); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { opaquedir = NULL; @@ -962,21 +975,6 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, } } - if (overwrite) { - if (ovl_lower_positive(old)) { - if (!ovl_dentry_is_whiteout(new)) { - /* Whiteout source */ - flags |= RENAME_WHITEOUT; - } else { - /* Switch whiteouts */ - flags |= RENAME_EXCHANGE; - } - } else if (is_dir && ovl_dentry_is_whiteout(new)) { - flags |= RENAME_EXCHANGE; - cleanup_whiteout = true; - } - } - old_upperdir = ovl_dentry_upper(old->d_parent); new_upperdir = ovl_dentry_upper(new->d_parent); @@ -1094,6 +1092,7 @@ out_drop_write: ovl_drop_write(old); out: dput(opaquedir); + ovl_cache_free(&list); return err; } diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c new file mode 100644 index 000000000000..bb94ce9da5c8 --- /dev/null +++ b/fs/overlayfs/export.c @@ -0,0 +1,715 @@ +/* + * Overlayfs NFS export support. + * + * Amir Goldstein <amir73il@gmail.com> + * + * Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/cred.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" + +/* + * We only need to encode origin if there is a chance that the same object was + * encoded pre copy up and then we need to stay consistent with the same + * encoding also after copy up. If non-pure upper is not indexed, then it was + * copied up before NFS export was enabled. In that case we don't need to worry + * about staying consistent with pre copy up encoding and we encode an upper + * file handle. Overlay root dentry is a private case of non-indexed upper. + * + * The following table summarizes the different file handle encodings used for + * different overlay object types: + * + * Object type | Encoding + * -------------------------------- + * Pure upper | U + * Non-indexed upper | U + * Indexed upper | L (*) + * Non-upper | L (*) + * + * U = upper file handle + * L = lower file handle + * + * (*) Connecting an overlay dir from real lower dentry is not always + * possible when there are redirects in lower layers. To mitigate this case, + * we copy up the lower dir first and then encode an upper dir file handle. + */ +static bool ovl_should_encode_origin(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (!ovl_dentry_lower(dentry)) + return false; + + /* + * Decoding a merge dir, whose origin's parent is under a redirected + * lower dir is not always possible. As a simple aproximation, we do + * not encode lower dir file handles when overlay has multiple lower + * layers and origin is below the topmost lower layer. + * + * TODO: copy up only the parent that is under redirected lower. + */ + if (d_is_dir(dentry) && ofs->upper_mnt && + OVL_E(dentry)->lowerstack[0].layer->idx > 1) + return false; + + /* Decoding a non-indexed upper from origin is not implemented */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return false; + + return true; +} + +static int ovl_encode_maybe_copy_up(struct dentry *dentry) +{ + int err; + + if (ovl_dentry_upper(dentry)) + return 0; + + err = ovl_want_write(dentry); + if (err) + return err; + + err = ovl_copy_up(dentry); + + ovl_drop_write(dentry); + return err; +} + +static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) +{ + struct dentry *origin = ovl_dentry_lower(dentry); + struct ovl_fh *fh = NULL; + int err; + + /* + * If we should not encode a lower dir file handle, copy up and encode + * an upper dir file handle. + */ + if (!ovl_should_encode_origin(dentry)) { + err = ovl_encode_maybe_copy_up(dentry); + if (err) + goto fail; + + origin = NULL; + } + + /* Encode an upper or origin file handle */ + fh = ovl_encode_fh(origin ?: ovl_dentry_upper(dentry), !origin); + err = PTR_ERR(fh); + if (IS_ERR(fh)) + goto fail; + + err = -EOVERFLOW; + if (fh->len > buflen) + goto fail; + + memcpy(buf, (char *)fh, fh->len); + err = fh->len; + +out: + kfree(fh); + return err; + +fail: + pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", + dentry, err, buflen, fh ? (int)fh->len : 0, + fh ? fh->type : 0); + goto out; +} + +static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) +{ + int res, len = *max_len << 2; + + res = ovl_d_to_fh(dentry, (char *)fid, len); + if (res <= 0) + return FILEID_INVALID; + + len = res; + + /* Round up to dwords */ + *max_len = (len + 3) >> 2; + return OVL_FILEID; +} + +static int ovl_encode_inode_fh(struct inode *inode, u32 *fid, int *max_len, + struct inode *parent) +{ + struct dentry *dentry; + int type; + + /* TODO: encode connectable file handles */ + if (parent) + return FILEID_INVALID; + + dentry = d_find_any_alias(inode); + if (WARN_ON(!dentry)) + return FILEID_INVALID; + + type = ovl_dentry_to_fh(dentry, fid, max_len); + + dput(dentry); + return type; +} + +/* + * Find or instantiate an overlay dentry from real dentries and index. + */ +static struct dentry *ovl_obtain_alias(struct super_block *sb, + struct dentry *upper_alias, + struct ovl_path *lowerpath, + struct dentry *index) +{ + struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; + struct dentry *upper = upper_alias ?: index; + struct dentry *dentry; + struct inode *inode; + struct ovl_entry *oe; + + /* We get overlay directory dentries with ovl_lookup_real() */ + if (d_is_dir(upper ?: lower)) + return ERR_PTR(-EIO); + + inode = ovl_get_inode(sb, dget(upper), lower, index, !!lower); + if (IS_ERR(inode)) { + dput(upper); + return ERR_CAST(inode); + } + + if (index) + ovl_set_flag(OVL_INDEX, inode); + + dentry = d_find_any_alias(inode); + if (!dentry) { + dentry = d_alloc_anon(inode->i_sb); + if (!dentry) + goto nomem; + oe = ovl_alloc_entry(lower ? 1 : 0); + if (!oe) + goto nomem; + + if (lower) { + oe->lowerstack->dentry = dget(lower); + oe->lowerstack->layer = lowerpath->layer; + } + dentry->d_fsdata = oe; + if (upper_alias) + ovl_dentry_set_upper_alias(dentry); + } + + return d_instantiate_anon(dentry, inode); + +nomem: + iput(inode); + dput(dentry); + return ERR_PTR(-ENOMEM); +} + +/* Get the upper or lower dentry in stach whose on layer @idx */ +static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) +{ + struct ovl_entry *oe = dentry->d_fsdata; + int i; + + if (!idx) + return ovl_dentry_upper(dentry); + + for (i = 0; i < oe->numlower; i++) { + if (oe->lowerstack[i].layer->idx == idx) + return oe->lowerstack[i].dentry; + } + + return NULL; +} + +/* + * Lookup a child overlay dentry to get a connected overlay dentry whose real + * dentry is @real. If @real is on upper layer, we lookup a child overlay + * dentry with the same name as the real dentry. Otherwise, we need to consult + * index for lookup. + */ +static struct dentry *ovl_lookup_real_one(struct dentry *connected, + struct dentry *real, + struct ovl_layer *layer) +{ + struct inode *dir = d_inode(connected); + struct dentry *this, *parent = NULL; + struct name_snapshot name; + int err; + + /* + * Lookup child overlay dentry by real name. The dir mutex protects us + * from racing with overlay rename. If the overlay dentry that is above + * real has already been moved to a parent that is not under the + * connected overlay dir, we return -ECHILD and restart the lookup of + * connected real path from the top. + */ + inode_lock_nested(dir, I_MUTEX_PARENT); + err = -ECHILD; + parent = dget_parent(real); + if (ovl_dentry_real_at(connected, layer->idx) != parent) + goto fail; + + /* + * We also need to take a snapshot of real dentry name to protect us + * from racing with underlying layer rename. In this case, we don't + * care about returning ESTALE, only from dereferencing a free name + * pointer because we hold no lock on the real dentry. + */ + take_dentry_name_snapshot(&name, real); + this = lookup_one_len(name.name, connected, strlen(name.name)); + err = PTR_ERR(this); + if (IS_ERR(this)) { + goto fail; + } else if (!this || !this->d_inode) { + dput(this); + err = -ENOENT; + goto fail; + } else if (ovl_dentry_real_at(this, layer->idx) != real) { + dput(this); + err = -ESTALE; + goto fail; + } + +out: + release_dentry_name_snapshot(&name); + dput(parent); + inode_unlock(dir); + return this; + +fail: + pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + real, layer->idx, connected, err); + this = ERR_PTR(err); + goto out; +} + +static struct dentry *ovl_lookup_real(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer); + +/* + * Lookup an indexed or hashed overlay dentry by real inode. + */ +static struct dentry *ovl_lookup_real_inode(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; + struct dentry *index = NULL; + struct dentry *this = NULL; + struct inode *inode; + + /* + * Decoding upper dir from index is expensive, so first try to lookup + * overlay dentry in inode/dcache. + */ + inode = ovl_lookup_inode(sb, real, !layer->idx); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (inode) { + this = d_find_any_alias(inode); + iput(inode); + } + + /* + * For decoded lower dir file handle, lookup index by origin to check + * if lower dir was copied up and and/or removed. + */ + if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) { + index = ovl_lookup_index(ofs, NULL, real, false); + if (IS_ERR(index)) + return index; + } + + /* Get connected upper overlay dir from index */ + if (index) { + struct dentry *upper = ovl_index_upper(ofs, index); + + dput(index); + if (IS_ERR_OR_NULL(upper)) + return upper; + + /* + * ovl_lookup_real() in lower layer may call recursively once to + * ovl_lookup_real() in upper layer. The first level call walks + * back lower parents to the topmost indexed parent. The second + * recursive call walks back from indexed upper to the topmost + * connected/hashed upper parent (or up to root). + */ + this = ovl_lookup_real(sb, upper, &upper_layer); + dput(upper); + } + + if (!this) + return NULL; + + if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) { + dput(this); + this = ERR_PTR(-EIO); + } + + return this; +} + +/* + * Lookup an indexed or hashed overlay dentry, whose real dentry is an + * ancestor of @real. + */ +static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer) +{ + struct dentry *next, *parent = NULL; + struct dentry *ancestor = ERR_PTR(-EIO); + + if (real == layer->mnt->mnt_root) + return dget(sb->s_root); + + /* Find the topmost indexed or hashed ancestor */ + next = dget(real); + for (;;) { + parent = dget_parent(next); + + /* + * Lookup a matching overlay dentry in inode/dentry + * cache or in index by real inode. + */ + ancestor = ovl_lookup_real_inode(sb, next, layer); + if (ancestor) + break; + + if (parent == layer->mnt->mnt_root) { + ancestor = dget(sb->s_root); + break; + } + + /* + * If @real has been moved out of the layer root directory, + * we will eventully hit the real fs root. This cannot happen + * by legit overlay rename, so we return error in that case. + */ + if (parent == next) { + ancestor = ERR_PTR(-EXDEV); + break; + } + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + return ancestor; +} + +/* + * Lookup a connected overlay dentry whose real dentry is @real. + * If @real is on upper layer, we lookup a child overlay dentry with the same + * path the real dentry. Otherwise, we need to consult index for lookup. + */ +static struct dentry *ovl_lookup_real(struct super_block *sb, + struct dentry *real, + struct ovl_layer *layer) +{ + struct dentry *connected; + int err = 0; + + connected = ovl_lookup_real_ancestor(sb, real, layer); + if (IS_ERR(connected)) + return connected; + + while (!err) { + struct dentry *next, *this; + struct dentry *parent = NULL; + struct dentry *real_connected = ovl_dentry_real_at(connected, + layer->idx); + + if (real_connected == real) + break; + + /* Find the topmost dentry not yet connected */ + next = dget(real); + for (;;) { + parent = dget_parent(next); + + if (parent == real_connected) + break; + + /* + * If real has been moved out of 'real_connected', + * we will not find 'real_connected' and hit the layer + * root. In that case, we need to restart connecting. + * This game can go on forever in the worst case. We + * may want to consider taking s_vfs_rename_mutex if + * this happens more than once. + */ + if (parent == layer->mnt->mnt_root) { + dput(connected); + connected = dget(sb->s_root); + break; + } + + /* + * If real file has been moved out of the layer root + * directory, we will eventully hit the real fs root. + * This cannot happen by legit overlay rename, so we + * return error in that case. + */ + if (parent == next) { + err = -EXDEV; + break; + } + + dput(next); + next = parent; + } + + if (!err) { + this = ovl_lookup_real_one(connected, next, layer); + if (IS_ERR(this)) + err = PTR_ERR(this); + + /* + * Lookup of child in overlay can fail when racing with + * overlay rename of child away from 'connected' parent. + * In this case, we need to restart the lookup from the + * top, because we cannot trust that 'real_connected' is + * still an ancestor of 'real'. There is a good chance + * that the renamed overlay ancestor is now in cache, so + * ovl_lookup_real_ancestor() will find it and we can + * continue to connect exactly from where lookup failed. + */ + if (err == -ECHILD) { + this = ovl_lookup_real_ancestor(sb, real, + layer); + err = IS_ERR(this) ? PTR_ERR(this) : 0; + } + if (!err) { + dput(connected); + connected = this; + } + } + + dput(parent); + dput(next); + } + + if (err) + goto fail; + + return connected; + +fail: + pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + real, layer->idx, connected, err); + dput(connected); + return ERR_PTR(err); +} + +/* + * Get an overlay dentry from upper/lower real dentries and index. + */ +static struct dentry *ovl_get_dentry(struct super_block *sb, + struct dentry *upper, + struct ovl_path *lowerpath, + struct dentry *index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; + struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer; + struct dentry *real = upper ?: (index ?: lowerpath->dentry); + + /* + * Obtain a disconnected overlay dentry from a non-dir real dentry + * and index. + */ + if (!d_is_dir(real)) + return ovl_obtain_alias(sb, upper, lowerpath, index); + + /* Removed empty directory? */ + if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real)) + return ERR_PTR(-ENOENT); + + /* + * If real dentry is connected and hashed, get a connected overlay + * dentry whose real dentry is @real. + */ + return ovl_lookup_real(sb, real, layer); +} + +static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, + struct ovl_fh *fh) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct dentry *dentry; + struct dentry *upper; + + if (!ofs->upper_mnt) + return ERR_PTR(-EACCES); + + upper = ovl_decode_fh(fh, ofs->upper_mnt); + if (IS_ERR_OR_NULL(upper)) + return upper; + + dentry = ovl_get_dentry(sb, upper, NULL, NULL); + dput(upper); + + return dentry; +} + +static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, + struct ovl_fh *fh) +{ + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; + struct dentry *dentry = NULL; + struct dentry *index = NULL; + struct inode *inode = NULL; + bool is_deleted = false; + int err; + + /* First lookup indexed upper by fh */ + if (ofs->indexdir) { + index = ovl_get_index_fh(ofs, fh); + err = PTR_ERR(index); + if (IS_ERR(index)) { + if (err != -ESTALE) + return ERR_PTR(err); + + /* Found a whiteout index - treat as deleted inode */ + is_deleted = true; + index = NULL; + } + } + + /* Then try to get upper dir by index */ + if (index && d_is_dir(index)) { + struct dentry *upper = ovl_index_upper(ofs, index); + + err = PTR_ERR(upper); + if (IS_ERR_OR_NULL(upper)) + goto out_err; + + dentry = ovl_get_dentry(sb, upper, NULL, NULL); + dput(upper); + goto out; + } + + /* Then lookup origin by fh */ + err = ovl_check_origin_fh(ofs, fh, NULL, &stack); + if (err) { + goto out_err; + } else if (index) { + err = ovl_verify_origin(index, origin.dentry, false); + if (err) + goto out_err; + } else if (is_deleted) { + /* Lookup deleted non-dir by origin inode */ + if (!d_is_dir(origin.dentry)) + inode = ovl_lookup_inode(sb, origin.dentry, false); + err = -ESTALE; + if (!inode || atomic_read(&inode->i_count) == 1) + goto out_err; + + /* Deleted but still open? */ + index = dget(ovl_i_dentry_upper(inode)); + } + + dentry = ovl_get_dentry(sb, NULL, &origin, index); + +out: + dput(origin.dentry); + dput(index); + iput(inode); + return dentry; + +out_err: + dentry = ERR_PTR(err); + goto out; +} + +static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct dentry *dentry = NULL; + struct ovl_fh *fh = (struct ovl_fh *) fid; + int len = fh_len << 2; + unsigned int flags = 0; + int err; + + err = -EINVAL; + if (fh_type != OVL_FILEID) + goto out_err; + + err = ovl_check_fh_len(fh, len); + if (err) + goto out_err; + + flags = fh->flags; + dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? + ovl_upper_fh_to_d(sb, fh) : + ovl_lower_fh_to_d(sb, fh); + err = PTR_ERR(dentry); + if (IS_ERR(dentry) && err != -ESTALE) + goto out_err; + + return dentry; + +out_err: + pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", + len, fh_type, flags, err); + return ERR_PTR(err); +} + +static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); + return ERR_PTR(-EACCES); +} + +static int ovl_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + /* + * ovl_fh_to_dentry() returns connected dir overlay dentries and + * ovl_fh_to_parent() is not implemented, so we should not get here. + */ + WARN_ON_ONCE(1); + return -EIO; +} + +static struct dentry *ovl_get_parent(struct dentry *dentry) +{ + /* + * ovl_fh_to_dentry() returns connected dir overlay dentries, so we + * should not get here. + */ + WARN_ON_ONCE(1); + return ERR_PTR(-EIO); +} + +const struct export_operations ovl_export_operations = { + .encode_fh = ovl_encode_inode_fh, + .fh_to_dentry = ovl_fh_to_dentry, + .fh_to_parent = ovl_fh_to_parent, + .get_name = ovl_get_name, + .get_parent = ovl_get_parent, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 00b6b294272a..fcd97b783fa1 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -105,12 +105,20 @@ int ovl_getattr(const struct path *path, struct kstat *stat, * Lower hardlinks may be broken on copy up to different * upper files, so we cannot use the lower origin st_ino * for those different files, even for the same fs case. + * + * Similarly, several redirected dirs can point to the + * same dir on a lower layer. With the "verify_lower" + * feature, we do not use the lower origin st_ino, if + * we haven't verified that this redirect is unique. + * * With inodes index enabled, it is safe to use st_ino - * of an indexed hardlinked origin. The index validates - * that the upper hardlink is not broken. + * of an indexed origin. The index validates that the + * upper hardlink is not broken and that a redirected + * dir is the only redirect to that origin. */ - if (is_dir || lowerstat.nlink == 1 || - ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || + (!ovl_verify_lower(dentry->d_sb) && + (is_dir || lowerstat.nlink == 1))) stat->ino = lowerstat.ino; if (samefs) @@ -343,8 +351,10 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type) static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) { + /* Copy up of disconnected dentry does not set upper alias */ if (ovl_dentry_upper(dentry) && - ovl_dentry_has_upper_alias(dentry)) + (ovl_dentry_has_upper_alias(dentry) || + (dentry->d_flags & DCACHE_DISCONNECTED))) return false; if (special_file(d_inode(dentry)->i_mode)) @@ -604,9 +614,25 @@ static int ovl_inode_set(struct inode *inode, void *data) } static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, - struct dentry *upperdentry) + struct dentry *upperdentry, bool strict) { /* + * For directories, @strict verify from lookup path performs consistency + * checks, so NULL lower/upper in dentry must match NULL lower/upper in + * inode. Non @strict verify from NFS handle decode path passes NULL for + * 'unknown' lower/upper. + */ + if (S_ISDIR(inode->i_mode) && strict) { + /* Real lower dir moved to upper layer under us? */ + if (!lowerdentry && ovl_inode_lower(inode)) + return false; + + /* Lookup of an uncovered redirect origin? */ + if (!upperdentry && ovl_inode_upper(inode)) + return false; + } + + /* * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. * This happens when finding a copied up overlay inode for a renamed * or hardlinked overlay dentry and lower dentry cannot be followed @@ -625,14 +651,35 @@ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, return true; } -struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, - struct dentry *index) +struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, + bool is_upper) { - struct dentry *lowerdentry = ovl_dentry_lower(dentry); + struct inode *inode, *key = d_inode(real); + + inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); + if (!inode) + return NULL; + + if (!ovl_verify_inode(inode, is_upper ? NULL : real, + is_upper ? real : NULL, false)) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, + struct dentry *lowerdentry, struct dentry *index, + unsigned int numlower) +{ + struct ovl_fs *ofs = sb->s_fs_info; struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; /* Already indexed or could be indexed on copy up? */ - bool indexed = (index || (ovl_indexdir(dentry->d_sb) && !upperdentry)); + bool indexed = (index || (ovl_indexdir(sb) && !upperdentry)); + struct dentry *origin = indexed ? lowerdentry : NULL; + bool is_dir; if (WARN_ON(upperdentry && indexed && !lowerdentry)) return ERR_PTR(-EIO); @@ -641,17 +688,22 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, realinode = d_inode(lowerdentry); /* - * Copy up origin (lower) may exist for non-indexed upper, but we must - * not use lower as hash key in that case. - * Hash inodes that are or could be indexed by origin inode and - * non-indexed upper inodes that could be hard linked by upper inode. + * Copy up origin (lower) may exist for non-indexed non-dir upper, but + * we must not use lower as hash key in that case. + * Hash non-dir that is or could be indexed by origin inode. + * Hash dir that is or could be merged by origin inode. + * Hash pure upper and non-indexed non-dir by upper inode. + * Hash non-indexed dir by upper inode for NFS export. */ - if (!S_ISDIR(realinode->i_mode) && (upperdentry || indexed)) { - struct inode *key = d_inode(indexed ? lowerdentry : - upperdentry); - unsigned int nlink; + is_dir = S_ISDIR(realinode->i_mode); + if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt)) + origin = lowerdentry; + + if (upperdentry || origin) { + struct inode *key = d_inode(origin ?: upperdentry); + unsigned int nlink = is_dir ? 1 : realinode->i_nlink; - inode = iget5_locked(dentry->d_sb, (unsigned long) key, + inode = iget5_locked(sb, (unsigned long) key, ovl_inode_test, ovl_inode_set, key); if (!inode) goto out_nomem; @@ -660,7 +712,8 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, * Verify that the underlying files stored in the inode * match those in the dentry. */ - if (!ovl_verify_inode(inode, lowerdentry, upperdentry)) { + if (!ovl_verify_inode(inode, lowerdentry, upperdentry, + true)) { iput(inode); inode = ERR_PTR(-ESTALE); goto out; @@ -670,11 +723,12 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, goto out; } - nlink = ovl_get_nlink(lowerdentry, upperdentry, - realinode->i_nlink); + /* Recalculate nlink for non-dir due to indexing */ + if (!is_dir) + nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); } else { - inode = new_inode(dentry->d_sb); + inode = new_inode(sb); if (!inode) goto out_nomem; } @@ -685,10 +739,8 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, ovl_set_flag(OVL_IMPURE, inode); /* Check for non-merge dir that may have whiteouts */ - if (S_ISDIR(realinode->i_mode)) { - struct ovl_entry *oe = dentry->d_fsdata; - - if (((upperdentry && lowerdentry) || oe->numlower > 1) || + if (is_dir) { + if (((upperdentry && lowerdentry) || numlower > 1) || ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { ovl_set_flag(OVL_WHITEOUTS, inode); } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index beb945e1963c..de3e6da1d5a5 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/cred.h> +#include <linux/ctype.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/ratelimit.h> @@ -84,15 +85,54 @@ invalid: static int ovl_acceptable(void *ctx, struct dentry *dentry) { - return 1; + /* + * A non-dir origin may be disconnected, which is fine, because + * we only need it for its unique inode number. + */ + if (!d_is_dir(dentry)) + return 1; + + /* Don't decode a deleted empty directory */ + if (d_unhashed(dentry)) + return 0; + + /* Check if directory belongs to the layer we are decoding from */ + return is_subdir(dentry, ((struct vfsmount *)ctx)->mnt_root); } -static struct ovl_fh *ovl_get_origin_fh(struct dentry *dentry) +/* + * Check validity of an overlay file handle buffer. + * + * Return 0 for a valid file handle. + * Return -ENODATA for "origin unknown". + * Return <0 for an invalid file handle. + */ +int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) { - int res; + if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len) + return -EINVAL; + + if (fh->magic != OVL_FH_MAGIC) + return -EINVAL; + + /* Treat larger version and unknown flags as "origin unknown" */ + if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + return -ENODATA; + + /* Treat endianness mismatch as "origin unknown" */ + if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + return -ENODATA; + + return 0; +} + +static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) +{ + int res, err; struct ovl_fh *fh = NULL; - res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + res = vfs_getxattr(dentry, name, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; @@ -102,28 +142,20 @@ static struct ovl_fh *ovl_get_origin_fh(struct dentry *dentry) if (res == 0) return NULL; - fh = kzalloc(res, GFP_KERNEL); + fh = kzalloc(res, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); - res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, fh, res); + res = vfs_getxattr(dentry, name, fh, res); if (res < 0) goto fail; - if (res < sizeof(struct ovl_fh) || res < fh->len) - goto invalid; - - if (fh->magic != OVL_FH_MAGIC) + err = ovl_check_fh_len(fh, res); + if (err < 0) { + if (err == -ENODATA) + goto out; goto invalid; - - /* Treat larger version and unknown flags as "origin unknown" */ - if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) - goto out; - - /* Treat endianness mismatch as "origin unknown" */ - if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && - (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) - goto out; + } return fh; @@ -139,47 +171,41 @@ invalid: goto out; } -static struct dentry *ovl_get_origin(struct dentry *dentry, - struct vfsmount *mnt) +struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt) { - struct dentry *origin = NULL; - struct ovl_fh *fh = ovl_get_origin_fh(dentry); + struct dentry *real; int bytes; - if (IS_ERR_OR_NULL(fh)) - return (struct dentry *)fh; - /* * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. */ if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) - goto out; + return NULL; bytes = (fh->len - offsetof(struct ovl_fh, fid)); - origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid, - bytes >> 2, (int)fh->type, - ovl_acceptable, NULL); - if (IS_ERR(origin)) { - /* Treat stale file handle as "origin unknown" */ - if (origin == ERR_PTR(-ESTALE)) - origin = NULL; - goto out; + real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, + bytes >> 2, (int)fh->type, + ovl_acceptable, mnt); + if (IS_ERR(real)) { + /* + * Treat stale file handle to lower file as "origin unknown". + * upper file handle could become stale when upper file is + * unlinked and this information is needed to handle stale + * index entries correctly. + */ + if (real == ERR_PTR(-ESTALE) && + !(fh->flags & OVL_FH_FLAG_PATH_UPPER)) + real = NULL; + return real; } - if (ovl_dentry_weird(origin) || - ((d_inode(origin)->i_mode ^ d_inode(dentry)->i_mode) & S_IFMT)) - goto invalid; - -out: - kfree(fh); - return origin; + if (ovl_dentry_weird(real)) { + dput(real); + return NULL; + } -invalid: - pr_warn_ratelimited("overlayfs: invalid origin (%pd2)\n", origin); - dput(origin); - origin = NULL; - goto out; + return real; } static bool ovl_is_opaquedir(struct dentry *dentry) @@ -284,47 +310,81 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, } -static int ovl_check_origin(struct dentry *upperdentry, - struct ovl_path *lower, unsigned int numlower, - struct ovl_path **stackp, unsigned int *ctrp) +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, + struct dentry *upperdentry, struct ovl_path **stackp) { - struct vfsmount *mnt; struct dentry *origin = NULL; int i; - for (i = 0; i < numlower; i++) { - mnt = lower[i].layer->mnt; - origin = ovl_get_origin(upperdentry, mnt); - if (IS_ERR(origin)) - return PTR_ERR(origin); - + for (i = 0; i < ofs->numlower; i++) { + origin = ovl_decode_fh(fh, ofs->lower_layers[i].mnt); if (origin) break; } if (!origin) - return 0; + return -ESTALE; + else if (IS_ERR(origin)) + return PTR_ERR(origin); + + if (upperdentry && !ovl_is_whiteout(upperdentry) && + ((d_inode(origin)->i_mode ^ d_inode(upperdentry)->i_mode) & S_IFMT)) + goto invalid; - BUG_ON(*ctrp); if (!*stackp) *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; } - **stackp = (struct ovl_path){.dentry = origin, .layer = lower[i].layer}; - *ctrp = 1; + **stackp = (struct ovl_path){ + .dentry = origin, + .layer = &ofs->lower_layers[i] + }; + + return 0; + +invalid: + pr_warn_ratelimited("overlayfs: invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", + upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, + d_inode(origin)->i_mode & S_IFMT); + dput(origin); + return -EIO; +} + +static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, + struct ovl_path **stackp, unsigned int *ctrp) +{ + struct ovl_fh *fh = ovl_get_fh(upperdentry, OVL_XATTR_ORIGIN); + int err; + + if (IS_ERR_OR_NULL(fh)) + return PTR_ERR(fh); + + err = ovl_check_origin_fh(ofs, fh, upperdentry, stackp); + kfree(fh); + + if (err) { + if (err == -ESTALE) + return 0; + return err; + } + + if (WARN_ON(*ctrp)) + return -EIO; + *ctrp = 1; return 0; } /* - * Verify that @fh matches the origin file handle stored in OVL_XATTR_ORIGIN. + * Verify that @fh matches the file handle stored in xattr @name. * Return 0 on match, -ESTALE on mismatch, < 0 on error. */ -static int ovl_verify_origin_fh(struct dentry *dentry, const struct ovl_fh *fh) +static int ovl_verify_fh(struct dentry *dentry, const char *name, + const struct ovl_fh *fh) { - struct ovl_fh *ofh = ovl_get_origin_fh(dentry); + struct ovl_fh *ofh = ovl_get_fh(dentry, name); int err = 0; if (!ofh) @@ -341,28 +401,28 @@ static int ovl_verify_origin_fh(struct dentry *dentry, const struct ovl_fh *fh) } /* - * Verify that an inode matches the origin file handle stored in upper inode. + * Verify that @real dentry matches the file handle stored in xattr @name. * - * If @set is true and there is no stored file handle, encode and store origin - * file handle in OVL_XATTR_ORIGIN. + * If @set is true and there is no stored file handle, encode @real and store + * file handle in xattr @name. * - * Return 0 on match, -ESTALE on mismatch, < 0 on error. + * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ -int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, - bool is_upper, bool set) +int ovl_verify_set_fh(struct dentry *dentry, const char *name, + struct dentry *real, bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; int err; - fh = ovl_encode_fh(origin, is_upper); + fh = ovl_encode_fh(real, is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; - err = ovl_verify_origin_fh(dentry, fh); + err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(dentry, OVL_XATTR_ORIGIN, fh, fh->len, 0); + err = ovl_do_setxattr(dentry, name, fh, fh->len, 0); if (err) goto fail; @@ -371,45 +431,71 @@ out: return err; fail: - inode = d_inode(origin); - pr_warn_ratelimited("overlayfs: failed to verify origin (%pd2, ino=%lu, err=%i)\n", - origin, inode ? inode->i_ino : 0, err); + inode = d_inode(real); + pr_warn_ratelimited("overlayfs: failed to verify %s (%pd2, ino=%lu, err=%i)\n", + is_upper ? "upper" : "origin", real, + inode ? inode->i_ino : 0, err); goto out; } +/* Get upper dentry from index */ +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) +{ + struct ovl_fh *fh; + struct dentry *upper; + + if (!d_is_dir(index)) + return dget(index); + + fh = ovl_get_fh(index, OVL_XATTR_UPPER); + if (IS_ERR_OR_NULL(fh)) + return ERR_CAST(fh); + + upper = ovl_decode_fh(fh, ofs->upper_mnt); + kfree(fh); + + if (IS_ERR_OR_NULL(upper)) + return upper ?: ERR_PTR(-ESTALE); + + if (!d_is_dir(upper)) { + pr_warn_ratelimited("overlayfs: invalid index upper (%pd2, upper=%pd2).\n", + index, upper); + dput(upper); + return ERR_PTR(-EIO); + } + + return upper; +} + +/* Is this a leftover from create/whiteout of directory index entry? */ +static bool ovl_is_temp_index(struct dentry *index) +{ + return index->d_name.name[0] == '#'; +} + /* * Verify that an index entry name matches the origin file handle stored in * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. */ -int ovl_verify_index(struct dentry *index, struct ovl_path *lower, - unsigned int numlower) +int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) { struct ovl_fh *fh = NULL; size_t len; struct ovl_path origin = { }; struct ovl_path *stack = &origin; - unsigned int ctr = 0; + struct dentry *upper = NULL; int err; if (!d_inode(index)) return 0; - /* - * Directory index entries are going to be used for looking up - * redirected upper dirs by lower dir fh when decoding an overlay - * file handle of a merge dir. Whiteout index entries are going to be - * used as an indication that an exported overlay file handle should - * be treated as stale (i.e. after unlink of the overlay inode). - * We don't know the verification rules for directory and whiteout - * index entries, because they have not been implemented yet, so return - * EINVAL if those entries are found to abort the mount to avoid - * corrupting an index that was created by a newer kernel. - */ - err = -EINVAL; - if (d_is_dir(index) || ovl_is_whiteout(index)) + /* Cleanup leftover from index create/cleanup attempt */ + err = -ESTALE; + if (ovl_is_temp_index(index)) goto fail; + err = -EINVAL; if (index->d_name.len < sizeof(struct ovl_fh)*2) goto fail; @@ -420,26 +506,68 @@ int ovl_verify_index(struct dentry *index, struct ovl_path *lower, goto fail; err = -EINVAL; - if (hex2bin((u8 *)fh, index->d_name.name, len) || len != fh->len) + if (hex2bin((u8 *)fh, index->d_name.name, len)) goto fail; - err = ovl_verify_origin_fh(index, fh); + err = ovl_check_fh_len(fh, len); if (err) goto fail; - err = ovl_check_origin(index, lower, numlower, &stack, &ctr); - if (!err && !ctr) - err = -ESTALE; + /* + * Whiteout index entries are used as an indication that an exported + * overlay file handle should be treated as stale (i.e. after unlink + * of the overlay inode). These entries contain no origin xattr. + */ + if (ovl_is_whiteout(index)) + goto out; + + /* + * Verifying directory index entries are not stale is expensive, so + * only verify stale dir index if NFS export is enabled. + */ + if (d_is_dir(index) && !ofs->config.nfs_export) + goto out; + + /* + * Directory index entries should have 'upper' xattr pointing to the + * real upper dir. Non-dir index entries are hardlinks to the upper + * real inode. For non-dir index, we can read the copy up origin xattr + * directly from the index dentry, but for dir index we first need to + * decode the upper directory. + */ + upper = ovl_index_upper(ofs, index); + if (IS_ERR_OR_NULL(upper)) { + err = PTR_ERR(upper); + /* + * Directory index entries with no 'upper' xattr need to be + * removed. When dir index entry has a stale 'upper' xattr, + * we assume that upper dir was removed and we treat the dir + * index as orphan entry that needs to be whited out. + */ + if (err == -ESTALE) + goto orphan; + else if (!err) + err = -ESTALE; + goto fail; + } + + err = ovl_verify_fh(upper, OVL_XATTR_ORIGIN, fh); + dput(upper); if (err) goto fail; - /* Check if index is orphan and don't warn before cleaning it */ - if (d_inode(index)->i_nlink == 1 && - ovl_get_nlink(origin.dentry, index, 0) == 0) - err = -ENOENT; + /* Check if non-dir index is orphan and don't warn before cleaning it */ + if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { + err = ovl_check_origin_fh(ofs, fh, index, &stack); + if (err) + goto fail; + + if (ovl_get_nlink(origin.dentry, index, 0) == 0) + goto orphan; + } - dput(origin.dentry); out: + dput(origin.dentry); kfree(fh); return err; @@ -447,6 +575,28 @@ fail: pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n", index, d_inode(index)->i_mode & S_IFMT, err); goto out; + +orphan: + pr_warn_ratelimited("overlayfs: orphan index entry (%pd2, ftype=%x, nlink=%u)\n", + index, d_inode(index)->i_mode & S_IFMT, + d_inode(index)->i_nlink); + err = -ENOENT; + goto out; +} + +static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) +{ + char *n, *s; + + n = kzalloc(fh->len * 2, GFP_KERNEL); + if (!n) + return -ENOMEM; + + s = bin2hex(n, fh, fh->len); + *name = (struct qstr) QSTR_INIT(n, s - n); + + return 0; + } /* @@ -466,35 +616,58 @@ fail: */ int ovl_get_index_name(struct dentry *origin, struct qstr *name) { - int err; struct ovl_fh *fh; - char *n, *s; + int err; fh = ovl_encode_fh(origin, false); if (IS_ERR(fh)) return PTR_ERR(fh); - err = -ENOMEM; - n = kzalloc(fh->len * 2, GFP_KERNEL); - if (n) { - s = bin2hex(n, fh, fh->len); - *name = (struct qstr) QSTR_INIT(n, s - n); - err = 0; - } - kfree(fh); + err = ovl_get_index_name_fh(fh, name); + kfree(fh); return err; +} + +/* Lookup index by file handle for NFS export */ +struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) +{ + struct dentry *index; + struct qstr name; + int err; + + err = ovl_get_index_name_fh(fh, &name); + if (err) + return ERR_PTR(err); + + index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + kfree(name.name); + if (IS_ERR(index)) { + if (PTR_ERR(index) == -ENOENT) + index = NULL; + return index; + } + if (d_is_negative(index)) + err = 0; + else if (ovl_is_whiteout(index)) + err = -ESTALE; + else if (ovl_dentry_weird(index)) + err = -EIO; + else + return index; + + dput(index); + return ERR_PTR(err); } -static struct dentry *ovl_lookup_index(struct dentry *dentry, - struct dentry *upper, - struct dentry *origin) +struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *origin, bool verify) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct dentry *index; struct inode *inode; struct qstr name; + bool is_dir = d_is_dir(origin); int err; err = ovl_get_index_name(origin, &name); @@ -518,8 +691,16 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, inode = d_inode(index); if (d_is_negative(index)) { goto out_dput; - } else if (upper && d_inode(upper) != inode) { - goto out_dput; + } else if (ovl_is_whiteout(index) && !verify) { + /* + * When index lookup is called with !verify for decoding an + * overlay file handle, a whiteout index implies that decode + * should treat file handle as stale and no need to print a + * warning about it. + */ + dput(index); + index = ERR_PTR(-ESTALE); + goto out; } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) { /* @@ -533,8 +714,25 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, index, d_inode(index)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); goto fail; - } + } else if (is_dir && verify) { + if (!upper) { + pr_warn_ratelimited("overlayfs: suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", + origin, index); + goto fail; + } + /* Verify that dir index 'upper' xattr points to upper dir */ + err = ovl_verify_upper(index, upper, false); + if (err) { + if (err == -ESTALE) { + pr_warn_ratelimited("overlayfs: suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", + upper, origin, index); + } + goto fail; + } + } else if (upper && d_inode(upper) != inode) { + goto out_dput; + } out: kfree(name.name); return index; @@ -572,16 +770,25 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) return (idx < oe->numlower) ? idx + 1 : -1; } -static int ovl_find_layer(struct ovl_fs *ofs, struct ovl_path *path) +/* Fix missing 'origin' xattr */ +static int ovl_fix_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper) { - int i; + int err; - for (i = 0; i < ofs->numlower; i++) { - if (ofs->lower_layers[i].mnt == path->layer->mnt) - break; - } + if (ovl_check_origin_xattr(upper)) + return 0; + + err = ovl_want_write(dentry); + if (err) + return err; + + err = ovl_set_origin(dentry, lower, upper); + if (!err) + err = ovl_set_impure(dentry->d_parent, upper->d_parent); - return i; + ovl_drop_write(dentry); + return err; } struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, @@ -594,6 +801,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; struct ovl_path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; + struct dentry *origin = NULL; struct dentry *index = NULL; unsigned int ctr = 0; struct inode *inode = NULL; @@ -638,8 +846,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. */ - err = ovl_check_origin(upperdentry, roe->lowerstack, - roe->numlower, &stack, &ctr); + err = ovl_check_origin(ofs, upperdentry, &stack, &ctr); if (err) goto out_put_upper; } @@ -674,6 +881,34 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!this) continue; + /* + * If no origin fh is stored in upper of a merge dir, store fh + * of lower dir and set upper parent "impure". + */ + if (upperdentry && !ctr && !ofs->noxattr) { + err = ovl_fix_origin(dentry, this, upperdentry); + if (err) { + dput(this); + goto out_put; + } + } + + /* + * When "verify_lower" feature is enabled, do not merge with a + * lower dir that does not match a stored origin xattr. In any + * case, only verified origin is used for index lookup. + */ + if (upperdentry && !ctr && ovl_verify_lower(dentry->d_sb)) { + err = ovl_verify_origin(upperdentry, this, false); + if (err) { + dput(this); + break; + } + + /* Bless lower dir as verified origin */ + origin = this; + } + stack[ctr].dentry = this; stack[ctr].layer = lower.layer; ctr++; @@ -693,25 +928,30 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, */ err = -EPERM; if (d.redirect && !ofs->config.redirect_follow) { - pr_warn_ratelimited("overlay: refusing to follow redirect for (%pd2)\n", dentry); + pr_warn_ratelimited("overlayfs: refusing to follow redirect for (%pd2)\n", + dentry); goto out_put; } if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; - /* Find the current layer on the root dentry */ - i = ovl_find_layer(ofs, &lower); - if (WARN_ON(i == ofs->numlower)) - break; + i = lower.layer->idx - 1; } } - /* Lookup index by lower inode and verify it matches upper inode */ - if (ctr && !d.is_dir && ovl_indexdir(dentry->d_sb)) { - struct dentry *origin = stack[0].dentry; + /* + * Lookup index by lower inode and verify it matches upper inode. + * We only trust dir index if we verified that lower dir matches + * origin, otherwise dir index entries may be inconsistent and we + * ignore them. Always lookup index of non-dir and non-upper. + */ + if (ctr && (!upperdentry || !d.is_dir)) + origin = stack[0].dentry; - index = ovl_lookup_index(dentry, upperdentry, origin); + if (origin && ovl_indexdir(dentry->d_sb) && + (!d.is_dir || ovl_index_all(dentry->d_sb))) { + index = ovl_lookup_index(ofs, upperdentry, origin, true); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; @@ -724,17 +964,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!oe) goto out_put; - oe->opaque = upperopaque; memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr); dentry->d_fsdata = oe; + if (upperopaque) + ovl_dentry_set_opaque(dentry); + if (upperdentry) ovl_dentry_set_upper_alias(dentry); else if (index) upperdentry = dget(index); if (upperdentry || ctr) { - inode = ovl_get_inode(dentry, upperdentry, index); + if (ctr) + origin = stack[0].dentry; + inode = ovl_get_inode(dentry->d_sb, upperdentry, origin, index, + ctr); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; @@ -748,9 +993,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, dput(index); kfree(stack); kfree(d.redirect); - d_add(dentry, inode); - - return NULL; + return d_splice_alias(inode, dentry); out_free_oe: dentry->d_fsdata = NULL; @@ -771,9 +1014,9 @@ out: bool ovl_lower_positive(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; struct ovl_entry *poe = dentry->d_parent->d_fsdata; const struct qstr *name = &dentry->d_name; + const struct cred *old_cred; unsigned int i; bool positive = false; bool done = false; @@ -783,12 +1026,13 @@ bool ovl_lower_positive(struct dentry *dentry) * whiteout. */ if (!dentry->d_inode) - return oe->opaque; + return ovl_dentry_is_opaque(dentry); /* Negative upper -> positive lower */ if (!ovl_dentry_upper(dentry)) return true; + old_cred = ovl_override_creds(dentry->d_sb); /* Positive upper -> have to look up lower to see whether it exists */ for (i = 0; !done && !positive && i < poe->numlower; i++) { struct dentry *this; @@ -818,6 +1062,7 @@ bool ovl_lower_positive(struct dentry *dentry) dput(this); } } + revert_creds(old_cred); return positive; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b489099ccd49..0df25a9c94bd 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -27,8 +27,9 @@ enum ovl_path_type { #define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin" #define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure" #define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink" +#define OVL_XATTR_UPPER OVL_XATTR_PREFIX "upper" -enum ovl_flag { +enum ovl_inode_flag { /* Pure upper dir that may contain non pure upper entries */ OVL_IMPURE, /* Non-merge dir that may contain whiteout entries */ @@ -36,6 +37,11 @@ enum ovl_flag { OVL_INDEX, }; +enum ovl_entry_flag { + OVL_E_UPPER_ALIAS, + OVL_E_OPAQUE, +}; + /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: @@ -62,6 +68,9 @@ enum ovl_flag { #error Endianness not defined #endif +/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */ +#define OVL_FILEID 0xfb + /* On-disk and in-memeory format for redirect by file handle */ struct ovl_fh { u8 version; /* 0 */ @@ -194,6 +203,8 @@ const struct cred *ovl_override_creds(struct super_block *sb); struct super_block *ovl_same_sb(struct super_block *sb); bool ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); +bool ovl_index_all(struct super_block *sb); +bool ovl_verify_lower(struct super_block *sb); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); bool ovl_dentry_remote(struct dentry *dentry); bool ovl_dentry_weird(struct dentry *dentry); @@ -210,6 +221,9 @@ struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); +void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry); +void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry); +bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); @@ -238,6 +252,7 @@ void ovl_clear_flag(unsigned long flag, struct inode *inode); bool ovl_test_flag(unsigned long flag, struct inode *inode); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); +bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry, bool *locked); void ovl_nlink_end(struct dentry *dentry, bool locked); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); @@ -249,15 +264,35 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) /* namei.c */ -int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, - bool is_upper, bool set); -int ovl_verify_index(struct dentry *index, struct ovl_path *lower, - unsigned int numlower); +int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); +struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt); +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, + struct dentry *upperdentry, struct ovl_path **stackp); +int ovl_verify_set_fh(struct dentry *dentry, const char *name, + struct dentry *real, bool is_upper, bool set); +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); +int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); int ovl_get_index_name(struct dentry *origin, struct qstr *name); +struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); +struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, + struct dentry *origin, bool verify); int ovl_path_next(int idx, struct dentry *dentry, struct path *path); -struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); +static inline int ovl_verify_origin(struct dentry *upper, + struct dentry *origin, bool set) +{ + return ovl_verify_set_fh(upper, OVL_XATTR_ORIGIN, origin, false, set); +} + +static inline int ovl_verify_upper(struct dentry *index, + struct dentry *upper, bool set) +{ + return ovl_verify_set_fh(index, OVL_XATTR_UPPER, upper, true, set); +} + /* readdir.c */ extern const struct file_operations ovl_dir_operations; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); @@ -267,8 +302,7 @@ void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(struct path *realpath); void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); -int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct ovl_path *lower, unsigned int numlower); +int ovl_indexdir_cleanup(struct ovl_fs *ofs); /* inode.c */ int ovl_set_nlink_upper(struct dentry *dentry); @@ -291,8 +325,11 @@ int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); -struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, - struct dentry *index); +struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, + bool is_upper); +struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, + struct dentry *lowerdentry, struct dentry *index, + unsigned int numlower); static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; @@ -306,6 +343,8 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; struct dentry *ovl_lookup_temp(struct dentry *workdir); +int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, + struct dentry *dentry); struct cattr { dev_t rdev; umode_t mode; @@ -321,4 +360,9 @@ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_fh(struct dentry *lower, bool is_upper); +struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper); +int ovl_set_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper); + +/* export.c */ +extern const struct export_operations ovl_export_operations; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 9d0bc03bf6e4..bfef6edcc111 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -17,11 +17,14 @@ struct ovl_config { bool redirect_follow; const char *redirect_mode; bool index; + bool nfs_export; }; struct ovl_layer { struct vfsmount *mnt; dev_t pseudo_dev; + /* Index of this layer in fs root (upper == 0) */ + int idx; }; struct ovl_path { @@ -58,8 +61,7 @@ struct ovl_fs { struct ovl_entry { union { struct { - unsigned long has_upper; - bool opaque; + unsigned long flags; }; struct rcu_head rcu; }; @@ -69,6 +71,11 @@ struct ovl_entry { struct ovl_entry *ovl_alloc_entry(unsigned int numlower); +static inline struct ovl_entry *OVL_E(struct dentry *dentry) +{ + return (struct ovl_entry *) dentry->d_fsdata; +} + struct ovl_inode { struct ovl_dir_cache *cache; const char *redirect; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 8c98578d27a1..c11f5c0906c3 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -593,8 +593,15 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) return ERR_PTR(res); } if (list_empty(&cache->entries)) { - /* Good oportunity to get rid of an unnecessary "impure" flag */ - ovl_do_removexattr(ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); + /* + * A good opportunity to get rid of an unneeded "impure" flag. + * Removing the "impure" xattr is best effort. + */ + if (!ovl_want_write(dentry)) { + ovl_do_removexattr(ovl_dentry_upper(dentry), + OVL_XATTR_IMPURE); + ovl_drop_write(dentry); + } ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); kfree(cache); return NULL; @@ -769,10 +776,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct dentry *dentry = file->f_path.dentry; struct file *realfile = od->realfile; + /* Nothing to sync for lower */ + if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) + return 0; + /* * Need to check if we started out being a lower dir, but got copied up */ - if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { + if (!od->is_upper) { struct inode *inode = file_inode(file); realfile = READ_ONCE(od->upperfile); @@ -858,8 +869,11 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) int err; struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; + const struct cred *old_cred; + old_cred = ovl_override_creds(dentry->d_sb); err = ovl_dir_read_merged(dentry, list, &root); + revert_creds(old_cred); if (err) return err; @@ -1016,13 +1030,13 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, } } -int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct ovl_path *lower, unsigned int numlower) +int ovl_indexdir_cleanup(struct ovl_fs *ofs) { int err; + struct dentry *indexdir = ofs->indexdir; struct dentry *index = NULL; - struct inode *dir = dentry->d_inode; - struct path path = { .mnt = mnt, .dentry = dentry }; + struct inode *dir = indexdir->d_inode; + struct path path = { .mnt = ofs->upper_mnt, .dentry = indexdir }; LIST_HEAD(list); struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; @@ -1046,19 +1060,40 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, if (p->len == 2 && p->name[1] == '.') continue; } - index = lookup_one_len(p->name, dentry, p->len); + index = lookup_one_len(p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; break; } - err = ovl_verify_index(index, lower, numlower); - /* Cleanup stale and orphan index entries */ - if (err && (err == -ESTALE || err == -ENOENT)) + err = ovl_verify_index(ofs, index); + if (!err) { + goto next; + } else if (err == -ESTALE) { + /* Cleanup stale index entries */ + err = ovl_cleanup(dir, index); + } else if (err != -ENOENT) { + /* + * Abort mount to avoid corrupting the index if + * an incompatible index entry was found or on out + * of memory. + */ + break; + } else if (ofs->config.nfs_export) { + /* + * Whiteout orphan index to block future open by + * handle after overlay nlink dropped to zero. + */ + err = ovl_cleanup_and_whiteout(indexdir, dir, index); + } else { + /* Cleanup orphan index entries */ err = ovl_cleanup(dir, index); + } + if (err) break; +next: dput(index); index = NULL; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 76440feb79f6..9ee37c76091d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -45,6 +45,11 @@ module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(ovl_index_def, "Default to on or off for the inodes index feature"); +static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT); +module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); +MODULE_PARM_DESC(ovl_nfs_export_def, + "Default to on or off for the NFS export feature"); + static void ovl_entry_stack_free(struct ovl_entry *oe) { unsigned int i; @@ -211,6 +216,7 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); + iput(oi->lower); kfree(oi->redirect); ovl_dir_cache_free(inode); mutex_destroy(&oi->lock); @@ -341,6 +347,9 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); if (ofs->config.index != ovl_index_def) seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); + if (ofs->config.nfs_export != ovl_nfs_export_def) + seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? + "on" : "off"); return 0; } @@ -373,6 +382,8 @@ enum { OPT_REDIRECT_DIR, OPT_INDEX_ON, OPT_INDEX_OFF, + OPT_NFS_EXPORT_ON, + OPT_NFS_EXPORT_OFF, OPT_ERR, }; @@ -384,6 +395,8 @@ static const match_table_t ovl_tokens = { {OPT_REDIRECT_DIR, "redirect_dir=%s"}, {OPT_INDEX_ON, "index=on"}, {OPT_INDEX_OFF, "index=off"}, + {OPT_NFS_EXPORT_ON, "nfs_export=on"}, + {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, {OPT_ERR, NULL} }; @@ -490,6 +503,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->index = false; break; + case OPT_NFS_EXPORT_ON: + config->nfs_export = true; + break; + + case OPT_NFS_EXPORT_OFF: + config->nfs_export = false; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -520,10 +541,6 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, bool retried = false; bool locked = false; - err = mnt_want_write(mnt); - if (err) - goto out_err; - inode_lock_nested(dir, I_MUTEX_PARENT); locked = true; @@ -588,7 +605,6 @@ retry: goto out_err; } out_unlock: - mnt_drop_write(mnt); if (locked) inode_unlock(dir); @@ -700,12 +716,16 @@ static int ovl_lower_dir(const char *name, struct path *path, *remote = true; /* - * The inodes index feature needs to encode and decode file - * handles, so it requires that all layers support them. + * The inodes index feature and NFS export need to encode and decode + * file handles, so they require that all layers support them. */ - if (ofs->config.index && !ovl_can_decode_fh(path->dentry->d_sb)) { + if ((ofs->config.nfs_export || + (ofs->config.index && ofs->config.upperdir)) && + !ovl_can_decode_fh(path->dentry->d_sb)) { ofs->config.index = false; - pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off.\n", name); + ofs->config.nfs_export = false; + pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", + name); } return 0; @@ -929,12 +949,17 @@ out: static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) { + struct vfsmount *mnt = ofs->upper_mnt; struct dentry *temp; int err; + err = mnt_want_write(mnt); + if (err) + return err; + ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); if (!ofs->workdir) - return 0; + goto out; /* * Upper should support d_type, else whiteouts are visible. Given @@ -944,7 +969,7 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) */ err = ovl_check_d_type_supported(workpath); if (err < 0) - return err; + goto out; /* * We allowed this configuration and don't want to break users over @@ -967,7 +992,9 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0); if (err) { ofs->noxattr = true; - pr_warn("overlayfs: upper fs does not support xattr.\n"); + ofs->config.index = false; + pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off.\n"); + err = 0; } else { vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); } @@ -979,7 +1006,15 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); } - return 0; + /* NFS export of r/w mount depends on index */ + if (ofs->config.nfs_export && !ofs->config.index) { + pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + +out: + mnt_drop_write(mnt); + return err; } static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) @@ -1026,11 +1061,16 @@ out: static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe, struct path *upperpath) { + struct vfsmount *mnt = ofs->upper_mnt; int err; + err = mnt_want_write(mnt); + if (err) + return err; + /* Verify lower root is upper root origin */ err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, - false, true); + true); if (err) { pr_err("overlayfs: failed to verify upper root origin\n"); goto out; @@ -1038,23 +1078,33 @@ static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe, ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); if (ofs->indexdir) { - /* Verify upper root is index dir origin */ - err = ovl_verify_origin(ofs->indexdir, upperpath->dentry, - true, true); + /* + * Verify upper root is exclusively associated with index dir. + * Older kernels stored upper fh in "trusted.overlay.origin" + * xattr. If that xattr exists, verify that it is a match to + * upper dir file handle. In any case, verify or set xattr + * "trusted.overlay.upper" to indicate that index may have + * directory entries. + */ + if (ovl_check_origin_xattr(ofs->indexdir)) { + err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN, + upperpath->dentry, true, false); + if (err) + pr_err("overlayfs: failed to verify index dir 'origin' xattr\n"); + } + err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true); if (err) - pr_err("overlayfs: failed to verify index dir origin\n"); + pr_err("overlayfs: failed to verify index dir 'upper' xattr\n"); /* Cleanup bad/stale/orphan index entries */ if (!err) - err = ovl_indexdir_cleanup(ofs->indexdir, - ofs->upper_mnt, - oe->lowerstack, - oe->numlower); + err = ovl_indexdir_cleanup(ofs); } if (err || !ofs->indexdir) pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); out: + mnt_drop_write(mnt); return err; } @@ -1094,6 +1144,7 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, ofs->lower_layers[ofs->numlower].mnt = mnt; ofs->lower_layers[ofs->numlower].pseudo_dev = dev; + ofs->lower_layers[ofs->numlower].idx = i + 1; ofs->numlower++; /* Check if all lower layers are on same sb */ @@ -1131,6 +1182,10 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, } else if (!ofs->config.upperdir && stacklen == 1) { pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); goto out_err; + } else if (!ofs->config.upperdir && ofs->config.nfs_export && + ofs->config.redirect_follow) { + pr_warn("overlayfs: NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; } err = -ENOMEM; @@ -1207,6 +1262,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_err; ofs->config.index = ovl_index_def; + ofs->config.nfs_export = ovl_nfs_export_def; err = ovl_parse_opt((char *) data, &ofs->config); if (err) goto out_err; @@ -1257,13 +1313,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_free_oe; - if (!ofs->indexdir) + /* Force r/o mount with no index dir */ + if (!ofs->indexdir) { + dput(ofs->workdir); + ofs->workdir = NULL; sb->s_flags |= SB_RDONLY; + } + } - /* Show index=off/on in /proc/mounts for any of the reasons above */ - if (!ofs->indexdir) + /* Show index=off in /proc/mounts for forced r/o mount */ + if (!ofs->indexdir) { ofs->config.index = false; + if (ofs->upper_mnt && ofs->config.nfs_export) { + pr_warn("overlayfs: NFS export requires an index dir, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + } + + if (ofs->config.nfs_export) + sb->s_export_op = &ovl_export_operations; /* Never override disk quota limits or use reserved space */ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); @@ -1279,15 +1348,15 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!root_dentry) goto out_free_oe; + root_dentry->d_fsdata = oe; + mntput(upperpath.mnt); if (upperpath.dentry) { - oe->has_upper = true; + ovl_dentry_set_upper_alias(root_dentry); if (ovl_is_impuredir(upperpath.dentry)) ovl_set_flag(OVL_IMPURE, d_inode(root_dentry)); } - root_dentry->d_fsdata = oe; - /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index d6bb1c9f5e7a..930784a26623 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -63,6 +63,22 @@ struct dentry *ovl_indexdir(struct super_block *sb) return ofs->indexdir; } +/* Index all files on copy up. For now only enabled for NFS export */ +bool ovl_index_all(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.nfs_export && ofs->config.index; +} + +/* Verify lower origin on lookup. For now only enabled for NFS export */ +bool ovl_verify_lower(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.nfs_export && ofs->config.index; +} + struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); @@ -194,10 +210,24 @@ void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) OVL_I(inode)->cache = cache; } +void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry) +{ + set_bit(flag, &OVL_E(dentry)->flags); +} + +void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry) +{ + clear_bit(flag, &OVL_E(dentry)->flags); +} + +bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry) +{ + return test_bit(flag, &OVL_E(dentry)->flags); +} + bool ovl_dentry_is_opaque(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; - return oe->opaque; + return ovl_dentry_test_flag(OVL_E_OPAQUE, dentry); } bool ovl_dentry_is_whiteout(struct dentry *dentry) @@ -207,28 +237,23 @@ bool ovl_dentry_is_whiteout(struct dentry *dentry) void ovl_dentry_set_opaque(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; - - oe->opaque = true; + ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); } /* - * For hard links it's possible for ovl_dentry_upper() to return positive, while - * there's no actual upper alias for the inode. Copy up code needs to know - * about the existence of the upper alias, so it can't use ovl_dentry_upper(). + * For hard links and decoded file handles, it's possible for ovl_dentry_upper() + * to return positive, while there's no actual upper alias for the inode. + * Copy up code needs to know about the existence of the upper alias, so it + * can't use ovl_dentry_upper(). */ bool ovl_dentry_has_upper_alias(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; - - return oe->has_upper; + return ovl_dentry_test_flag(OVL_E_UPPER_ALIAS, dentry); } void ovl_dentry_set_upper_alias(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; - - oe->has_upper = true; + ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry); } bool ovl_redirect_dir(struct super_block *sb) @@ -257,7 +282,7 @@ void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, if (upperdentry) OVL_I(inode)->__upperdentry = upperdentry; if (lowerdentry) - OVL_I(inode)->lower = d_inode(lowerdentry); + OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); ovl_copyattr(d_inode(upperdentry ?: lowerdentry), inode); } @@ -273,7 +298,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) */ smp_wmb(); OVL_I(inode)->__upperdentry = upperdentry; - if (!S_ISDIR(upperinode->i_mode) && inode_unhashed(inode)) { + if (inode_unhashed(inode)) { inode->i_private = upperinode; __insert_inode_hash(inode, (unsigned long) upperinode); } @@ -447,10 +472,32 @@ void ovl_inuse_unlock(struct dentry *dentry) } } +/* + * Does this overlay dentry need to be indexed on copy up? + */ +bool ovl_need_index(struct dentry *dentry) +{ + struct dentry *lower = ovl_dentry_lower(dentry); + + if (!lower || !ovl_indexdir(dentry->d_sb)) + return false; + + /* Index all files for NFS export and consistency verification */ + if (ovl_index_all(dentry->d_sb)) + return true; + + /* Index only lower hardlinks on copy up */ + if (!d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return true; + + return false; +} + /* Caller must hold OVL_I(inode)->lock */ static void ovl_cleanup_index(struct dentry *dentry) { - struct inode *dir = ovl_indexdir(dentry->d_sb)->d_inode; + struct dentry *indexdir = ovl_indexdir(dentry->d_sb); + struct inode *dir = indexdir->d_inode; struct dentry *lowerdentry = ovl_dentry_lower(dentry); struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *index = NULL; @@ -463,7 +510,7 @@ static void ovl_cleanup_index(struct dentry *dentry) goto fail; inode = d_inode(upperdentry); - if (inode->i_nlink != 1) { + if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { pr_warn_ratelimited("overlayfs: cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", upperdentry, inode->i_ino, inode->i_nlink); /* @@ -481,13 +528,17 @@ static void ovl_cleanup_index(struct dentry *dentry) } inode_lock_nested(dir, I_MUTEX_PARENT); - /* TODO: whiteout instead of cleanup to block future open by handle */ - index = lookup_one_len(name.name, ovl_indexdir(dentry->d_sb), name.len); + index = lookup_one_len(name.name, indexdir, name.len); err = PTR_ERR(index); - if (!IS_ERR(index)) - err = ovl_cleanup(dir, index); - else + if (IS_ERR(index)) { index = NULL; + } else if (ovl_index_all(dentry->d_sb)) { + /* Whiteout orphan index to block future open by handle */ + err = ovl_cleanup_and_whiteout(indexdir, dir, index); + } else { + /* Cleanup orphan index entries */ + err = ovl_cleanup(dir, index); + } inode_unlock(dir); if (err) @@ -512,16 +563,16 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked) const struct cred *old_cred; int err; - if (!d_inode(dentry) || d_is_dir(dentry)) + if (!d_inode(dentry)) return 0; /* * With inodes index is enabled, we store the union overlay nlink - * in an xattr on the index inode. When whiting out lower hardlinks + * in an xattr on the index inode. When whiting out an indexed lower, * we need to decrement the overlay persistent nlink, but before the * first copy up, we have no upper index inode to store the xattr. * - * As a workaround, before whiteout/rename over of a lower hardlink, + * As a workaround, before whiteout/rename over an indexed lower, * copy up to create the upper index. Creating the upper index will * initialize the overlay nlink, so it could be dropped if unlink * or rename succeeds. @@ -529,8 +580,7 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked) * TODO: implement metadata only index copy up when called with * ovl_copy_up_flags(dentry, O_PATH). */ - if (ovl_indexdir(dentry->d_sb) && !ovl_dentry_has_upper_alias(dentry) && - d_inode(ovl_dentry_lower(dentry))->i_nlink > 1) { + if (ovl_need_index(dentry) && !ovl_dentry_has_upper_alias(dentry)) { err = ovl_copy_up(dentry); if (err) return err; @@ -540,7 +590,7 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked) if (err) return err; - if (!ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, d_inode(dentry))) goto out; old_cred = ovl_override_creds(dentry->d_sb); diff --git a/fs/pipe.c b/fs/pipe.c index 6d98566201ef..0913aed7fd0d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -35,11 +35,6 @@ */ unsigned int pipe_max_size = 1048576; -/* - * Minimum pipe size, as required by POSIX - */ -unsigned int pipe_min_size = PAGE_SIZE; - /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ @@ -515,10 +510,10 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /* No kernel lock held - fine */ -static unsigned int +static __poll_t pipe_poll(struct file *filp, poll_table *wait) { - unsigned int mask; + __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; int nrbufs; @@ -610,12 +605,21 @@ static unsigned long account_pipe_buffers(struct user_struct *user, static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { - return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft; + unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); + + return soft_limit && user_bufs > soft_limit; } static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { - return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard; + unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); + + return hard_limit && user_bufs > hard_limit; +} + +static bool is_unprivileged_user(void) +{ + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } struct pipe_inode_info *alloc_pipe_info(void) @@ -624,22 +628,23 @@ struct pipe_inode_info *alloc_pipe_info(void) unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; + unsigned int max_size = READ_ONCE(pipe_max_size); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; - if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE)) - pipe_bufs = pipe_max_size >> PAGE_SHIFT; + if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) + pipe_bufs = max_size >> PAGE_SHIFT; user_bufs = account_pipe_buffers(user, 0, pipe_bufs); - if (too_many_pipe_buffers_soft(user_bufs)) { + if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, 1); pipe_bufs = 1; } - if (too_many_pipe_buffers_hard(user_bufs)) + if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), @@ -1020,18 +1025,16 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned long size) { - unsigned long nr_pages; - - if (size < pipe_min_size) - size = pipe_min_size; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages == 0) + if (size > (1U << 31)) return 0; - return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; + /* Minimum pipe size, as required by POSIX */ + if (size < PAGE_SIZE) + return PAGE_SIZE; + + return roundup_pow_of_two(size); } /* @@ -1046,8 +1049,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); - if (size == 0) - return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1069,7 +1070,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) if (nr_pages > pipe->buffers && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + is_unprivileged_user()) { ret = -EPERM; goto out_revert_acct; } @@ -1125,16 +1126,6 @@ out_revert_acct: } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size - * will return an error. - */ -int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, - size_t *lenp, loff_t *ppos) -{ - return proc_dopipe_max_size(table, write, buf, lenp, ppos); -} - -/* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a * pipe. diff --git a/fs/posix_acl.c b/fs/posix_acl.c index eebf5f6cf6d5..2fd0fde16fe1 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -43,7 +43,7 @@ struct posix_acl *get_cached_acl(struct inode *inode, int type) rcu_read_lock(); acl = rcu_dereference(*p); if (!acl || is_uncached_acl(acl) || - atomic_inc_not_zero(&acl->a_refcount)) + refcount_inc_not_zero(&acl->a_refcount)) break; rcu_read_unlock(); cpu_relax(); @@ -164,7 +164,7 @@ EXPORT_SYMBOL(get_acl); void posix_acl_init(struct posix_acl *acl, int count) { - atomic_set(&acl->a_refcount, 1); + refcount_set(&acl->a_refcount, 1); acl->a_count = count; } EXPORT_SYMBOL(posix_acl_init); @@ -197,7 +197,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) sizeof(struct posix_acl_entry); clone = kmemdup(acl, size, flags); if (clone) - atomic_set(&clone->a_refcount, 1); + refcount_set(&clone->a_refcount, 1); } return clone; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 79375fc115d2..598803576e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -430,8 +430,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * safe because the task has stopped executing permanently. */ if (permitted && (task->flags & PF_DUMPCORE)) { - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); + if (try_get_task_stack(task)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + put_task_stack(task); + } } } @@ -733,16 +736,10 @@ static int children_seq_open(struct inode *inode, struct file *file) return ret; } -int children_seq_release(struct inode *inode, struct file *file) -{ - seq_release(inode, file); - return 0; -} - const struct file_operations proc_tid_children_operations = { .open = children_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = children_seq_release, + .release = seq_release, }; #endif /* CONFIG_PROC_CHILDREN */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 60316b52d659..9298324325ed 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -75,6 +75,7 @@ #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/printk.h> +#include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> @@ -100,6 +101,8 @@ #include "internal.h" #include "fd.h" +#include "../../lib/kstrtox.h" + /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during @@ -110,8 +113,8 @@ * in /proc for a task before it execs a suid executable. */ -static u8 nlink_tid; -static u8 nlink_tgid; +static u8 nlink_tid __ro_after_init; +static u8 nlink_tgid __ro_after_init; struct pid_entry { const char *name; @@ -1370,7 +1373,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - WRITE_ONCE(task->fail_nth, n); + task->fail_nth = n; put_task_struct(task); return count; @@ -1386,8 +1389,7 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - len = snprintf(numbuf, sizeof(numbuf), "%u\n", - READ_ONCE(task->fail_nth)); + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); len = simple_read_from_buffer(buf, count, ppos, numbuf, len); put_task_struct(task); @@ -1907,8 +1909,33 @@ end_instantiate: static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { - if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + const char *str = dentry->d_name.name; + unsigned long long sval, eval; + unsigned int len; + + len = _parse_integer(str, 16, &sval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (sval != (unsigned long)sval) return -EINVAL; + str += len; + + if (*str != '-') + return -EINVAL; + str++; + + len = _parse_integer(str, 16, &eval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (eval != (unsigned long)eval) + return -EINVAL; + str += len; + + if (*str != '\0') + return -EINVAL; + + *start = sval; + *end = eval; return 0; } @@ -2000,9 +2027,9 @@ out: } struct map_files_info { + unsigned long start; + unsigned long end; fmode_t mode; - unsigned int len; - unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; /* @@ -2172,10 +2199,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (++pos <= ctx->pos) continue; + info.start = vma->vm_start; + info.end = vma->vm_end; info.mode = vma->vm_file->f_mode; - info.len = snprintf(info.name, - sizeof(info.name), "%lx-%lx", - vma->vm_start, vma->vm_end); if (flex_array_put(fa, i++, &info, GFP_KERNEL)) BUG(); } @@ -2183,9 +2209,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) up_read(&mm->mmap_sem); for (i = 0; i < nr_files; i++) { + char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ + unsigned int len; + p = flex_array_get(fa, i); + len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, - p->name, p->len, + buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) @@ -3018,11 +3048,11 @@ static const struct inode_operations proc_tgid_base_inode_operations = { static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) { struct dentry *dentry, *leader, *dir; - char buf[PROC_NUMBUF]; + char buf[10 + 1]; struct qstr name; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { @@ -3034,7 +3064,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) return; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", tgid); + name.len = snprintf(buf, sizeof(buf), "%u", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); if (!leader) goto out; @@ -3046,7 +3076,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) goto out_put_leader; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { d_invalidate(dentry); @@ -3225,14 +3255,14 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) continue; - len = snprintf(name, sizeof(name), "%d", iter.tgid); + len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { @@ -3560,10 +3590,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; tid = task_pid_nr_ns(task, ns); - len = snprintf(name, sizeof(name), "%d", tid); + len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 290ba85cb900..a8ac48aebd59 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -55,8 +55,7 @@ static int show_console_dev(struct seq_file *m, void *v) if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); - seq_printf(m, "\n"); - + seq_putc(m, '\n'); return 0; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 96fc70225e54..6b80cd1e419a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -236,7 +236,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, for (fd = ctx->pos - 2; fd < files_fdtable(files)->max_fds; fd++, ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; if (!fcheck_files(files, fd)) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 793a67574668..5d709fa8f3a2 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -28,7 +28,7 @@ static DEFINE_RWLOCK(proc_subdir_lock); -static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) return -1; @@ -60,7 +60,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, struct proc_dir_entry *de = rb_entry(node, struct proc_dir_entry, subdir_node); - int result = proc_match(len, name, de); + int result = proc_match(name, de, len); if (result < 0) node = node->rb_left; @@ -84,7 +84,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *this = rb_entry(*new, struct proc_dir_entry, subdir_node); - int result = proc_match(de->namelen, de->name, this); + int result = proc_match(de->name, this, de->namelen); parent = *new; if (result < 0) @@ -211,8 +211,8 @@ void proc_free_inum(unsigned int inum) * Don't create negative dentries here, return -ENOENT by hand * instead. */ -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, - struct dentry *dentry) +struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, + struct proc_dir_entry *de) { struct inode *inode; @@ -235,7 +235,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - return proc_lookup_de(PDE(dir), dir, dentry); + return proc_lookup_de(dir, dentry, PDE(dir)); } /* @@ -247,8 +247,8 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *file, - struct dir_context *ctx) +int proc_readdir_de(struct file *file, struct dir_context *ctx, + struct proc_dir_entry *de) { int i; @@ -292,7 +292,7 @@ int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - return proc_readdir_de(PDE(inode), file, ctx); + return proc_readdir_de(file, ctx, PDE(inode)); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index dd0f82622427..6e8724958116 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> @@ -52,7 +53,7 @@ static void proc_evict_inode(struct inode *inode) } } -static struct kmem_cache * proc_inode_cachep; +static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { @@ -128,12 +129,12 @@ enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { - return atomic_inc_unless_negative(&pde->in_use); + return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { - if (atomic_dec_return(&pde->in_use) == BIAS) + if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } @@ -166,7 +167,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (pdeo->c) + if (unlikely(pdeo->c)) complete(pdeo->c); kfree(pdeo); } @@ -234,11 +235,11 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t return rv; } -static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) +static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned int rv = DEFAULT_POLLMASK; - unsigned int (*poll)(struct file *, struct poll_table_struct *); + __poll_t rv = DEFAULT_POLLMASK; + __poll_t (*poll)(struct file *, struct poll_table_struct *); if (use_pde(pde)) { poll = pde->proc_fops->poll; if (poll) @@ -420,7 +421,7 @@ static const char *proc_get_link(struct dentry *dentry, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); - if (unlikely(!use_pde(pde))) + if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4a67188c8d74..d697c8ab0a14 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -31,24 +31,28 @@ struct mempolicy; * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + atomic_t count; /* use count */ + struct list_head pde_openers; /* who did ->open, but not ->release */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + void *data; unsigned int low_ino; - umode_t mode; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; - const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; struct proc_dir_entry *parent; struct rb_root_cached subdir; struct rb_node subdir_node; - void *data; - atomic_t count; /* use count */ - atomic_t in_use; /* number of callers into module in progress; */ - /* negative -> it's going away RSN */ - struct completion *pde_unload_completion; - struct list_head pde_openers; /* who did ->open, but not ->release */ - spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + umode_t mode; u8 namelen; char name[]; } __randomize_layout; @@ -149,10 +153,9 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i * generic.c */ extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); -extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, - struct dentry *); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); -extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4bc85cb8be6a..e8a93bc8285d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,23 +512,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return -EFAULT; } else { if (kern_addr_valid(start)) { - unsigned long n; - /* * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - memcpy(buf, (char *) start, tsz); - n = copy_to_user(buffer, buf, tsz); - /* - * We cannot distinguish between fault on source - * and fault on destination. When this happens - * we clear too and hope it will trigger the - * EFAULT again. - */ - if (n) { - if (clear_user(buffer + tsz - n, - n)) + if (probe_kernel_read(buf, (void *) start, tsz)) { + if (clear_user(buffer, tsz)) + return -EFAULT; + } else { + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; } } else { diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index e0f8774acd65..f0bfb45c3f9f 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -40,7 +40,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } -static unsigned int kmsg_poll(struct file *file, poll_table *wait) +static __poll_t kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a2bf369c923d..68c06ae7888c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -135,7 +135,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { - de = proc_lookup_de(net->proc_net, dir, dentry); + de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; @@ -172,7 +172,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, file, ctx); + ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c5cbbdff3c3d..63325377621a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -630,12 +630,12 @@ static int proc_sys_open(struct inode *inode, struct file *filp) return 0; } -static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - unsigned int ret = DEFAULT_POLLMASK; + __poll_t ret = DEFAULT_POLLMASK; unsigned long event; /* sysctl was unregistered */ diff --git a/fs/proc/self.c b/fs/proc/self.c index 31326bb23b8b..4d7d061696b3 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -17,11 +18,11 @@ static const char *proc_self_get_link(struct dentry *dentry, if (!tgid) return ERR_PTR(-ENOENT); - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + /* max length of unsigned int in decimal + NULL term */ + name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d", tgid); + sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } @@ -30,7 +31,7 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum; +static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 339e4c1c044d..ec6d2983a5cb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -47,8 +47,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; - lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + /* split executable areas between text and lib */ + text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); + text = min(text, mm->exec_vm << PAGE_SHIFT); + lib = (mm->exec_vm << PAGE_SHIFT) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" @@ -76,7 +79,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) file << (PAGE_SHIFT-10), shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), text, lib, + mm->stack_vm << (PAGE_SHIFT-10), + text >> 10, + lib >> 10, mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); @@ -977,14 +982,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) + old = pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) + if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b813e3b529f2..9d2efaca499f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -18,11 +19,10 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, - dentry ? GFP_KERNEL : GFP_ATOMIC); + name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d/task/%d", tgid, pid); + sprintf(name, "%u/task/%u", tgid, pid); set_delayed_call(done, kfree_link, name); return name; } @@ -31,7 +31,7 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum; +static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 885d445afa0d..a45f0af22a60 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1178,18 +1178,16 @@ fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) { - struct list_head *pos, *next; - if (proc_vmcore) { proc_remove(proc_vmcore); proc_vmcore = NULL; } /* clear the vmcore list. */ - list_for_each_safe(pos, next, &vmcore_list) { + while (!list_empty(&vmcore_list)) { struct vmcore *m; - m = list_entry(pos, struct vmcore, list); + m = list_first_entry(&vmcore_list, struct vmcore, list); list_del(&m->list); kfree(m); } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index b786840facd9..c8528d587e09 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -18,12 +18,12 @@ #include "pnode.h" #include "internal.h" -static unsigned mounts_poll(struct file *file, poll_table *wait) +static __poll_t mounts_poll(struct file *file, poll_table *wait) { struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; - unsigned res = POLLIN | POLLRDNORM; + __poll_t res = POLLIN | POLLRDNORM; int event; poll_wait(file, &p->ns->poll, wait); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 691032107f8c..c3129b131e4d 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -41,7 +41,6 @@ #include <linux/timer.h> #include <linux/slab.h> #include <linux/uaccess.h> -#include <linux/hardirq.h> #include <linux/jiffies.h> #include <linux/workqueue.h> diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 7cd46666ba2c..86e71c0caf48 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -57,8 +57,7 @@ config REISERFS_FS_XATTR depends on REISERFS_FS help Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). + the kernel or by users (see the attr(5) manual page for details). If unsure, say N. @@ -70,9 +69,6 @@ config REISERFS_FS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config REISERFS_FS_SECURITY diff --git a/fs/select.c b/fs/select.c index 6de493bb42a4..ec14171dd78a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -212,7 +212,7 @@ static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); - if (key && !((unsigned long)key & entry->key)) + if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } @@ -438,7 +438,7 @@ get_max: static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, - unsigned int ll_flag) + __poll_t ll_flag) { wait->_key = POLLEX_SET | ll_flag; if (in & bit) @@ -454,7 +454,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; - unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); @@ -484,8 +484,9 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { - unsigned long in, out, ex, all_bits, bit = 1, mask, j; + unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; + __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; @@ -802,11 +803,11 @@ struct poll_list { * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ -static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, +static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, - unsigned int busy_flag) + __poll_t busy_flag) { - unsigned int mask; + __poll_t mask; int fd; mask = 0; @@ -815,20 +816,24 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, struct fd f = fdget(fd); mask = POLLNVAL; if (f.file) { + /* userland u16 ->events contains POLL... bitmap */ + __poll_t filter = demangle_poll(pollfd->events) | + POLLERR | POLLHUP; mask = DEFAULT_POLLMASK; if (f.file->f_op->poll) { - pwait->_key = pollfd->events|POLLERR|POLLHUP; + pwait->_key = filter; pwait->_key |= busy_flag; mask = f.file->f_op->poll(f.file, pwait); if (mask & busy_flag) *can_busy_poll = true; } /* Mask out unneeded events. */ - mask &= pollfd->events | POLLERR | POLLHUP; + mask &= filter; fdput(f); } } - pollfd->revents = mask; + /* ... and so does ->revents */ + pollfd->revents = mangle_poll(mask); return mask; } @@ -840,7 +845,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; - unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ diff --git a/fs/signalfd.c b/fs/signalfd.c index 5f1ff8756595..31e923bec99a 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -58,10 +58,10 @@ static int signalfd_release(struct inode *inode, struct file *file) return 0; } -static unsigned int signalfd_poll(struct file *file, poll_table *wait) +static __poll_t signalfd_poll(struct file *file, poll_table *wait) { struct signalfd_ctx *ctx = file->private_data; - unsigned int events = 0; + __poll_t events = 0; poll_wait(file, ¤t->sighand->signalfd_wqh, wait); diff --git a/fs/super.c b/fs/super.c index 7ff1349609e4..672538ca9831 100644 --- a/fs/super.c +++ b/fs/super.c @@ -225,7 +225,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); - INIT_HLIST_BL_HEAD(&s->s_anon); + INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); @@ -517,7 +517,11 @@ retry: hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - register_shrinker(&s->s_shrink); + err = register_shrinker(&s->s_shrink); + if (err) { + deactivate_locked_super(s); + s = ERR_PTR(err); + } return s; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2b67bda2021b..58eba92a0e41 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/dir.c - sysfs core and dir operation implementation * @@ -5,12 +6,10 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * This file is released under the GPLv2. - * * Please see Documentation/filesystems/sysfs.txt for more information. */ -#undef DEBUG +#define pr_fmt(fmt) "sysfs: " fmt #include <linux/fs.h> #include <linux/kobject.h> @@ -27,8 +26,8 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name) if (buf) kernfs_path(parent, buf, PATH_MAX); - WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", - buf, name); + pr_warn("cannot create duplicate filename '%s/%s'\n", buf, name); + dump_stack(); kfree(buf); } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 39c75a86c67f..5c13f29bfcdb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/file.c - sysfs regular (text) file implementation * @@ -5,14 +6,11 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * This file is released under the GPLv2. - * * Please see Documentation/filesystems/sysfs.txt for more information. */ #include <linux/module.h> #include <linux/kobject.h> -#include <linux/kallsyms.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mutex.h> @@ -70,8 +68,8 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) * indicate truncated result or overflow in normal use cases. */ if (count >= (ssize_t)PAGE_SIZE) { - print_symbol("fill_read_buffer: %s returned bad count\n", - (unsigned long)ops->show); + printk("fill_read_buffer: %pS returned bad count\n", + ops->show); /* Try to struggle along */ count = PAGE_SIZE - 1; } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index ac2de0ed69ad..4802ec0e1e3a 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/group.c - Operations for adding/removing multiple files at once. * @@ -5,9 +6,6 @@ * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2013 Greg Kroah-Hartman * Copyright (c) 2013 The Linux Foundation - * - * This file is released undert the GPL v2. - * */ #include <linux/kobject.h> @@ -406,6 +404,6 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, kernfs_put(entry); kernfs_put(target); - return IS_ERR(link) ? PTR_ERR(link) : 0; + return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index fb49510c5dcf..b428d317ae92 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - operations for initializing and mounting sysfs * @@ -5,13 +6,9 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * This file is released under the GPLv2. - * * Please see Documentation/filesystems/sysfs.txt for more information. */ -#define DEBUG - #include <linux/fs.h> #include <linux/magic.h> #include <linux/mount.h> diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index aecb15f84557..8664db25a9a6 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - sysfs symlink implementation * @@ -5,8 +6,6 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * This file is released under the GPLv2. - * * Please see Documentation/filesystems/sysfs.txt for more information. */ diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 0e2f1cccb812..d098e015fcc9 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -1,11 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/sysfs/sysfs.h - sysfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> - * - * This file is released under the GPLv2. */ #ifndef __SYSFS_INTERNAL_H diff --git a/fs/timerfd.c b/fs/timerfd.c index 040612ec9598..0510717f3a53 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -227,10 +227,10 @@ static int timerfd_release(struct inode *inode, struct file *file) return 0; } -static unsigned int timerfd_poll(struct file *file, poll_table *wait) +static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; - unsigned int events = 0; + __poll_t events = 0; unsigned long flags; poll_wait(file, &ctx->wqh, wait); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 417fe0b29f23..9d7fb88e172e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -220,20 +220,9 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); - if (ubifs_crypt_is_encrypted(dir)) { - err = fscrypt_get_encryption_info(dir); - - /* - * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is - * created while the directory was encrypted and we - * have access to the key. - */ - if (fscrypt_has_encryption_key(dir)) - fscrypt_set_encrypted_dentry(dentry); - fscrypt_set_d_op(dentry); - if (err && err != -ENOKEY) - return ERR_PTR(err); - } + err = fscrypt_prepare_lookup(dir, dentry, flags); + if (err) + return ERR_PTR(err); err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); if (err) @@ -743,9 +732,9 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - if (ubifs_crypt_is_encrypted(dir) && - !fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) @@ -1149,38 +1138,24 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, struct ubifs_info *c = dir->i_sb->s_fs_info; int err, len = strlen(symname); int sz_change = CALC_DENT_SIZE(len); - struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); - struct fscrypt_symlink_data *sd = NULL; + struct fscrypt_str disk_link; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(len, 8), .dirtied_ino = 1 }; struct fscrypt_name nm; - if (ubifs_crypt_is_encrypted(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) - goto out_budg; - - if (!fscrypt_has_encryption_key(dir)) { - err = -EPERM; - goto out_budg; - } + dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, + symname, dir->i_ino); - disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + - sizeof(struct fscrypt_symlink_data)); - } + err = fscrypt_prepare_symlink(dir, symname, len, UBIFS_MAX_INO_DATA, + &disk_link); + if (err) + return err; /* * Budget request settings: new inode, new direntry and changing parent * directory inode. */ - - dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, - symname, dir->i_ino); - - if (disk_link.len > UBIFS_MAX_INO_DATA) - return -ENAMETOOLONG; - err = ubifs_budget_space(c, &req); if (err) return err; @@ -1202,38 +1177,20 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, goto out_inode; } - if (ubifs_crypt_is_encrypted(dir)) { - struct qstr istr = QSTR_INIT(symname, len); - struct fscrypt_str ostr; - - sd = kzalloc(disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto out_inode; - } - - ostr.name = sd->encrypted_path; - ostr.len = disk_link.len; - - err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err) { - kfree(sd); + if (IS_ENCRYPTED(inode)) { + disk_link.name = ui->data; /* encrypt directly into ui->data */ + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) goto out_inode; - } - - sd->len = cpu_to_le16(ostr.len); - disk_link.name = (char *)sd; } else { + memcpy(ui->data, disk_link.name, disk_link.len); inode->i_link = ui->data; } - memcpy(ui->data, disk_link.name, disk_link.len); - ((char *)ui->data)[disk_link.len - 1] = '\0'; - /* * The terminating zero byte is not written to the flash media and it * is put just to make later in-memory string processing simpler. Thus, - * data length is @len, not @len + %1. + * data length is @disk_link.len - 1, not @disk_link.len. */ ui->data_len = disk_link.len - 1; inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; @@ -1251,11 +1208,10 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); - ubifs_release_budget(c, &req); insert_inode_hash(inode); d_instantiate(dentry, inode); - fscrypt_free_filename(&nm); - return 0; + err = 0; + goto out_fname; out_cancel: dir->i_size -= sz_change; @@ -1353,12 +1309,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) ubifs_assert(inode_is_locked(new_inode)); - if (old_dir != new_dir) { - if (ubifs_crypt_is_encrypted(new_dir) && - !fscrypt_has_permitted_context(new_dir, old_inode)) - return -EPERM; - } - if (unlink && is_dir) { err = ubifs_check_dir_empty(new_inode); if (err) @@ -1573,13 +1523,6 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, ubifs_assert(fst_inode && snd_inode); - if ((ubifs_crypt_is_encrypted(old_dir) || - ubifs_crypt_is_encrypted(new_dir)) && - (old_dir != new_dir) && - (!fscrypt_has_permitted_context(new_dir, fst_inode) || - !fscrypt_has_permitted_context(old_dir, snd_inode))) - return -EPERM; - err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm); if (err) return err; @@ -1624,12 +1567,19 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int err; + if (flags & ~(RENAME_NOREPLACE | RENAME_WHITEOUT | RENAME_EXCHANGE)) return -EINVAL; ubifs_assert(inode_is_locked(old_dir)); ubifs_assert(inode_is_locked(new_dir)); + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + if (flags & RENAME_EXCHANGE) return ubifs_xrename(old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index dfe85069586e..cf348ba99238 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1284,13 +1284,9 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (ubifs_crypt_is_encrypted(inode) && (attr->ia_valid & ATTR_SIZE)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return err; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) /* Truncation to a smaller size */ @@ -1629,82 +1625,21 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int ubifs_file_open(struct inode *inode, struct file *filp) -{ - int ret; - struct dentry *dir; - struct ubifs_info *c = inode->i_sb->s_fs_info; - - if (ubifs_crypt_is_encrypted(inode)) { - ret = fscrypt_get_encryption_info(inode); - if (ret) - return -EACCES; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - - dir = dget_parent(file_dentry(filp)); - if (ubifs_crypt_is_encrypted(d_inode(dir)) && - !fscrypt_has_permitted_context(d_inode(dir), inode)) { - ubifs_err(c, "Inconsistent encryption contexts: %lu/%lu", - (unsigned long) d_inode(dir)->i_ino, - (unsigned long) inode->i_ino); - dput(dir); - ubifs_ro_mode(c, -EPERM); - return -EPERM; - } - dput(dir); - - return 0; -} - static const char *ubifs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - int err; - struct fscrypt_symlink_data *sd; struct ubifs_inode *ui = ubifs_inode(inode); - struct fscrypt_str cstr; - struct fscrypt_str pstr; - if (!ubifs_crypt_is_encrypted(inode)) + if (!IS_ENCRYPTED(inode)) return ui->data; if (!dentry) return ERR_PTR(-ECHILD); - err = fscrypt_get_encryption_info(inode); - if (err) - return ERR_PTR(err); - - sd = (struct fscrypt_symlink_data *)ui->data; - cstr.name = sd->encrypted_path; - cstr.len = le16_to_cpu(sd->len); - - if (cstr.len == 0) - return ERR_PTR(-ENOENT); - - if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > ui->data_len) - return ERR_PTR(-EIO); - - err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); - if (err) - return ERR_PTR(err); - - err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (err) { - fscrypt_fname_free_buffer(&pstr); - return ERR_PTR(err); - } - - pstr.name[pstr.len] = '\0'; - - set_delayed_call(done, kfree_link, pstr.name); - return pstr.name; + return fscrypt_get_symlink(inode, ui->data, ui->data_len, done); } - const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, @@ -1746,7 +1681,7 @@ const struct file_operations ubifs_file_operations = { .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .open = ubifs_file_open, + .open = fscrypt_file_open, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0beb285b143d..b16ef162344a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -379,9 +379,7 @@ out: } done: clear_inode(inode); -#ifdef CONFIG_UBIFS_FS_ENCRYPTION - fscrypt_put_encryption_info(inode, NULL); -#endif + fscrypt_put_encryption_info(inode); } static void ubifs_dirty_inode(struct inode *inode, int flags) diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 0a213dcba2a1..ba3d0e0f8615 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1890,35 +1890,28 @@ static int search_dh_cookie(struct ubifs_info *c, const union ubifs_key *key, union ubifs_key *dkey; for (;;) { - if (!err) { - err = tnc_next(c, &znode, n); - if (err) - goto out; - } - zbr = &znode->zbranch[*n]; dkey = &zbr->key; if (key_inum(c, dkey) != key_inum(c, key) || key_type(c, dkey) != key_type(c, key)) { - err = -ENOENT; - goto out; + return -ENOENT; } err = tnc_read_hashed_node(c, zbr, dent); if (err) - goto out; + return err; if (key_hash(c, key) == key_hash(c, dkey) && le32_to_cpu(dent->cookie) == cookie) { *zn = znode; - goto out; + return 0; } - } - -out: - return err; + err = tnc_next(c, &znode, n); + if (err) + return err; + } } static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 5ddc89d564fd..759f1a209dbb 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -381,8 +381,6 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, if (buf) { /* If @buf is %NULL we are supposed to return the length */ if (ui->data_len > size) { - ubifs_err(c, "buffer size %zd, xattr len %d", - size, ui->data_len); err = -ERANGE; goto out_iput; } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 2edc1755b7c5..50dfce000864 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/swap.h> +#include <linux/iversion.h> #include "ufs_fs.h" #include "ufs.h" @@ -47,7 +48,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) struct inode *dir = mapping->host; int err = 0; - dir->i_version++; + inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); @@ -428,7 +429,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - int need_revalidate = file->f_version != inode->i_version; + bool need_revalidate = inode_cmp_iversion(inode, file->f_version); unsigned flags = UFS_SB(sb)->s_flags; UFSD("BEGIN\n"); @@ -455,8 +456,8 @@ ufs_readdir(struct file *file, struct dir_context *ctx) offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); ctx->pos = (n<<PAGE_SHIFT) + offset; } - file->f_version = inode->i_version; - need_revalidate = 0; + file->f_version = inode_query_iversion(inode); + need_revalidate = false; } de = (struct ufs_dir_entry *)(kaddr+offset); limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index afb601c0dda0..c843ec858cf7 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -36,6 +36,7 @@ #include <linux/mm.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/iversion.h> #include "ufs_fs.h" #include "ufs.h" @@ -693,7 +694,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) if (err) goto bad_inode; - inode->i_version++; + inode_inc_iversion(inode); ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; ufsi->i_dir_start_lookup = 0; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 4d497e9c6883..8254b8b3690f 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -88,6 +88,7 @@ #include <linux/log2.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/iversion.h> #include "ufs_fs.h" #include "ufs.h" @@ -1440,7 +1441,7 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) if (!ei) return NULL; - ei->vfs_inode.i_version = 1; + inode_set_iversion(&ei->vfs_inode, 1); seqlock_init(&ei->meta_lock); mutex_init(&ei->truncate_mutex); return &ei->vfs_inode; @@ -1466,11 +1467,14 @@ static void init_once(void *foo) static int __init init_inodecache(void) { - ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", - sizeof(struct ufs_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); + ufs_inode_cachep = kmem_cache_create_usercopy("ufs_inode_cache", + sizeof(struct ufs_inode_info), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT), + offsetof(struct ufs_inode_info, i_u1.i_symlink), + sizeof_field(struct ufs_inode_info, + i_u1.i_symlink), + init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ac9a4e65ca49..87a13a7c8270 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -294,10 +294,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * pmd_trans_unstable) of the pmd. */ _pmd = READ_ONCE(*pmd); - if (!pmd_present(_pmd)) + if (pmd_none(_pmd)) goto out; ret = false; + if (!pmd_present(_pmd)) + goto out; + if (pmd_trans_huge(_pmd)) goto out; @@ -570,11 +573,14 @@ out: static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { + struct userfaultfd_ctx *release_new_ctx; + if (WARN_ON_ONCE(current->flags & PF_EXITING)) goto out; ewq->ctx = ctx; init_waitqueue_entry(&ewq->wq, current); + release_new_ctx = NULL; spin_lock(&ctx->event_wqh.lock); /* @@ -601,8 +607,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, new = (struct userfaultfd_ctx *) (unsigned long) ewq->msg.arg.reserved.reserved1; - - userfaultfd_ctx_put(new); + release_new_ctx = new; } break; } @@ -617,6 +622,20 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->event_wqh.lock); + if (release_new_ctx) { + struct vm_area_struct *vma; + struct mm_struct *mm = release_new_ctx->mm; + + /* the various vma->vm_userfaultfd_ctx still points to it */ + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + up_write(&mm->mmap_sem); + + userfaultfd_ctx_put(release_new_ctx); + } + /* * ctx may go away after this if the userfault pseudo fd is * already released. @@ -921,10 +940,10 @@ static inline struct userfaultfd_wait_queue *find_userfault_evt( return find_userfault_in(&ctx->event_wqh); } -static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) +static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) { struct userfaultfd_ctx *ctx = file->private_data; - unsigned int ret; + __poll_t ret; poll_wait(file, &ctx->fd_wqh, wait); @@ -969,24 +988,14 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, struct uffd_msg *msg) { int fd; - struct file *file; - unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; - fd = get_unused_fd_flags(flags); + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); if (fd < 0) return fd; - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | flags); - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - - fd_install(fd, file); msg->arg.reserved.reserved1 = 0; msg->arg.fork.ufd = fd; - return 0; } @@ -1868,24 +1877,10 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_init(&ctx->refile_seq); } -/** - * userfaultfd_file_create - Creates a userfaultfd file pointer. - * @flags: Flags for the userfaultfd file. - * - * This function creates a userfaultfd file pointer, w/out installing - * it into the fd table. This is useful when the userfaultfd file is - * used during the initialization of data structures that require - * extra setup after the userfaultfd creation. So the userfaultfd - * creation is split into the file pointer creation phase, and the - * file descriptor installation phase. In this way races with - * userspace closing the newly installed file descriptor can be - * avoided. Returns a userfaultfd file pointer, or a proper error - * pointer. - */ -static struct file *userfaultfd_file_create(int flags) +SYSCALL_DEFINE1(userfaultfd, int, flags) { - struct file *file; struct userfaultfd_ctx *ctx; + int fd; BUG_ON(!current->mm); @@ -1893,14 +1888,12 @@ static struct file *userfaultfd_file_create(int flags) BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); - file = ERR_PTR(-EINVAL); if (flags & ~UFFD_SHARED_FCNTL_FLAGS) - goto out; + return -EINVAL; - file = ERR_PTR(-ENOMEM); ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) - goto out; + return -ENOMEM; atomic_set(&ctx->refcount, 1); ctx->flags = flags; @@ -1911,39 +1904,13 @@ static struct file *userfaultfd_file_create(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); - if (IS_ERR(file)) { + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } -out: - return file; -} - -SYSCALL_DEFINE1(userfaultfd, int, flags) -{ - int fd, error; - struct file *file; - - error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (error < 0) - return error; - fd = error; - - file = userfaultfd_file_create(flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_put_unused_fd; - } - fd_install(fd, file); - return fd; - -err_put_unused_fd: - put_unused_fd(fd); - - return error; } static int __init userfaultfd_init(void) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index f42fcf1b5465..46bcf0e649f5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -48,9 +48,6 @@ config XFS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N. config XFS_RT diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 0da80019a917..c02781a4c091 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -167,7 +167,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -static int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -520,7 +520,7 @@ xfs_alloc_fixup_trees( return 0; } -static bool +static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) { @@ -528,10 +528,19 @@ xfs_agfl_verify( struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); int i; + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return NULL; + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) - return false; + return __this_address; /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't @@ -539,16 +548,17 @@ xfs_agfl_verify( * so we can detect and avoid this problem. */ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) - return false; + return __this_address; for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) - return false; + return __this_address; } - return xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)); + if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn))) + return __this_address; + return NULL; } static void @@ -556,6 +566,7 @@ xfs_agfl_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; /* * There is no verification of non-crc AGFLs because mkfs does not @@ -567,28 +578,29 @@ xfs_agfl_read_verify( return; if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_agfl_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_agfl_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void xfs_agfl_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; /* no verification of non-crc AGFLs */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_agfl_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_agfl_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -602,6 +614,7 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, + .verify_struct = xfs_agfl_verify, }; /* @@ -702,7 +715,7 @@ xfs_alloc_ag_vextent( ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ - if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, args->agbno, args->len, &args->oinfo); if (error) @@ -1682,7 +1695,7 @@ xfs_free_ag_extent( bno_cur = cnt_cur = NULL; mp = tp->t_mountp; - if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(oinfo)) { error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); if (error) goto error0; @@ -2397,19 +2410,19 @@ xfs_alloc_put_freelist( return 0; } -static bool +static xfs_failaddr_t xfs_agf_verify( - struct xfs_mount *mp, - struct xfs_buf *bp) - { - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) - return false; + return __this_address; } if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && @@ -2418,18 +2431,18 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) - return false; + return __this_address; if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) - return false; + return __this_address; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) - return false; + return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -2438,18 +2451,18 @@ xfs_agf_verify( * so we can detect and avoid this problem. */ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) - return false; + return __this_address; if (xfs_sb_version_haslazysbcount(&mp->m_sb) && be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) - return false; + return __this_address; if (xfs_sb_version_hasreflink(&mp->m_sb) && (be32_to_cpu(agf->agf_refcount_level) < 1 || be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) - return false; + return __this_address; - return true;; + return NULL; } @@ -2458,28 +2471,29 @@ xfs_agf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, - XFS_ERRTAG_ALLOC_READ_AGF)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_agf_verify(bp); + if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void xfs_agf_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; - if (!xfs_agf_verify(mp, bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_agf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -2496,6 +2510,7 @@ const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, + .verify_struct = xfs_agf_verify, }; /* @@ -2981,3 +2996,22 @@ xfs_verify_fsbno( return false; return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); } + +/* Is there a record covering a given extent? */ +int +xfs_alloc_has_record( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + union xfs_btree_irec low; + union xfs_btree_irec high; + + memset(&low, 0, sizeof(low)); + low.a.ar_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.a.ar_startblock = bno + len - 1; + + return xfs_btree_has_record(cur, &low, &high, exists); +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7ba2d129d504..65a0cafe06e4 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -198,6 +198,13 @@ xfs_free_extent( enum xfs_ag_resv_type type); /* block reservation type */ int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -237,4 +244,7 @@ bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno); bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); +int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, bool *exist); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index cfde0a0f9706..6840b588187e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -307,13 +307,14 @@ xfs_cntbt_diff_two_keys( be32_to_cpu(k2->alloc.ar_startblock); } -static bool +static xfs_failaddr_t xfs_allocbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; + xfs_failaddr_t fa; unsigned int level; /* @@ -331,29 +332,31 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return false; + return __this_address; } else if (level >= mp->m_ag_maxlevels) - return false; + return __this_address; break; case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return false; + return __this_address; } else if (level >= mp->m_ag_maxlevels) - return false; + return __this_address; break; default: - return false; + return __this_address; } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); @@ -363,25 +366,30 @@ static void xfs_allocbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_allocbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_allocbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_allocbt_write_verify( struct xfs_buf *bp) { - if (!xfs_allocbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_allocbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -392,6 +400,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { .name = "xfs_allocbt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6249c92671de..ce4a34a2751d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -212,6 +212,7 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_defer_ops dfops; struct xfs_trans_res tres; @@ -327,9 +328,16 @@ xfs_attr_set( * GROT: another possible req'mt for a double-split btree op. */ xfs_defer_init(args.dfops, args.firstblock); - error = xfs_attr_shortform_to_leaf(&args); + error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); if (error) goto out_defer_cancel; + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args.trans, leaf_bp); + xfs_defer_bjoin(args.dfops, leaf_bp); xfs_defer_ijoin(args.dfops, dp); error = xfs_defer_finish(&args.trans, args.dfops); if (error) @@ -337,13 +345,14 @@ xfs_attr_set( /* * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf. + * transaction to add the new attribute to the leaf, which + * means that we have to hold & join the leaf buffer here too. */ - error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; - + xfs_trans_bjoin(args.trans, leaf_bp); + leaf_bp = NULL; } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -374,8 +383,9 @@ xfs_attr_set( out_defer_cancel: xfs_defer_cancel(&dfops); - args.trans = NULL; out: + if (leaf_bp) + xfs_trans_brelse(args.trans, leaf_bp); if (args.trans) xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); @@ -707,7 +717,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) return error; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; return error; } @@ -760,7 +769,6 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) return 0; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; return error; } @@ -1035,7 +1043,6 @@ out: return retval; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; goto out; } @@ -1176,7 +1183,6 @@ out: return error; out_defer_cancel: xfs_defer_cancel(args->dfops); - args->trans = NULL; goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 53cc8b986eac..2135b8e67dcc 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -247,14 +247,15 @@ xfs_attr3_leaf_hdr_to_disk( } } -static bool +static xfs_failaddr_t xfs_attr3_leaf_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr_leafblock *leaf = bp->b_addr; - struct xfs_perag *pag = bp->b_pag; - struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_perag *pag = bp->b_pag; + struct xfs_attr_leaf_entry *entries; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -262,17 +263,17 @@ xfs_attr3_leaf_verify( struct xfs_da3_node_hdr *hdr3 = bp->b_addr; if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) - return false; + return __this_address; if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return false; + return __this_address; } else { if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) - return false; + return __this_address; } /* * In recovery there is a transient state where count == 0 is valid @@ -280,12 +281,27 @@ xfs_attr3_leaf_verify( * if the attr didn't fit in shortform. */ if (pag && pag->pagf_init && ichdr.count == 0) - return false; + return __this_address; + + /* + * firstused is the block offset of the first name info structure. + * Make sure it doesn't go off the block or crash into the header. + */ + if (ichdr.firstused > mp->m_attr_geo->blksize) + return __this_address; + if (ichdr.firstused < xfs_attr3_leaf_hdr_size(leaf)) + return __this_address; + + /* Make sure the entries array doesn't crash into the name info. */ + entries = xfs_attr3_leaf_entryp(bp->b_addr); + if ((char *)&entries[ichdr.count] > + (char *)bp->b_addr + ichdr.firstused) + return __this_address; /* XXX: need to range check rest of attr header values */ /* XXX: hash order check? */ - return true; + return NULL; } static void @@ -293,12 +309,13 @@ xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_attr3_leaf_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_attr3_leaf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -322,21 +339,23 @@ xfs_attr3_leaf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_attr3_leaf_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_attr3_leaf_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, + .verify_struct = xfs_attr3_leaf_verify, }; int @@ -735,10 +754,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) } /* - * Convert from using the shortform to the leaf. + * Convert from using the shortform to the leaf. On success, return the + * buffer so that we can keep it locked until we're totally done with it. */ int -xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +xfs_attr_shortform_to_leaf( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) { xfs_inode_t *dp; xfs_attr_shortform_t *sf; @@ -818,7 +840,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } error = 0; - + *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; @@ -867,6 +889,80 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } +/* Verify the consistency of an inline attribute fork. */ +xfs_failaddr_t +xfs_attr_shortform_verify( + struct xfs_inode *ip) +{ + struct xfs_attr_shortform *sfp; + struct xfs_attr_sf_entry *sfep; + struct xfs_attr_sf_entry *next_sfep; + char *endp; + struct xfs_ifork *ifp; + int i; + int size; + + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); + ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + size = ifp->if_bytes; + + /* + * Give up if the attribute is way too short. + */ + if (size < sizeof(struct xfs_attr_sf_hdr)) + return __this_address; + + endp = (char *)sfp + size; + + /* Check all reported entries */ + sfep = &sfp->list[0]; + for (i = 0; i < sfp->hdr.count; i++) { + /* + * struct xfs_attr_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ + if (((char *)sfep + sizeof(*sfep)) >= endp) + return __this_address; + + /* Don't allow names with known bad length. */ + if (sfep->namelen == 0) + return __this_address; + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep); + if ((char *)next_sfep > endp) + return __this_address; + + /* + * Check for unknown flags. Short form doesn't support + * the incomplete or local bits, so we can use the namespace + * mask here. + */ + if (sfep->flags & ~XFS_ATTR_NSP_ONDISK_MASK) + return __this_address; + + /* + * Check for invalid namespace combinations. We only allow + * one namespace flag per xattr, so we can just count the + * bits (i.e. hweight) here. + */ + if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) + return __this_address; + + sfep = next_sfep; + } + if ((void *)sfep != (void *)endp) + return __this_address; + + return NULL; +} + /* * Convert a leaf attribute list to shortform attribute list */ @@ -2170,7 +2266,8 @@ xfs_attr3_leaf_lookup_int( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - ASSERT(ichdr.count < args->geo->blksize / 8); + if (ichdr.count >= args->geo->blksize / 8) + return -EFSCORRUPTED; /* * Binary search. (note: small blocks will skip this loop) @@ -2186,8 +2283,10 @@ xfs_attr3_leaf_lookup_int( else break; } - ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); - ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); + if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) + return -EFSCORRUPTED; + if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) + return -EFSCORRUPTED; /* * Since we may have duplicate hashval's, find the first matching diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f7dda0c237b0..4da08af5b134 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -48,10 +48,12 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, + struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d56caf037ca0..21be186067a2 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -65,7 +65,7 @@ xfs_attr3_rmt_blocks( * does CRC, location and bounds checking, the unpacking function checks the * attribute parameters and owner. */ -static bool +static xfs_failaddr_t xfs_attr3_rmt_hdr_ok( void *ptr, xfs_ino_t ino, @@ -76,19 +76,19 @@ xfs_attr3_rmt_hdr_ok( struct xfs_attr3_rmt_hdr *rmt = ptr; if (bno != be64_to_cpu(rmt->rm_blkno)) - return false; + return __this_address; if (offset != be32_to_cpu(rmt->rm_offset)) - return false; + return __this_address; if (size != be32_to_cpu(rmt->rm_bytes)) - return false; + return __this_address; if (ino != be64_to_cpu(rmt->rm_owner)) - return false; + return __this_address; /* ok */ - return true; + return NULL; } -static bool +static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, void *ptr, @@ -98,27 +98,29 @@ xfs_attr3_rmt_verify( struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; + return __this_address; if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(rmt->rm_blkno) != bno) - return false; + return __this_address; if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) - return false; + return __this_address; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX) - return false; + return __this_address; if (rmt->rm_owner == 0) - return false; + return __this_address; - return true; + return NULL; } -static void -xfs_attr3_rmt_read_verify( - struct xfs_buf *bp) +static int +__xfs_attr3_rmt_read_verify( + struct xfs_buf *bp, + bool check_crc, + xfs_failaddr_t *failaddr) { struct xfs_mount *mp = bp->b_target->bt_mount; char *ptr; @@ -128,7 +130,7 @@ xfs_attr3_rmt_read_verify( /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; + return 0; ptr = bp->b_addr; bno = bp->b_bn; @@ -136,23 +138,48 @@ xfs_attr3_rmt_read_verify( ASSERT(len >= blksize); while (len > 0) { - if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { - xfs_buf_ioerror(bp, -EFSBADCRC); - break; - } - if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - break; + if (check_crc && + !xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { + *failaddr = __this_address; + return -EFSBADCRC; } + *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + if (*failaddr) + return -EFSCORRUPTED; len -= blksize; ptr += blksize; bno += BTOBB(blksize); } - if (bp->b_error) - xfs_verifier_error(bp); - else - ASSERT(len == 0); + if (len != 0) { + *failaddr = __this_address; + return -EFSCORRUPTED; + } + + return 0; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + int error; + + error = __xfs_attr3_rmt_read_verify(bp, true, &fa); + if (error) + xfs_verifier_error(bp, error, fa); +} + +static xfs_failaddr_t +xfs_attr3_rmt_verify_struct( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + int error; + + error = __xfs_attr3_rmt_read_verify(bp, false, &fa); + return error ? fa : NULL; } static void @@ -160,6 +187,7 @@ xfs_attr3_rmt_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; int blksize = mp->m_attr_geo->blksize; char *ptr; int len; @@ -177,9 +205,9 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -188,8 +216,7 @@ xfs_attr3_rmt_write_verify( * xfs_attr3_rmt_hdr_set() for the explanation. */ if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); return; } xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); @@ -198,13 +225,16 @@ xfs_attr3_rmt_write_verify( ptr += blksize; bno += BTOBB(blksize); } - ASSERT(len == 0); + + if (len != 0) + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, + .verify_struct = xfs_attr3_rmt_verify_struct, }; STATIC int @@ -269,7 +299,7 @@ xfs_attr_rmtval_copyout( byte_cnt = min(*valuelen, byte_cnt); if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, + if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, byte_cnt, bno)) { xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1210f684d3c2..daae00ed30c5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -400,7 +400,7 @@ xfs_bmap_check_leaf_extents( pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(mp, - XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_verify_fsbno(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -1220,7 +1220,7 @@ xfs_iread_extents( pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(mp, - XFS_FSB_SANITY_CHECK(mp, bno), out_brelse); + xfs_verify_fsbno(mp, bno), out_brelse); xfs_trans_brelse(tp, bp); } @@ -3337,6 +3337,49 @@ xfs_bmap_btalloc_filestreams( return 0; } +/* Update all inode and quota accounting for the allocation we just did. */ +static void +xfs_bmap_btalloc_accounting( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) +{ + if (ap->flags & XFS_BMAPI_COWFORK) { + /* + * COW fork blocks are in-core only and thus are treated as + * in-core quota reservation (like delalloc blocks) even when + * converted to real blocks. The quota reservation is not + * accounted to disk until blocks are remapped to the data + * fork. So if these blocks were previously delalloc, we + * already have quota reservation and there's nothing to do + * yet. + */ + if (ap->wasdel) + return; + + /* + * Otherwise, we've allocated blocks in a hole. The transaction + * has acquired in-core quota reservation for this extent. + * Rather than account these as real blocks, however, we reduce + * the transaction quota reservation based on the allocation. + * This essentially transfers the transaction quota reservation + * to that of a delalloc extent. + */ + ap->ip->i_delayed_blks += args->len; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, + -(long)args->len); + return; + } + + /* data/attr fork only */ + ap->ip->i_d.di_nblocks += args->len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args->len; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, + args->len); +} + STATIC int xfs_bmap_btalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -3347,6 +3390,8 @@ xfs_bmap_btalloc( xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_agnumber_t ag; xfs_alloc_arg_t args; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; xfs_extlen_t blen; xfs_extlen_t nextminlen = 0; int nullfb; /* true if ap->firstblock isn't set */ @@ -3356,6 +3401,8 @@ xfs_bmap_btalloc( int stripe_align; ASSERT(ap->length); + orig_offset = ap->offset; + orig_length = ap->length; mp = ap->ip->i_mount; @@ -3571,19 +3618,23 @@ xfs_bmap_btalloc( *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; - if (!(ap->flags & XFS_BMAPI_COWFORK)) - ap->ip->i_d.di_nblocks += args.len; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= args.len; /* - * Adjust the disk quota also. This was reserved - * earlier. + * If the extent size hint is active, we tried to round the + * caller's allocation request offset down to extsz and the + * length up to another extsz boundary. If we found a free + * extent we mapped it in starting at this new offset. If the + * newly mapped space isn't long enough to cover any of the + * range of offsets that was originally requested, move the + * mapping up so that we can fill as much of the caller's + * original request as possible. Free space is apparently + * very fragmented so we're unlikely to be able to satisfy the + * hints anyway. */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : - XFS_TRANS_DQ_BCOUNT, - (long) args.len); + if (ap->length <= orig_length) + ap->offset = orig_offset; + else if (ap->offset + ap->length < orig_offset + orig_length) + ap->offset = orig_offset + orig_length - ap->length; + xfs_bmap_btalloc_accounting(ap, &args); } else { ap->blkno = NULLFSBLOCK; ap->length = 0; @@ -3876,8 +3927,6 @@ xfs_bmapi_reserve_delalloc( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; - char rt = XFS_IS_REALTIME_INODE(ip); - xfs_extlen_t extsz; int error; xfs_fileoff_t aoff = off; @@ -3892,31 +3941,25 @@ xfs_bmapi_reserve_delalloc( prealloc = alen - len; /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) - extsz = xfs_get_cowextsz_hint(ip); - else - extsz = xfs_get_extsz_hint(ip); - if (extsz) { + if (whichfork == XFS_COW_FORK) { struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) prev.br_startoff = NULLFILEOFF; - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, 1, 0, &aoff, &alen); ASSERT(!error); } - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - /* * Make a transaction-less quota reservation for delayed allocation * blocks. This number gets adjusted later. We return if we haven't * allocated blocks already inside this loop. */ error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + XFS_QMOPT_RES_REGBLKS); if (error) return error; @@ -3927,12 +3970,7 @@ xfs_bmapi_reserve_delalloc( indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); ASSERT(indlen > 0); - if (rt) { - error = xfs_mod_frextents(mp, -((int64_t)extsz)); - } else { - error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); - } - + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); if (error) goto out_unreserve_quota; @@ -3963,14 +4001,11 @@ xfs_bmapi_reserve_delalloc( return 0; out_unreserve_blocks: - if (rt) - xfs_mod_frextents(mp, extsz); - else - xfs_mod_fdblocks(mp, alen, false); + xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, + XFS_QMOPT_RES_REGBLKS); return error; } @@ -4304,8 +4339,16 @@ xfs_bmapi_write( while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; - /* in hole or beyoned EOF? */ + /* in hole or beyond EOF? */ if (eof || bma.got.br_startoff > bno) { + /* + * CoW fork conversions should /never/ hit EOF or + * holes. There should always be something for us + * to work on. + */ + ASSERT(!((flags & XFS_BMAPI_CONVERT) && + (flags & XFS_BMAPI_COWFORK))); + if (flags & XFS_BMAPI_DELALLOC) { /* * For the COW fork we can reasonably get a @@ -4824,6 +4867,7 @@ xfs_bmap_del_extent_cow( xfs_iext_insert(ip, icur, &new, state); break; } + ip->i_delayed_blks -= del->br_blockcount; } /* @@ -5136,7 +5180,7 @@ __xfs_bunmapi( * blowing out the transaction with a mix of EFIs and reflink * adjustments. */ - if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); else max_len = len; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index c10aecaaae44..9faf479aba49 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -425,33 +425,29 @@ xfs_bmbt_diff_two_keys( be64_to_cpu(k2->bmbt.br_startoff); } -static bool +static xfs_failaddr_t xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; unsigned int level; switch (block->bb_magic) { case cpu_to_be32(XFS_BMAP_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) - return false; /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. */ - if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) - return false; + fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_BMAP_MAGIC): break; default: - return false; + return __this_address; } /* @@ -463,46 +459,39 @@ xfs_bmbt_verify( */ level = be16_to_cpu(block->bb_level); if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) - return false; - if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.l.bb_leftsib || - (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) - return false; - if (!block->bb_u.l.bb_rightsib || - (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) - return false; - - return true; + return __this_address; + + return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]); } static void xfs_bmbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_lblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_bmbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_bmbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_bmbt_write_verify( struct xfs_buf *bp) { - if (!xfs_bmbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_bmbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_lblock_calc_crc(bp); @@ -512,6 +501,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, + .verify_struct = xfs_bmbt_verify, }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5f33adf8eecb..79ee4a1951d1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -273,7 +273,7 @@ xfs_btree_lblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) return; @@ -311,7 +311,7 @@ xfs_btree_sblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) return; @@ -329,7 +329,7 @@ xfs_btree_sblock_verify_crc( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) - return false; + return __this_address; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } @@ -853,7 +853,7 @@ xfs_btree_read_bufl( xfs_daddr_t d; /* real disk block address */ int error; - if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) + if (!xfs_verify_fsbno(mp, fsbno)) return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, @@ -4529,6 +4529,51 @@ xfs_btree_change_owner( &bbcoi); } +/* Verify the v5 fields of a long-format btree block. */ +xfs_failaddr_t +xfs_btree_lblock_v5hdr_verify( + struct xfs_buf *bp, + uint64_t owner) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return __this_address; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn)) + return __this_address; + if (owner != XFS_RMAP_OWN_UNKNOWN && + be64_to_cpu(block->bb_u.l.bb_owner) != owner) + return __this_address; + return NULL; +} + +/* Verify a long-format btree block. */ +xfs_failaddr_t +xfs_btree_lblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return __this_address; + + /* sibling pointer verification */ + if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) + return __this_address; + if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) + return __this_address; + + return NULL; +} + /** * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block @@ -4537,7 +4582,7 @@ xfs_btree_change_owner( * @max_recs: pointer to the m_*_mxr max records field in the xfs mount * @pag_max_level: pointer to the per-ag max level field */ -bool +xfs_failaddr_t xfs_btree_sblock_v5hdr_verify( struct xfs_buf *bp) { @@ -4546,14 +4591,14 @@ xfs_btree_sblock_v5hdr_verify( struct xfs_perag *pag = bp->b_pag; if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; + return __this_address; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; + return __this_address; if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) - return false; - return true; + return __this_address; + return NULL; } /** @@ -4562,29 +4607,29 @@ xfs_btree_sblock_v5hdr_verify( * @bp: buffer containing the btree block * @max_recs: maximum records allowed in this btree node */ -bool +xfs_failaddr_t xfs_btree_sblock_verify( struct xfs_buf *bp, unsigned int max_recs) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_agblock_t agno; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) - return false; + return __this_address; /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; + agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); + if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) + return __this_address; + if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) + return __this_address; - return true; + return NULL; } /* @@ -4953,3 +4998,33 @@ xfs_btree_diff_two_ptrs( return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } + +/* If there's an extent, we're done. */ +STATIC int +xfs_btree_has_record_helper( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* Is there a record covering a given range of keys? */ +int +xfs_btree_has_record( + struct xfs_btree_cur *cur, + union xfs_btree_irec *low, + union xfs_btree_irec *high, + bool *exists) +{ + int error; + + error = xfs_btree_query_range(cur, low, high, + &xfs_btree_has_record_helper, NULL); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + *exists = true; + return 0; + } + *exists = false; + return error; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index b57501c6f71d..50440b5618e8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -473,10 +473,6 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) -#define XFS_FSB_SANITY_CHECK(mp,fsb) \ - (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ - XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) - /* * Trace hooks. Currently not implemented as they need to be ported * over to the generic tracing functionality, which is some effort. @@ -496,8 +492,14 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_BTREE_TRACE_ARGR(c, r) #define XFS_BTREE_TRACE_CURSOR(c, t) -bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); -bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); +xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); +xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, + unsigned int max_recs); +xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, + uint64_t owner); +xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, + unsigned int max_recs); + uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, unsigned long len); xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, @@ -545,5 +547,7 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); +int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, + union xfs_btree_irec *high, bool *exists); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 651611530d2f..ea187b4a7991 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -128,7 +128,7 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } -static bool +static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) { @@ -145,24 +145,24 @@ xfs_da3_node_verify( struct xfs_da3_node_hdr *hdr3 = bp->b_addr; if (ichdr.magic != XFS_DA3_NODE_MAGIC) - return false; + return __this_address; if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return false; + return __this_address; } else { if (ichdr.magic != XFS_DA_NODE_MAGIC) - return false; + return __this_address; } if (ichdr.level == 0) - return false; + return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) - return false; + return __this_address; if (ichdr.count == 0) - return false; + return __this_address; /* * we don't know if the node is for and attribute or directory tree, @@ -170,11 +170,11 @@ xfs_da3_node_verify( */ if (ichdr.count > mp->m_dir_geo->node_ents && ichdr.count > mp->m_attr_geo->node_ents) - return false; + return __this_address; /* XXX: hash order check? */ - return true; + return NULL; } static void @@ -182,12 +182,13 @@ xfs_da3_node_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_da3_node_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_da3_node_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -211,19 +212,20 @@ xfs_da3_node_read_verify( struct xfs_buf *bp) { struct xfs_da_blkinfo *info = bp->b_addr; + xfs_failaddr_t fa; switch (be16_to_cpu(info->magic)) { case XFS_DA3_NODE_MAGIC: if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { - xfs_buf_ioerror(bp, -EFSBADCRC); + xfs_verifier_error(bp, -EFSBADCRC, + __this_address); break; } /* fall through */ case XFS_DA_NODE_MAGIC: - if (!xfs_da3_node_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - break; - } + fa = xfs_da3_node_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: @@ -236,18 +238,40 @@ xfs_da3_node_read_verify( bp->b_ops->verify_read(bp); return; default: - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); break; } +} + +/* Verify the structure of a da3 block. */ +static xfs_failaddr_t +xfs_da3_node_verify_struct( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; - /* corrupt block */ - xfs_verifier_error(bp); + switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + case XFS_DA_NODE_MAGIC: + return xfs_da3_node_verify(bp); + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + bp->b_ops = &xfs_attr3_leaf_buf_ops; + return bp->b_ops->verify_struct(bp); + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; + return bp->b_ops->verify_struct(bp); + default: + return __this_address; + } } const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, + .verify_struct = xfs_da3_node_verify_struct, }; int diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 3771edcb301d..7e77299b7789 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -875,4 +875,10 @@ struct xfs_attr3_rmt_hdr { ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ sizeof(struct xfs_attr3_rmt_hdr) : 0)) +/* Number of bytes in a directory block. */ +static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) +{ + return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); +} + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 072ebfe1d6ae..087fea02c389 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -249,6 +249,10 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + /* Hold the (previously bjoin'd) buffer locked across the roll. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) + xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ @@ -264,6 +268,12 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) { + xfs_trans_bjoin(*tp, dop->dop_bufs[i]); + xfs_trans_bhold(*tp, dop->dop_bufs[i]); + } + return error; } @@ -295,6 +305,31 @@ xfs_defer_ijoin( } } + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Add this buffer to the deferred op. Each joined buffer is relogged + * each time we roll the transaction. + */ +int +xfs_defer_bjoin( + struct xfs_defer_ops *dop, + struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) { + if (dop->dop_bufs[i] == bp) + return 0; + else if (dop->dop_bufs[i] == NULL) { + dop->dop_bufs[i] = bp; + return 0; + } + } + + ASSERT(0); return -EFSCORRUPTED; } @@ -493,9 +528,7 @@ xfs_defer_init( struct xfs_defer_ops *dop, xfs_fsblock_t *fbp) { - dop->dop_committed = false; - dop->dop_low = false; - memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + memset(dop, 0, sizeof(struct xfs_defer_ops)); *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index d4f046dd44bd..045beacdd37d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,7 @@ enum xfs_defer_ops_type { }; #define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ struct xfs_defer_ops { bool dop_committed; /* did any trans commit? */ @@ -66,8 +67,9 @@ struct xfs_defer_ops { struct list_head dop_intake; /* unlogged pending work */ struct list_head dop_pending; /* logged pending work */ - /* relog these inodes with each roll */ + /* relog these with each roll */ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; + struct xfs_buf *dop_bufs[XFS_DEFER_OPS_NR_BUFS]; }; void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, @@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp); /* Description of a deferred type. */ struct xfs_defer_op_type { diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index e10778c102ea..92f94e190f04 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -119,8 +119,7 @@ xfs_da_mount( ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); - ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= - XFS_MAX_BLOCKSIZE); + ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); @@ -140,7 +139,7 @@ xfs_da_mount( dageo = mp->m_dir_geo; dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; dageo->fsblog = mp->m_sb.sb_blocklog; - dageo->blksize = 1 << dageo->blklog; + dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; /* diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 1a8f2cf977ca..388d67c5c903 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -340,5 +340,7 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) #define XFS_READDIR_BUFSIZE (32768) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); +void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, + struct xfs_dir2_data_hdr *hdr); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 43c902f7a68d..2da86a394bcf 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -58,7 +58,7 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } -static bool +static xfs_failaddr_t xfs_dir3_block_verify( struct xfs_buf *bp) { @@ -67,20 +67,18 @@ xfs_dir3_block_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) - return false; + return __this_address; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return false; + return __this_address; } - if (__xfs_dir3_data_check(NULL, bp)) - return false; - return true; + return __xfs_dir3_data_check(NULL, bp); } static void @@ -88,15 +86,16 @@ xfs_dir3_block_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_block_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_block_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -104,12 +103,13 @@ xfs_dir3_block_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_block_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_block_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -126,6 +126,7 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, + .verify_struct = xfs_dir3_block_verify, }; int diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 8727a43115ef..920279485275 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -36,9 +36,9 @@ /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Return 0 is the buffer is good, otherwise an error. + * Return NULL if the buffer is good, otherwise the address of the error. */ -int +xfs_failaddr_t __xfs_dir3_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ @@ -73,6 +73,14 @@ __xfs_dir3_data_check( */ ops = xfs_dir_get_ops(mp, dp); + /* + * If this isn't a directory, or we don't get handed the dir ops, + * something is seriously wrong. Bail out. + */ + if ((dp && !S_ISDIR(VFS_I(dp)->i_mode)) || + ops != xfs_dir_get_ops(mp, NULL)) + return __this_address; + hdr = bp->b_addr; p = (char *)ops->data_entry_p(hdr); @@ -81,7 +89,6 @@ __xfs_dir3_data_check( case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(geo, hdr); lep = xfs_dir2_block_leaf_p(btp); - endp = (char *)lep; /* * The number of leaf entries is limited by the size of the @@ -90,17 +97,19 @@ __xfs_dir3_data_check( * so just ensure that the count falls somewhere inside the * block right now. */ - XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) < - ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); + if (be32_to_cpu(btp->count) >= + ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)) + return __this_address; break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - endp = (char *)hdr + geo->blksize; break; default: - XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; + return __this_address; } + endp = xfs_dir3_data_endp(geo, hdr); + if (!endp) + return __this_address; /* * Account for zero bestfree entries. @@ -108,22 +117,25 @@ __xfs_dir3_data_check( bf = ops->data_bestfree_p(hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { - XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset); + if (bf[0].offset) + return __this_address; freeseen |= 1 << 0; } if (!bf[1].length) { - XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset); + if (bf[1].offset) + return __this_address; freeseen |= 1 << 1; } if (!bf[2].length) { - XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset); + if (bf[2].offset) + return __this_address; freeseen |= 1 << 2; } - XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >= - be16_to_cpu(bf[1].length)); - XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >= - be16_to_cpu(bf[2].length)); + if (be16_to_cpu(bf[0].length) < be16_to_cpu(bf[1].length)) + return __this_address; + if (be16_to_cpu(bf[1].length) < be16_to_cpu(bf[2].length)) + return __this_address; /* * Loop over the data/unused entries. */ @@ -135,22 +147,23 @@ __xfs_dir3_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); - XFS_WANT_CORRUPTED_RETURN(mp, endp >= - p + be16_to_cpu(dup->length)); - XFS_WANT_CORRUPTED_RETURN(mp, - be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)hdr); + if (lastfree != 0) + return __this_address; + if (endp < p + be16_to_cpu(dup->length)) + return __this_address; + if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != + (char *)dup - (char *)hdr) + return __this_address; dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); - XFS_WANT_CORRUPTED_RETURN(mp, - (freeseen & (1 << i)) == 0); + if ((freeseen & (1 << i)) != 0) + return __this_address; freeseen |= 1 << i; } else { - XFS_WANT_CORRUPTED_RETURN(mp, - be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + if (be16_to_cpu(dup->length) > + be16_to_cpu(bf[2].length)) + return __this_address; } p += be16_to_cpu(dup->length); lastfree = 1; @@ -163,16 +176,17 @@ __xfs_dir3_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); - XFS_WANT_CORRUPTED_RETURN(mp, - !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); - XFS_WANT_CORRUPTED_RETURN(mp, endp >= - p + ops->data_entsize(dep->namelen)); - XFS_WANT_CORRUPTED_RETURN(mp, - be16_to_cpu(*ops->data_entry_tag_p(dep)) == - (char *)dep - (char *)hdr); - XFS_WANT_CORRUPTED_RETURN(mp, - ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); + if (dep->namelen == 0) + return __this_address; + if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))) + return __this_address; + if (endp < p + ops->data_entsize(dep->namelen)) + return __this_address; + if (be16_to_cpu(*ops->data_entry_tag_p(dep)) != + (char *)dep - (char *)hdr) + return __this_address; + if (ops->data_get_ftype(dep) >= XFS_DIR3_FT_MAX) + return __this_address; count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || @@ -188,34 +202,52 @@ __xfs_dir3_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - XFS_WANT_CORRUPTED_RETURN(mp, - i < be32_to_cpu(btp->count)); + if (i >= be32_to_cpu(btp->count)) + return __this_address; } p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7); + if (freeseen != 7) + return __this_address; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; - if (i > 0) - XFS_WANT_CORRUPTED_RETURN(mp, - be32_to_cpu(lep[i].hashval) >= - be32_to_cpu(lep[i - 1].hashval)); + if (i > 0 && be32_to_cpu(lep[i].hashval) < + be32_to_cpu(lep[i - 1].hashval)) + return __this_address; } - XFS_WANT_CORRUPTED_RETURN(mp, count == - be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale)); + if (count != be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)) + return __this_address; + if (stale != be32_to_cpu(btp->stale)) + return __this_address; } - return 0; + return NULL; +} + +#ifdef DEBUG +void +xfs_dir3_data_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = __xfs_dir3_data_check(dp, bp); + if (!fa) + return; + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, + bp->b_addr, __FILE__, __LINE__, fa); + ASSERT(0); } +#endif -static bool +static xfs_failaddr_t xfs_dir3_data_verify( struct xfs_buf *bp) { @@ -224,20 +256,18 @@ xfs_dir3_data_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) - return false; + return __this_address; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return false; + return __this_address; } - if (__xfs_dir3_data_check(NULL, bp)) - return false; - return true; + return __xfs_dir3_data_check(NULL, bp); } /* @@ -263,8 +293,7 @@ xfs_dir3_data_reada_verify( bp->b_ops->verify_read(bp); return; default: - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); break; } } @@ -274,15 +303,16 @@ xfs_dir3_data_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_data_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_data_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -290,12 +320,13 @@ xfs_dir3_data_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_data_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_data_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -312,6 +343,7 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, + .verify_struct = xfs_dir3_data_verify, }; static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { @@ -515,7 +547,6 @@ xfs_dir2_data_freescan_int( struct xfs_dir2_data_hdr *hdr, int *loghead) { - xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ xfs_dir2_data_unused_t *dup; /* unused data entry */ struct xfs_dir2_data_free *bf; @@ -537,12 +568,7 @@ xfs_dir2_data_freescan_int( * Set up pointers. */ p = (char *)ops->data_entry_p(hdr); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { - btp = xfs_dir2_block_tail_p(geo, hdr); - endp = (char *)xfs_dir2_block_leaf_p(btp); - } else - endp = (char *)hdr + geo->blksize; + endp = xfs_dir3_data_endp(geo, hdr); /* * Loop over the block's entries. */ @@ -755,17 +781,9 @@ xfs_dir2_data_make_free( /* * Figure out where the end of the data area is. */ - if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - endptr = (char *)hdr + args->geo->blksize; - else { - xfs_dir2_block_tail_t *btp; /* block tail */ + endptr = xfs_dir3_data_endp(args->geo, hdr); + ASSERT(endptr != NULL); - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - btp = xfs_dir2_block_tail_p(args->geo, hdr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); - } /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. @@ -1067,3 +1085,21 @@ xfs_dir2_data_use_free( } *needscanp = needscan; } + +/* Find the end of the entry data in a data/block format dir block. */ +void * +xfs_dir3_data_endp( + struct xfs_da_geometry *geo, + struct xfs_dir2_data_hdr *hdr) +{ + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + return xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr)); + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + return (char *)hdr + geo->blksize; + default: + return NULL; + } +} diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 27297a689d9c..d7e630f41f9c 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -50,13 +50,7 @@ static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, * Pop an assert if something is wrong. */ #ifdef DEBUG -#define xfs_dir3_leaf_check(dp, bp) \ -do { \ - if (!xfs_dir3_leaf1_check((dp), (bp))) \ - ASSERT(0); \ -} while (0); - -STATIC bool +static xfs_failaddr_t xfs_dir3_leaf1_check( struct xfs_inode *dp, struct xfs_buf *bp) @@ -69,17 +63,32 @@ xfs_dir3_leaf1_check( if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return false; + return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) - return false; + return __this_address; return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); } + +static inline void +xfs_dir3_leaf_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_dir3_leaf1_check(dp, bp); + if (!fa) + return; + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, + bp->b_addr, __FILE__, __LINE__, fa); + ASSERT(0); +} #else #define xfs_dir3_leaf_check(dp, bp) #endif -bool +xfs_failaddr_t xfs_dir3_leaf_check_int( struct xfs_mount *mp, struct xfs_inode *dp, @@ -114,27 +123,27 @@ xfs_dir3_leaf_check_int( * We can deduce a value for that from di_size. */ if (hdr->count > ops->leaf_max_ents(geo)) - return false; + return __this_address; /* Leaves and bests don't overlap in leaf format. */ if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || hdr->magic == XFS_DIR3_LEAF1_MAGIC) && (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) - return false; + return __this_address; /* Check hash value order, count stale entries. */ for (i = stale = 0; i < hdr->count; i++) { if (i + 1 < hdr->count) { if (be32_to_cpu(ents[i].hashval) > be32_to_cpu(ents[i + 1].hashval)) - return false; + return __this_address; } if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } if (hdr->stale != stale) - return false; - return true; + return __this_address; + return NULL; } /* @@ -142,7 +151,7 @@ xfs_dir3_leaf_check_int( * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due * to incorrect magic numbers. */ -static bool +static xfs_failaddr_t xfs_dir3_leaf_verify( struct xfs_buf *bp, uint16_t magic) @@ -160,16 +169,16 @@ xfs_dir3_leaf_verify( : XFS_DIR3_LEAFN_MAGIC; if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return false; + return __this_address; if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return false; + return __this_address; } else { if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return false; + return __this_address; } return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); @@ -181,15 +190,16 @@ __read_verify( uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_leaf_verify(bp, magic)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_leaf_verify(bp, magic); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -198,12 +208,13 @@ __write_verify( uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_leaf_verify(bp, magic)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_leaf_verify(bp, magic); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -216,6 +227,13 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } +static xfs_failaddr_t +xfs_dir3_leaf1_verify( + struct xfs_buf *bp) +{ + return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + static void xfs_dir3_leaf1_read_verify( struct xfs_buf *bp) @@ -230,6 +248,13 @@ xfs_dir3_leaf1_write_verify( __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); } +static xfs_failaddr_t +xfs_dir3_leafn_verify( + struct xfs_buf *bp) +{ + return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + static void xfs_dir3_leafn_read_verify( struct xfs_buf *bp) @@ -248,12 +273,14 @@ const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", .verify_read = xfs_dir3_leaf1_read_verify, .verify_write = xfs_dir3_leaf1_write_verify, + .verify_struct = xfs_dir3_leaf1_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", .verify_read = xfs_dir3_leafn_read_verify, .verify_write = xfs_dir3_leafn_write_verify, + .verify_struct = xfs_dir3_leafn_verify, }; int diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 682e2bf370c7..239d97a64296 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -53,13 +53,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args, * Check internal consistency of a leafn block. */ #ifdef DEBUG -#define xfs_dir3_leaf_check(dp, bp) \ -do { \ - if (!xfs_dir3_leafn_check((dp), (bp))) \ - ASSERT(0); \ -} while (0); - -static bool +static xfs_failaddr_t xfs_dir3_leafn_check( struct xfs_inode *dp, struct xfs_buf *bp) @@ -72,17 +66,32 @@ xfs_dir3_leafn_check( if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return false; + return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) - return false; + return __this_address; return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); } + +static inline void +xfs_dir3_leaf_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_dir3_leafn_check(dp, bp); + if (!fa) + return; + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, + bp->b_addr, __FILE__, __LINE__, fa); + ASSERT(0); +} #else #define xfs_dir3_leaf_check(dp, bp) #endif -static bool +static xfs_failaddr_t xfs_dir3_free_verify( struct xfs_buf *bp) { @@ -93,21 +102,21 @@ xfs_dir3_free_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) - return false; + return __this_address; } else { if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return false; + return __this_address; } /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ - return true; + return NULL; } static void @@ -115,15 +124,16 @@ xfs_dir3_free_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dir3_free_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dir3_free_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -131,12 +141,13 @@ xfs_dir3_free_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_dir3_free_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_dir3_free_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -153,10 +164,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, + .verify_struct = xfs_dir3_free_verify, }; /* Everything ok in the free block header? */ -static bool +static xfs_failaddr_t xfs_dir3_free_header_check( struct xfs_inode *dp, xfs_dablk_t fbno, @@ -174,22 +186,22 @@ xfs_dir3_free_header_check( struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; if (be32_to_cpu(hdr3->firstdb) != firstdb) - return false; + return __this_address; if (be32_to_cpu(hdr3->nvalid) > maxbests) - return false; + return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) - return false; + return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; if (be32_to_cpu(hdr->firstdb) != firstdb) - return false; + return __this_address; if (be32_to_cpu(hdr->nvalid) > maxbests) - return false; + return __this_address; if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) - return false; + return __this_address; } - return true; + return NULL; } static int @@ -200,6 +212,7 @@ __xfs_dir3_free_read( xfs_daddr_t mappedbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, @@ -208,9 +221,9 @@ __xfs_dir3_free_read( return err; /* Check things that we can't do in the verifier. */ - if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { - xfs_buf_ioerror(*bpp, -EFSCORRUPTED); - xfs_verifier_error(*bpp); + fa = xfs_dir3_free_header_check(dp, fbno, *bpp); + if (fa) { + xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); xfs_trans_brelse(tp, *bpp); return -EFSCORRUPTED; } @@ -1906,7 +1919,7 @@ xfs_dir2_node_addname_int( (unsigned long long)ifbno, lastfbno); if (fblk) { xfs_alert(mp, - " fblk 0x%p blkno %llu index %d magic 0x%x", + " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", fblk, (unsigned long long)fblk->blkno, fblk->index, diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 4badd26c47e6..753aeeeffc18 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -39,12 +39,13 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); +extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir3_data_check(dp,bp) #endif -extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, + struct xfs_buf *bp); extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, @@ -89,8 +90,9 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, int lowstale, int highstale, int *lfloglow, int *lfloghigh); extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); -extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, - struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); +extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp, + struct xfs_inode *dp, struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf); /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, @@ -127,7 +129,7 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern int xfs_dir2_sf_verify(struct xfs_inode *ip); +extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index be8b9755f66a..0c75a7f00883 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -156,7 +156,6 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ { xfs_dir2_data_hdr_t *hdr; /* block header */ - xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused data pointer */ @@ -192,9 +191,8 @@ xfs_dir2_block_to_sf( /* * Set up to loop over the block's entries. */ - btp = xfs_dir2_block_tail_p(args->geo, hdr); ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); + endptr = xfs_dir3_data_endp(args->geo, hdr); sfep = xfs_dir2_sf_firstentry(sfp); /* * Loop over the active and unused entries. @@ -630,7 +628,7 @@ xfs_dir2_sf_check( #endif /* DEBUG */ /* Verify the consistency of an inline directory. */ -int +xfs_failaddr_t xfs_dir2_sf_verify( struct xfs_inode *ip) { @@ -665,7 +663,7 @@ xfs_dir2_sf_verify( */ if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) || size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; + return __this_address; endp = (char *)sfp + size; @@ -674,7 +672,7 @@ xfs_dir2_sf_verify( i8count = ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) - return error; + return __this_address; offset = dops->data_first_offset; /* Check all reported entries */ @@ -686,11 +684,11 @@ xfs_dir2_sf_verify( * within the data buffer. */ if (((char *)sfep + sizeof(*sfep)) >= endp) - return -EFSCORRUPTED; + return __this_address; /* Don't allow names with known bad length. */ if (sfep->namelen == 0) - return -EFSCORRUPTED; + return __this_address; /* * Check that the variable-length part of the structure is @@ -699,23 +697,23 @@ xfs_dir2_sf_verify( */ next_sfep = dops->sf_nextentry(sfp, sfep); if (endp < (char *)next_sfep) - return -EFSCORRUPTED; + return __this_address; /* Check that the offsets always increase. */ if (xfs_dir2_sf_get_offset(sfep) < offset) - return -EFSCORRUPTED; + return __this_address; /* Check the inode number. */ ino = dops->sf_get_ino(sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) - return error; + return __this_address; /* Check the file type. */ filetype = dops->sf_get_ftype(sfep); if (filetype >= XFS_DIR3_FT_MAX) - return -EFSCORRUPTED; + return __this_address; offset = xfs_dir2_sf_get_offset(sfep) + dops->data_entsize(sfep->namelen); @@ -723,16 +721,16 @@ xfs_dir2_sf_verify( sfep = next_sfep; } if (i8count != sfp->i8count) - return -EFSCORRUPTED; + return __this_address; if ((void *)sfep != (void *)endp) - return -EFSCORRUPTED; + return __this_address; /* Make sure this whole thing ought to be in local format. */ if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize) - return -EFSCORRUPTED; + return __this_address; - return 0; + return NULL; } /* diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 747085b4ef44..8b7a6c3cb599 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -42,18 +42,14 @@ xfs_calc_dquots_per_chunk( /* * Do some primitive error checking on ondisk dquot data structures. */ -int -xfs_dqcheck( +xfs_failaddr_t +xfs_dquot_verify( struct xfs_mount *mp, xfs_disk_dquot_t *ddq, xfs_dqid_t id, uint type, /* used only when IO_dorepair is true */ - uint flags, - const char *str) + uint flags) { - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; - /* * We can encounter an uninitialized dquot buffer for 2 reasons: * 1. If we crash while deleting the quotainode(s), and those blks got @@ -69,87 +65,57 @@ xfs_dqcheck( * This is all fine; things are still consistent, and we haven't lost * any quota information. Just don't complain about bad dquot blks. */ - if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", - str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); - errs++; - } - if (ddq->d_version != XFS_DQUOT_VERSION) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", - str, id, ddq->d_version, XFS_DQUOT_VERSION); - errs++; - } + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) + return __this_address; + if (ddq->d_version != XFS_DQUOT_VERSION) + return __this_address; if (ddq->d_flags != XFS_DQ_USER && ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, unknown flags 0x%x", - str, id, ddq->d_flags); - errs++; - } + ddq->d_flags != XFS_DQ_GROUP) + return __this_address; - if (id != -1 && id != be32_to_cpu(ddq->d_id)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : ondisk-dquot 0x%p, ID mismatch: " - "0x%x expected, found id 0x%x", - str, ddq, id, be32_to_cpu(ddq->d_id)); - errs++; - } + if (id != -1 && id != be32_to_cpu(ddq->d_id)) + return __this_address; - if (!errs && ddq->d_id) { - if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) > - be64_to_cpu(ddq->d_blk_softlimit)) { - if (!ddq->d_btimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) > - be64_to_cpu(ddq->d_ino_softlimit)) { - if (!ddq->d_itimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) > - be64_to_cpu(ddq->d_rtb_softlimit)) { - if (!ddq->d_rtbtimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - } + if (!ddq->d_id) + return NULL; + + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit) && + !ddq->d_btimer) + return __this_address; + + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit) && + !ddq->d_itimer) + return __this_address; - if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) - return errs; + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit) && + !ddq->d_rtbtimer) + return __this_address; + + return NULL; +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_dquot_repair( + struct xfs_mount *mp, + struct xfs_disk_dquot *ddq, + xfs_dqid_t id, + uint type) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)ddq; - if (flags & XFS_QMOPT_DOWARN) - xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); /* * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); - ASSERT(flags & XFS_QMOPT_DQREPAIR); memset(d, 0, sizeof(xfs_dqblk_t)); d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); @@ -163,7 +129,7 @@ xfs_dqcheck( XFS_DQUOT_CRC_OFF); } - return errs; + return 0; } STATIC bool @@ -198,13 +164,13 @@ xfs_dquot_buf_verify_crc( return true; } -STATIC bool +STATIC xfs_failaddr_t xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp, - int warn) + struct xfs_buf *bp) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + xfs_failaddr_t fa; xfs_dqid_t id = 0; int ndquots; int i; @@ -228,33 +194,43 @@ xfs_dquot_buf_verify( */ for (i = 0; i < ndquots; i++) { struct xfs_disk_dquot *ddq; - int error; ddq = &d[i].dd_diskdq; if (i == 0) id = be32_to_cpu(ddq->d_id); - error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); - if (error) - return false; + fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0); + if (fa) + return fa; } - return true; + + return NULL; +} + +static xfs_failaddr_t +xfs_dquot_buf_verify_struct( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + return xfs_dquot_buf_verify(mp, bp); } static void xfs_dquot_buf_read_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (!xfs_dquot_buf_verify_crc(mp, bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_dquot_buf_verify(mp, bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); + } } /* @@ -270,7 +246,7 @@ xfs_dquot_buf_readahead_verify( struct xfs_mount *mp = bp->b_target->bt_mount; if (!xfs_dquot_buf_verify_crc(mp, bp) || - !xfs_dquot_buf_verify(mp, bp, 0)) { + xfs_dquot_buf_verify(mp, bp) != NULL) { xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; } @@ -283,21 +259,21 @@ xfs_dquot_buf_readahead_verify( */ static void xfs_dquot_buf_write_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; - if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); - return; - } + fa = xfs_dquot_buf_verify(mp, bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); } const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, + .verify_struct = xfs_dquot_buf_verify_struct, }; const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b90924104596..faf1a4edd618 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -233,6 +233,13 @@ typedef struct xfs_fsop_resblks { #define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL) #define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL) +/* + * Limits on sb_agblocks/sb_agblklog -- mkfs won't format AGs smaller than + * 16MB or larger than 1TB. + */ +#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */ +#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */ + /* keep the maximum size under 2^31 by a small amount */ #define XFS_MAX_LOG_BYTES \ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 3b57ef0f2f76..0e2cf5f0be1f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2491,7 +2491,7 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif -static bool +static xfs_failaddr_t xfs_agi_verify( struct xfs_buf *bp) { @@ -2500,28 +2500,28 @@ xfs_agi_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) - return false; + return __this_address; } /* * Validate the magic number of the agi block. */ if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) - return false; + return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) - return false; + return __this_address; if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) - return false; + return __this_address; if (xfs_sb_version_hasfinobt(&mp->m_sb) && (be32_to_cpu(agi->agi_free_level) < 1 || be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) - return false; + return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -2530,10 +2530,10 @@ xfs_agi_verify( * so we can detect and avoid this problem. */ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) - return false; + return __this_address; xfs_check_agi_unlinked(agi); - return true; + return NULL; } static void @@ -2541,28 +2541,29 @@ xfs_agi_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, - XFS_ERRTAG_IALLOC_READ_AGI)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_agi_verify(bp); + if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void xfs_agi_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; - if (!xfs_agi_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_agi_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -2578,6 +2579,7 @@ const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, + .verify_struct = xfs_agi_verify, }; /* @@ -2751,3 +2753,102 @@ xfs_verify_dir_ino( return false; return xfs_verify_ino(mp, ino); } + +/* Is there an inode record covering a given range of inode numbers? */ +int +xfs_ialloc_has_inode_record( + struct xfs_btree_cur *cur, + xfs_agino_t low, + xfs_agino_t high, + bool *exists) +{ + struct xfs_inobt_rec_incore irec; + xfs_agino_t agino; + uint16_t holemask; + int has_record; + int i; + int error; + + *exists = false; + error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); + while (error == 0 && has_record) { + error = xfs_inobt_get_rec(cur, &irec, &has_record); + if (error || irec.ir_startino > high) + break; + + agino = irec.ir_startino; + holemask = irec.ir_holemask; + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, + i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { + if (holemask & 1) + continue; + if (agino + XFS_INODES_PER_HOLEMASK_BIT > low && + agino <= high) { + *exists = true; + return 0; + } + } + + error = xfs_btree_increment(cur, 0, &has_record); + } + return error; +} + +/* Is there an inode record covering a given extent? */ +int +xfs_ialloc_has_inodes_at_extent( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + xfs_agino_t low; + xfs_agino_t high; + + low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0); + high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1; + + return xfs_ialloc_has_inode_record(cur, low, high, exists); +} + +struct xfs_ialloc_count_inodes { + xfs_agino_t count; + xfs_agino_t freecount; +}; + +/* Record inode counts across all inobt records. */ +STATIC int +xfs_ialloc_count_inodes_rec( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xfs_ialloc_count_inodes *ci = priv; + + xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); + ci->count += irec.ir_count; + ci->freecount += irec.ir_freecount; + + return 0; +} + +/* Count allocated and free inodes under an inobt. */ +int +xfs_ialloc_count_inodes( + struct xfs_btree_cur *cur, + xfs_agino_t *count, + xfs_agino_t *freecount) +{ + struct xfs_ialloc_count_inodes ci = {0}; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); + if (error) + return error; + + *count = ci.count; + *freecount = ci.freecount; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 66a8de0b1caa..c5402bb4ce0c 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -170,6 +170,12 @@ int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); +int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, + xfs_agblock_t bno, xfs_extlen_t len, bool *exists); +int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, + xfs_agino_t high, bool *exists); +int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, + xfs_agino_t *freecount); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 317caba9faa6..af197a5f3a82 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -141,21 +141,42 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { + if (cur->bc_mp->m_inotbt_nores) + return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); } STATIC int -xfs_inobt_free_block( +__xfs_inobt_free_block( struct xfs_btree_cur *cur, - struct xfs_buf *bp) + struct xfs_buf *bp, + enum xfs_ag_resv_type resv) { struct xfs_owner_info oinfo; xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo, XFS_AG_RESV_NONE); + &oinfo, resv); +} + +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + if (cur->bc_mp->m_inotbt_nores) + return xfs_inobt_free_block(cur, bp); + return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } STATIC int @@ -250,12 +271,13 @@ xfs_inobt_diff_two_keys( be32_to_cpu(k2->inobt.ir_startino); } -static int +static xfs_failaddr_t xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; unsigned int level; /* @@ -271,20 +293,21 @@ xfs_inobt_verify( switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): case cpu_to_be32(XFS_FIBT_CRC_MAGIC): - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): case cpu_to_be32(XFS_FIBT_MAGIC): break; default: - return 0; + return NULL; } /* level verification */ level = be16_to_cpu(block->bb_level); if (level >= mp->m_in_maxlevels) - return false; + return __this_address; return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]); } @@ -293,25 +316,30 @@ static void xfs_inobt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_inobt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_inobt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_inobt_write_verify( struct xfs_buf *bp) { - if (!xfs_inobt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_inobt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -322,6 +350,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, }; STATIC int @@ -372,7 +401,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, .alloc_block = xfs_finobt_alloc_block, - .free_block = xfs_inobt_free_block, + .free_block = xfs_finobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 89bf16b4d937..b0f31791c7e6 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -632,8 +632,6 @@ xfs_iext_insert( struct xfs_iext_leaf *new = NULL; int nr_entries, i; - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (ifp->if_height == 0) xfs_iext_alloc_root(ifp, cur); else if (ifp->if_height == 1) @@ -661,6 +659,8 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); + if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 6b7989038d75..4fe17b368316 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -32,6 +32,8 @@ #include "xfs_ialloc.h" #include "xfs_dir2.h" +#include <linux/iversion.h> + /* * Check that none of the inode's in the buffer have a next * unlinked field of 0. @@ -113,8 +115,7 @@ xfs_inode_buf_verify( return; } - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", @@ -264,7 +265,8 @@ xfs_inode_from_disk( to->di_flags = be16_to_cpu(from->di_flags); if (to->di_version == 3) { - inode->i_version = be64_to_cpu(from->di_changecount); + inode_set_iversion_queried(inode, + be64_to_cpu(from->di_changecount)); to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); @@ -314,7 +316,7 @@ xfs_inode_to_disk( to->di_flags = cpu_to_be16(from->di_flags); if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(inode->i_version); + to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); @@ -381,7 +383,7 @@ xfs_log_dinode_to_disk( } } -bool +xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, xfs_ino_t ino, @@ -390,53 +392,122 @@ xfs_dinode_verify( uint16_t mode; uint16_t flags; uint64_t flags2; + uint64_t di_size; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) - return false; + return __this_address; + + /* Verify v3 integrity information first */ + if (dip->di_version >= 3) { + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return __this_address; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + return __this_address; + if (be64_to_cpu(dip->di_ino) != ino) + return __this_address; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + } /* don't allow invalid i_size */ - if (be64_to_cpu(dip->di_size) & (1ULL << 63)) - return false; + di_size = be64_to_cpu(dip->di_size); + if (di_size & (1ULL << 63)) + return __this_address; mode = be16_to_cpu(dip->di_mode); if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) - return false; + return __this_address; /* No zero-length symlinks/dirs. */ - if ((S_ISLNK(mode) || S_ISDIR(mode)) && dip->di_size == 0) - return false; + if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) + return __this_address; + + /* Fork checks carried over from xfs_iformat_fork */ + if (mode && + be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks)) + return __this_address; + + if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) + return __this_address; + + flags = be16_to_cpu(dip->di_flags); + + if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) + return __this_address; + + /* Do we have appropriate data fork formats for the mode? */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (dip->di_format != XFS_DINODE_FMT_DEV) + return __this_address; + break; + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (S_ISREG(mode)) + return __this_address; + if (di_size > XFS_DFORK_DSIZE(dip, mp)) + return __this_address; + /* fall through */ + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return __this_address; + } + break; + case 0: + /* Uninitialized inode ok. */ + break; + default: + return __this_address; + } + + if (XFS_DFORK_Q(dip)) { + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return __this_address; + } + } /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) - return true; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, - XFS_DINODE_CRC_OFF)) - return false; - if (be64_to_cpu(dip->di_ino) != ino) - return false; - if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return NULL; - flags = be16_to_cpu(dip->di_flags); flags2 = be64_to_cpu(dip->di_flags2); /* don't allow reflink/cowextsize if we don't have reflink */ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && !xfs_sb_version_hasreflink(&mp->m_sb)) - return false; + return __this_address; + + /* only regular files get reflink */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (mode & S_IFMT) != S_IFREG) + return __this_address; /* don't let reflink and realtime mix */ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) - return false; + return __this_address; /* don't let reflink and dax mix */ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) - return false; + return __this_address; - return true; + return NULL; } void @@ -476,6 +547,7 @@ xfs_iread( { xfs_buf_t *bp; xfs_dinode_t *dip; + xfs_failaddr_t fa; int error; /* @@ -507,11 +579,10 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { - xfs_alert(mp, "%s: validation failed for inode %lld", - __func__, ip->i_ino); - - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + fa = xfs_dinode_verify(mp, ip->i_ino, dip); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip, + sizeof(*dip), fa); error = -EFSCORRUPTED; goto out_brelse; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index a9c97a356c30..8a5e1da52d74 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -82,7 +82,7 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ -bool xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, - struct xfs_dinode *dip); +xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, + struct xfs_dinode *dip); #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index c79a1616b79d..866d2861c625 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -35,6 +35,8 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "xfs_shared.h" kmem_zone_t *xfs_ifork_zone; @@ -62,69 +64,11 @@ xfs_iformat_fork( int error = 0; xfs_fsize_t di_size; - if (unlikely(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks))) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", - (unsigned long long)ip->i_ino, - (int)(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents)), - (unsigned long long) - be64_to_cpu(dip->di_nblocks)); - XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { - xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", - (unsigned long long)ip->i_ino, - dip->di_forkoff); - XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && - !ip->i_mount->m_rtdev_targp)) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, has realtime flag set.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely(xfs_is_reflink_inode(ip) && !S_ISREG(inode->i_mode))) { - xfs_warn(ip->i_mount, - "corrupt dinode %llu, wrong file type for reflink.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; - } - - if (unlikely(xfs_is_reflink_inode(ip) && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) { - xfs_warn(ip->i_mount, - "corrupt dinode %llu, has reflink+realtime flag set.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; - } - switch (inode->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: case S_IFSOCK: - if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { - XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } ip->i_d.di_size = 0; inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); break; @@ -134,32 +78,7 @@ xfs_iformat_fork( case S_IFDIR: switch (dip->di_format) { case XFS_DINODE_FMT_LOCAL: - /* - * no local regular files yet - */ - if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (local format for regular file).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(4)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - di_size = be64_to_cpu(dip->di_size); - if (unlikely(di_size < 0 || - di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %Ld for local inode).", - (unsigned long long) ip->i_ino, - (long long) di_size); - XFS_CORRUPTION_ERROR("xfs_iformat(5)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return -EFSCORRUPTED; - } - size = (int)di_size; error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); break; @@ -170,28 +89,16 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); break; default: - XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, - ip->i_mount); return -EFSCORRUPTED; } break; default: - XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); return -EFSCORRUPTED; } if (error) return error; - /* Check inline dir contents. */ - if (S_ISDIR(inode->i_mode) && dip->di_format == XFS_DINODE_FMT_LOCAL) { - error = xfs_dir2_sf_verify(ip); - if (error) { - xfs_idestroy_fork(ip, XFS_DATA_FORK); - return error; - } - } - if (xfs_is_reflink_inode(ip)) { ASSERT(ip->i_cowfp == NULL); xfs_ifork_init_cow(ip); @@ -208,18 +115,6 @@ xfs_iformat_fork( atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); size = be16_to_cpu(atp->hdr.totsize); - if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad attr fork size %Ld).", - (unsigned long long) ip->i_ino, - (long long) size); - XFS_CORRUPTION_ERROR("xfs_iformat(8)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - error = -EFSCORRUPTED; - break; - } - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); break; case XFS_DINODE_FMT_EXTENTS: @@ -403,6 +298,7 @@ xfs_iformat_btree( */ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= XFS_IFORK_MAXEXT(ip, whichfork) || + nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || @@ -827,3 +723,45 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } + +/* Default fork content verifiers. */ +struct xfs_ifork_ops xfs_default_ifork_ops = { + .verify_attr = xfs_attr_shortform_verify, + .verify_dir = xfs_dir2_sf_verify, + .verify_symlink = xfs_symlink_shortform_verify, +}; + +/* Verify the inline contents of the data fork of an inode. */ +xfs_failaddr_t +xfs_ifork_verify_data( + struct xfs_inode *ip, + struct xfs_ifork_ops *ops) +{ + /* Non-local data fork, we're done. */ + if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return NULL; + + /* Check the inline data fork if there is one. */ + switch (VFS_I(ip)->i_mode & S_IFMT) { + case S_IFDIR: + return ops->verify_dir(ip); + case S_IFLNK: + return ops->verify_symlink(ip); + default: + return NULL; + } +} + +/* Verify the inline contents of the attr fork of an inode. */ +xfs_failaddr_t +xfs_ifork_verify_attr( + struct xfs_inode *ip, + struct xfs_ifork_ops *ops) +{ + /* There has to be an attr fork allocated if aformat is local. */ + if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return NULL; + if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK)) + return __this_address; + return ops->verify_attr(ip); +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index b9f0098e33b8..dd8aba0dd119 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -186,4 +186,18 @@ extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); +typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *); + +struct xfs_ifork_ops { + xfs_ifork_verifier_t verify_symlink; + xfs_ifork_verifier_t verify_dir; + xfs_ifork_verifier_t verify_attr; +}; +extern struct xfs_ifork_ops xfs_default_ifork_ops; + +xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip, + struct xfs_ifork_ops *ops); +xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip, + struct xfs_ifork_ops *ops); + #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index c10597973333..cc4cbe290939 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -55,7 +55,7 @@ xfs_log_calc_max_attrsetm_res( * the maximum one in terms of the pre-calculated values which were done * at mount time. */ -STATIC void +void xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index d69c772271cb..bb1b13a9b5f4 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -112,8 +112,6 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ #define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */ @@ -153,8 +151,11 @@ typedef uint16_t xfs_qwarncnt_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) -extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, - xfs_dqid_t id, uint type, uint flags, const char *str); +extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, + struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type, + uint flags); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); +extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, + xfs_dqid_t id, uint type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 585b35d34142..bee68c23d612 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1488,27 +1488,12 @@ __xfs_refcount_cow_alloc( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Add refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); - if (error) - return error; - - /* Add rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* @@ -1521,27 +1506,12 @@ __xfs_refcount_cow_free( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Remove refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_FREE, dfops); - if (error) - return error; - - /* Remove rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_free_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* Record a CoW staging extent in the refcount btree. */ @@ -1552,11 +1522,19 @@ xfs_refcount_alloc_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + error = __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, fsb, len); + if (error) + return error; + + /* Add rmap entry */ + return xfs_rmap_alloc_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ @@ -1567,9 +1545,17 @@ xfs_refcount_free_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; + /* Remove rmap entry */ + error = xfs_rmap_free_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + if (error) + return error; + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, fsb, len); } @@ -1710,3 +1696,22 @@ out_cursor: xfs_trans_brelse(tp, agbp); goto out_trans; } + +/* Is there a record covering a given extent? */ +int +xfs_refcount_has_record( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + union xfs_btree_irec low; + union xfs_btree_irec high; + + memset(&low, 0, sizeof(low)); + low.rc.rc_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.rc.rc_startblock = bno + len - 1; + + return xfs_btree_has_record(cur, &low, &high, exists); +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index eafb9d1f3b37..2a731ac68fe4 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -83,4 +83,7 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; } +extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, + xfs_agblock_t bno, xfs_extlen_t len, bool *exists); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 3c59dd3d58d7..8479769e470d 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -223,29 +223,31 @@ xfs_refcountbt_diff_two_keys( be32_to_cpu(k2->refc.rc_startblock); } -STATIC bool +STATIC xfs_failaddr_t xfs_refcountbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; + xfs_failaddr_t fa; unsigned int level; if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) - return false; + return __this_address; if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return false; - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + return __this_address; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; level = be16_to_cpu(block->bb_level); if (pag && pag->pagf_init) { if (level >= pag->pagf_refcount_level) - return false; + return __this_address; } else if (level >= mp->m_refc_maxlevels) - return false; + return __this_address; return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); } @@ -254,25 +256,30 @@ STATIC void xfs_refcountbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_refcountbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_refcountbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } STATIC void xfs_refcountbt_write_verify( struct xfs_buf *bp) { - if (!xfs_refcountbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_refcountbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -283,6 +290,7 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, + .verify_struct = xfs_refcountbt_verify, }; STATIC int diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index dd019cee1b3b..79822cf6ebe3 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -368,6 +368,51 @@ xfs_rmap_lookup_le_range( } /* + * Perform all the relevant owner checks for a removal op. If we're doing an + * unknown-owner removal then we have no owner information to check. + */ +static int +xfs_rmap_free_check_owner( + struct xfs_mount *mp, + uint64_t ltoff, + struct xfs_rmap_irec *rec, + xfs_fsblock_t bno, + xfs_filblks_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int error = 0; + + if (owner == XFS_RMAP_OWN_UNKNOWN) + return 0; + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (rec->rm_flags & XFS_RMAP_UNWRITTEN), out); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out); + + /* Check the offset, if necessary. */ + if (XFS_RMAP_NON_INODE_OWNER(owner)) + goto out; + + if (flags & XFS_RMAP_BMBT_BLOCK) { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK, + out); + } else { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out); + XFS_WANT_CORRUPTED_GOTO(mp, + ltoff + rec->rm_blockcount >= offset + len, + out); + } + +out: + return error; +} + +/* * Find the extent in the rmap btree and remove it. * * The record we find should always be an exact match for the extent that we're @@ -444,33 +489,40 @@ xfs_rmap_unmap( goto out_done; } - /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + /* + * If we're doing an unknown-owner removal for EFI recovery, we expect + * to find the full range in the rmapbt or nothing at all. If we + * don't find any rmaps overlapping either end of the range, we're + * done. Hopefully this means that the EFI creator already queued + * (and finished) a RUI to remove the rmap. + */ + if (owner == XFS_RMAP_OWN_UNKNOWN && + ltrec.rm_startblock + ltrec.rm_blockcount <= bno) { + struct xfs_rmap_irec rtrec; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + if (i == 0) + goto out_done; + error = xfs_rmap_get_rec(cur, &rtrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (rtrec.rm_startblock >= bno + len) + goto out_done; + } /* Make sure the extent we found covers the entire freeing range. */ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); - - /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || - XFS_RMAP_NON_INODE_OWNER(owner), out_error); + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); - /* Check the offset, if necessary. */ - if (!XFS_RMAP_NON_INODE_OWNER(owner)) { - if (flags & XFS_RMAP_BMBT_BLOCK) { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK, - out_error); - } else { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_offset <= offset, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, - ltoff + ltrec.rm_blockcount >= offset + len, - out_error); - } - } + /* Check owner information. */ + error = xfs_rmap_free_check_owner(mp, ltoff, <rec, bno, len, owner, + offset, flags); + if (error) + goto out_error; if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ @@ -664,6 +716,7 @@ xfs_rmap_map( flags |= XFS_RMAP_UNWRITTEN; trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, unwritten, oinfo); + ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* * For the initial lookup, look for an exact match or the left-adjacent @@ -2334,3 +2387,70 @@ xfs_rmap_compare( else return 0; } + +/* Is there a record covering a given extent? */ +int +xfs_rmap_has_record( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool *exists) +{ + union xfs_btree_irec low; + union xfs_btree_irec high; + + memset(&low, 0, sizeof(low)); + low.r.rm_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.r.rm_startblock = bno + len - 1; + + return xfs_btree_has_record(cur, &low, &high, exists); +} + +/* + * Is there a record for this owner completely covering a given physical + * extent? If so, *has_rmap will be set to true. If there is no record + * or the record only covers part of the range, we set *has_rmap to false. + * This function doesn't perform range lookups or offset checks, so it is + * not suitable for checking data fork blocks. + */ +int +xfs_rmap_record_exists( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + bool *has_rmap) +{ + uint64_t owner; + uint64_t offset; + unsigned int flags; + int has_record; + struct xfs_rmap_irec irec; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & XFS_RMAP_BMBT_BLOCK)); + + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + &has_record); + if (error) + return error; + if (!has_record) { + *has_rmap = false; + return 0; + } + + error = xfs_rmap_get_rec(cur, &irec, &has_record); + if (error) + return error; + if (!has_record) { + *has_rmap = false; + return 0; + } + + *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && + irec.rm_startblock + irec.rm_blockcount >= bno + len); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 466ede637080..380e53be98d5 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -61,7 +61,21 @@ static inline void xfs_rmap_skip_owner_update( struct xfs_owner_info *oi) { - oi->oi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL); +} + +static inline bool +xfs_rmap_should_skip_owner_update( + struct xfs_owner_info *oi) +{ + return oi->oi_owner == XFS_RMAP_OWN_NULL; +} + +static inline void +xfs_rmap_any_owner_update( + struct xfs_owner_info *oi) +{ + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN); } /* Reverse mapping functions. */ @@ -219,5 +233,10 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a, union xfs_btree_rec; int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); +int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, bool *exists); +int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, struct xfs_owner_info *oinfo, + bool *has_rmap); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 9d9c9192584c..e829c3e489ea 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -303,13 +303,14 @@ xfs_rmapbt_diff_two_keys( return 0; } -static bool +static xfs_failaddr_t xfs_rmapbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; + xfs_failaddr_t fa; unsigned int level; /* @@ -325,19 +326,20 @@ xfs_rmapbt_verify( * in this case. */ if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) - return false; + return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) - return false; - if (!xfs_btree_sblock_v5hdr_verify(bp)) - return false; + return __this_address; + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; level = be16_to_cpu(block->bb_level); if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) - return false; + return __this_address; } else if (level >= mp->m_rmap_maxlevels) - return false; + return __this_address; return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); } @@ -346,25 +348,30 @@ static void xfs_rmapbt_read_verify( struct xfs_buf *bp) { + xfs_failaddr_t fa; + if (!xfs_btree_sblock_verify_crc(bp)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_rmapbt_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rmapbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } - if (bp->b_error) { + if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_verifier_error(bp); - } } static void xfs_rmapbt_write_verify( struct xfs_buf *bp) { - if (!xfs_rmapbt_verify(bp)) { + xfs_failaddr_t fa; + + fa = xfs_rmapbt_verify(bp); + if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_sblock_calc_crc(bp); @@ -375,6 +382,7 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, + .verify_struct = xfs_rmapbt_verify, }; STATIC int diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 3fb29a5ea915..106be2d0bb88 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1097,3 +1097,24 @@ xfs_verify_rtbno( { return rtbno < mp->m_sb.sb_rblocks; } + +/* Is the given extent all free? */ +int +xfs_rtalloc_extent_is_free( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_rtblock_t start, + xfs_extlen_t len, + bool *is_free) +{ + xfs_rtblock_t end; + int matches; + int error; + + error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches); + if (error) + return error; + + *is_free = matches; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 9b5aae2bcc0b..a55f7a45fa78 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -40,6 +40,8 @@ #include "xfs_rmap_btree.h" #include "xfs_bmap.h" #include "xfs_refcount_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -116,6 +118,9 @@ xfs_mount_validate_sb( bool check_inprogress, bool check_version) { + uint32_t agcount = 0; + uint32_t rem; + if (sbp->sb_magicnum != XFS_SB_MAGIC) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; @@ -226,6 +231,13 @@ xfs_mount_validate_sb( return -EINVAL; } + /* Compute agcount for this number of dblocks and agblocks */ + if (sbp->sb_agblocks) { + agcount = div_u64_rem(sbp->sb_dblocks, sbp->sb_agblocks, &rem); + if (rem) + agcount++; + } + /* * More sanity checking. Most of these were stolen directly from * xfs_repair. @@ -250,6 +262,10 @@ xfs_mount_validate_sb( sbp->sb_inodesize != (1 << sbp->sb_inodelog) || sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || + XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || + XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || + sbp->sb_agblklog != xfs_highbit32(sbp->sb_agblocks - 1) + 1 || + agcount == 0 || agcount != sbp->sb_agcount || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || @@ -640,11 +656,10 @@ xfs_sb_read_verify( error = xfs_sb_verify(bp, true); out_error: - if (error) { + if (error == -EFSCORRUPTED || error == -EFSBADCRC) + xfs_verifier_error(bp, error, __this_address); + else if (error) xfs_buf_ioerror(bp, error); - if (error == -EFSCORRUPTED || error == -EFSBADCRC) - xfs_verifier_error(bp); - } } /* @@ -673,13 +688,12 @@ xfs_sb_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; int error; error = xfs_sb_verify(bp, false); if (error) { - xfs_buf_ioerror(bp, error); - xfs_verifier_error(bp); + xfs_verifier_error(bp, error, __this_address); return; } @@ -876,3 +890,88 @@ xfs_sync_sb( xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } + +int +xfs_fs_geometry( + struct xfs_sb *sbp, + struct xfs_fsop_geom *geo, + int struct_version) +{ + memset(geo, 0, sizeof(struct xfs_fsop_geom)); + + geo->blocksize = sbp->sb_blocksize; + geo->rtextsize = sbp->sb_rextsize; + geo->agblocks = sbp->sb_agblocks; + geo->agcount = sbp->sb_agcount; + geo->logblocks = sbp->sb_logblocks; + geo->sectsize = sbp->sb_sectsize; + geo->inodesize = sbp->sb_inodesize; + geo->imaxpct = sbp->sb_imax_pct; + geo->datablocks = sbp->sb_dblocks; + geo->rtblocks = sbp->sb_rblocks; + geo->rtextents = sbp->sb_rextents; + geo->logstart = sbp->sb_logstart; + BUILD_BUG_ON(sizeof(geo->uuid) != sizeof(sbp->sb_uuid)); + memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid)); + + if (struct_version < 2) + return 0; + + geo->sunit = sbp->sb_unit; + geo->swidth = sbp->sb_width; + + if (struct_version < 3) + return 0; + + geo->version = XFS_FSOP_GEOM_VERSION; + geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | + XFS_FSOP_GEOM_FLAGS_DIRV2; + if (xfs_sb_version_hasattr(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; + if (xfs_sb_version_hasquota(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; + if (xfs_sb_version_hasalign(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; + if (xfs_sb_version_hasdalign(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; + if (xfs_sb_version_hasextflgbit(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG; + if (xfs_sb_version_hassector(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; + if (xfs_sb_version_hasasciici(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; + if (xfs_sb_version_haslazysbcount(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; + if (xfs_sb_version_hasattr2(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; + if (xfs_sb_version_hasprojid32bit(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; + if (xfs_sb_version_hascrc(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; + if (xfs_sb_version_hasftype(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; + if (xfs_sb_version_hasfinobt(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; + if (xfs_sb_version_hassparseinodes(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; + if (xfs_sb_version_hasrmapbt(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; + if (xfs_sb_version_hasreflink(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; + if (xfs_sb_version_hassector(sbp)) + geo->logsectsize = sbp->sb_logsectsize; + else + geo->logsectsize = BBSIZE; + geo->rtsectsize = sbp->sb_blocksize; + geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); + + if (struct_version < 4) + return 0; + + if (xfs_sb_version_haslogv2(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; + + geo->logsunit = sbp->sb_logsunit; + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 961e6475a309..63dcd2a1a657 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -34,4 +34,8 @@ extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +#define XFS_FS_GEOM_MAX_STRUCT_VER (4) +extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, + int struct_version); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c6f4eb46fe26..d0b84da0cb1e 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -76,6 +76,9 @@ struct xfs_log_item_desc { int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); +struct xfs_trans_res; +void xfs_log_get_max_trans_res(struct xfs_mount *mp, + struct xfs_trans_res *max_resp); /* * Values for t_flags. @@ -143,5 +146,6 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); +xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index c484877129a0..5ef5f354587e 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -98,7 +98,7 @@ xfs_symlink_hdr_ok( return true; } -static bool +static xfs_failaddr_t xfs_symlink_verify( struct xfs_buf *bp) { @@ -106,22 +106,22 @@ xfs_symlink_verify( struct xfs_dsymlink_hdr *dsl = bp->b_addr; if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; + return __this_address; if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) - return false; + return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) - return false; + return __this_address; if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) - return false; + return __this_address; if (be32_to_cpu(dsl->sl_offset) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) - return false; + return __this_address; if (dsl->sl_owner == 0) - return false; + return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn))) - return false; + return __this_address; - return true; + return NULL; } static void @@ -129,18 +129,19 @@ xfs_symlink_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) - xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_symlink_verify(bp)) - xfs_buf_ioerror(bp, -EFSCORRUPTED); - - if (bp->b_error) - xfs_verifier_error(bp); + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_symlink_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } } static void @@ -148,15 +149,16 @@ xfs_symlink_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_symlink_verify(bp)) { - xfs_buf_ioerror(bp, -EFSCORRUPTED); - xfs_verifier_error(bp); + fa = xfs_symlink_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } @@ -171,6 +173,7 @@ const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, + .verify_struct = xfs_symlink_verify, }; void @@ -207,3 +210,37 @@ xfs_symlink_local_to_remote( xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } + +/* Verify the consistency of an inline symlink. */ +xfs_failaddr_t +xfs_symlink_shortform_verify( + struct xfs_inode *ip) +{ + char *sfp; + char *endp; + struct xfs_ifork *ifp; + int size; + + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + sfp = (char *)ifp->if_u1.if_data; + size = ifp->if_bytes; + endp = sfp + size; + + /* Zero length symlinks can exist while we're deleting a remote one. */ + if (size == 0) + return NULL; + + /* No negative sizes or overly long symlink targets. */ + if (size < 0 || size > XFS_SYMLINK_MAXLEN) + return __this_address; + + /* No NULLs in the target either. */ + if (memchr(sfp, 0, size - 1)) + return __this_address; + + /* We /did/ null-terminate the buffer, right? */ + if (*endp != 0) + return __this_address; + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6bd916bd35e2..5f17641f040f 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -34,6 +34,9 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" +#define _ALLOC true +#define _FREE false + /* * A buffer has a format structure overhead in the log in addition * to the data, so we need to take this into account when reserving @@ -132,43 +135,77 @@ xfs_calc_inode_res( } /* - * The free inode btree is a conditional feature and the log reservation - * requirements differ slightly from that of the traditional inode allocation - * btree. The finobt tracks records for inode chunks with at least one free - * inode. A record can be removed from the tree for an inode allocation - * or free and thus the finobt reservation is unconditional across: + * Inode btree record insertion/removal modifies the inode btree and free space + * btrees (since the inobt does not use the agfl). This requires the following + * reservation: * - * - inode allocation - * - inode free - * - inode chunk allocation + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size * - * The 'modify' param indicates to include the record modification scenario. The - * 'alloc' param indicates to include the reservation for free space btree - * modifications on behalf of finobt modifications. This is required only for - * transactions that do not already account for free space btree modifications. + * The caller must account for SB and AG header modifications, etc. + */ +STATIC uint +xfs_calc_inobt_res( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * The free inode btree is a conditional feature. The behavior differs slightly + * from that of the traditional inode btree in that the finobt tracks records + * for inode chunks with at least one free inode. A record can be removed from + * the tree during individual inode allocation. Therefore the finobt + * reservation is unconditional for both the inode chunk allocation and + * individual inode allocation (modify) cases. * - * the free inode btree: max depth * block size - * the allocation btrees: 2 trees * (max depth - 1) * block size - * the free inode btree entry: block size + * Behavior aside, the reservation for finobt modification is equivalent to the + * traditional inobt: cover a full finobt shape change plus block allocation. */ STATIC uint xfs_calc_finobt_res( - struct xfs_mount *mp, - int alloc, - int modify) + struct xfs_mount *mp) { - uint res; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return 0; - res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); - if (alloc) - res += xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); - if (modify) - res += (uint)XFS_FSB_TO_B(mp, 1); + return xfs_calc_inobt_res(mp); +} +/* + * Calculate the reservation required to allocate or free an inode chunk. This + * includes: + * + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the inode chunk: m_ialloc_blks * N + * + * The size N of the inode chunk reservation depends on whether it is for + * allocation or free and which type of create transaction is in use. An inode + * chunk free always invalidates the buffers and only requires reservation for + * headers (N == 0). An inode chunk allocation requires a chunk sized + * reservation on v4 and older superblocks to initialize the chunk. No chunk + * reservation is required for allocation on v5 supers, which use ordered + * buffers to initialize. + */ +STATIC uint +xfs_calc_inode_chunk_res( + struct xfs_mount *mp, + bool alloc) +{ + uint res, size = 0; + + res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (alloc) { + /* icreate tx uses ordered buffers */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return res; + size = XFS_FSB_TO_B(mp, 1); + } + + res += xfs_calc_buf_res(mp->m_ialloc_blks, size); return res; } @@ -232,8 +269,6 @@ xfs_calc_write_reservation( * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( @@ -245,12 +280,7 @@ xfs_calc_itruncate_reservation( XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(5, 0) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(2 + mp->m_ialloc_blks + - mp->m_in_maxlevels, 0))); + XFS_FSB_TO_B(mp, 1)))); } /* @@ -282,13 +312,14 @@ xfs_calc_rename_reservation( * For removing an inode from unlinked list at first, we can modify: * the agi hash list and counters: sector size * the on disk inode before ours in the agi hash list: inode cluster size + * the on disk inode in the agi hash list: inode cluster size */ STATIC uint xfs_calc_iunlink_remove_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); + 2 * max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); } /* @@ -320,13 +351,13 @@ xfs_calc_link_reservation( /* * For adding an inode to unlinked list we can modify: * the agi hash list: sector size - * the unlinked inode: inode size + * the on disk inode: inode cluster size */ STATIC uint xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) { return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - xfs_calc_inode_res(mp, 1); + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); } /* @@ -379,45 +410,16 @@ xfs_calc_create_resv_modify( xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + (uint)XFS_FSB_TO_B(mp, 1) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + - xfs_calc_finobt_res(mp, 1, 1); -} - -/* - * For create we can allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode blocks allocated: mp->m_ialloc_blks * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_create_resv_alloc( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - mp->m_sb.sb_sectsize + - xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -STATIC uint -__xfs_calc_create_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX(xfs_calc_create_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); + xfs_calc_finobt_res(mp); } /* * For icreate we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - * the finobt (record insertion) + * the inode chunk (allocation, optional init) + * the inobt (record insertion) + * the finobt (optional, record insertion) */ STATIC uint xfs_calc_icreate_resv_alloc( @@ -425,10 +427,9 @@ xfs_calc_icreate_resv_alloc( { return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_finobt_res(mp, 0, 0); + xfs_calc_inode_chunk_res(mp, _ALLOC) + + xfs_calc_inobt_res(mp) + + xfs_calc_finobt_res(mp); } STATIC uint @@ -440,26 +441,12 @@ xfs_calc_icreate_reservation(xfs_mount_t *mp) } STATIC uint -xfs_calc_create_reservation( - struct xfs_mount *mp) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return xfs_calc_icreate_reservation(mp); - return __xfs_calc_create_reservation(mp); - -} - -STATIC uint xfs_calc_create_tmpfile_reservation( struct xfs_mount *mp) { uint res = XFS_DQUOT_LOGRES(mp); - if (xfs_sb_version_hascrc(&mp->m_sb)) - res += xfs_calc_icreate_resv_alloc(mp); - else - res += xfs_calc_create_resv_alloc(mp); - + res += xfs_calc_icreate_resv_alloc(mp); return res + xfs_calc_iunlink_add_reservation(mp); } @@ -470,7 +457,7 @@ STATIC uint xfs_calc_mkdir_reservation( struct xfs_mount *mp) { - return xfs_calc_create_reservation(mp); + return xfs_calc_icreate_reservation(mp); } @@ -483,20 +470,24 @@ STATIC uint xfs_calc_symlink_reservation( struct xfs_mount *mp) { - return xfs_calc_create_reservation(mp) + + return xfs_calc_icreate_reservation(mp) + xfs_calc_buf_res(1, XFS_SYMLINK_MAXLEN); } /* * In freeing an inode we can modify: * the inode being freed: inode size - * the super block free inode counter: sector size - * the agi hash list and counters: sector size - * the inode btree entry: block size - * the on disk inode before ours in the agi hash list: inode cluster size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size + * the super block free inode counter, AGF and AGFL: sector size + * the on disk inode (agi unlinked list removal) + * the inode chunk (invalidated, headers only) + * the inode btree * the finobt (record insertion, removal or modification) + * + * Note that the inode chunk res. includes an allocfree res. for freeing of the + * inode chunk. This is technically extraneous because the inode chunk free is + * deferred (it occurs after a transaction roll). Include the extra reservation + * anyways since we've had reports of ifree transaction overruns due to too many + * agfl fixups during inode chunk frees. */ STATIC uint xfs_calc_ifree_reservation( @@ -504,15 +495,11 @@ xfs_calc_ifree_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + xfs_calc_iunlink_remove_reservation(mp) + - xfs_calc_buf_res(1, 0) + - xfs_calc_buf_res(2 + mp->m_ialloc_blks + - mp->m_in_maxlevels, 0) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_finobt_res(mp, 0, 1); + xfs_calc_inode_chunk_res(mp, _FREE) + + xfs_calc_inobt_res(mp) + + xfs_calc_finobt_res(mp); } /* @@ -842,7 +829,7 @@ xfs_trans_resv_calc( resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_create.tr_logres = xfs_calc_create_reservation(mp); + resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp); resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 2a9b4f9e93c6..fd975524f460 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -32,30 +32,17 @@ #include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" +#include "xfs_rmap.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" /* - * Set up scrub to check all the static metadata in each AG. - * This means the SB, AGF, AGI, and AGFL headers. + * Walk all the blocks in the AGFL. The fn function can return any negative + * error code or XFS_BTREE_QUERY_RANGE_ABORT. */ int -xfs_scrub_setup_ag_header( - struct xfs_scrub_context *sc, - struct xfs_inode *ip) -{ - struct xfs_mount *mp = sc->mp; - - if (sc->sm->sm_agno >= mp->m_sb.sb_agcount || - sc->sm->sm_ino || sc->sm->sm_gen) - return -EINVAL; - return xfs_scrub_setup_fs(sc, ip); -} - -/* Walk all the blocks in the AGFL. */ -int xfs_scrub_walk_agfl( struct xfs_scrub_context *sc, int (*fn)(struct xfs_scrub_context *, @@ -115,6 +102,36 @@ xfs_scrub_walk_agfl( /* Superblock */ +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_superblock_xref( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sc->sm->sm_agno; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_SB_BLOCK(mp); + + error = xfs_scrub_ag_init(sc, agno, &sc->sa); + if (!xfs_scrub_xref_process_error(sc, agno, agbno, &error)) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + + /* scrub teardown will take care of sc->sa for us */ +} + /* * Scrub the filesystem superblock. * @@ -143,6 +160,22 @@ xfs_scrub_superblock( error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + /* + * The superblock verifier can return several different error codes + * if it thinks the superblock doesn't look right. For a mount these + * would all get bounced back to userspace, but if we're here then the + * fs mounted successfully, which means that this secondary superblock + * is simply incorrect. Treat all these codes the same way we treat + * any corruption. + */ + switch (error) { + case -EINVAL: /* also -EWRONGFS */ + case -ENOSYS: + case -EFBIG: + error = -EFSCORRUPTED; + default: + break; + } if (!xfs_scrub_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) return error; @@ -387,11 +420,175 @@ xfs_scrub_superblock( BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) xfs_scrub_block_set_corrupt(sc, bp); + xfs_scrub_superblock_xref(sc, bp); + return error; } /* AGF */ +/* Tally freespace record lengths. */ +STATIC int +xfs_scrub_agf_record_bno_lengths( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv) +{ + xfs_extlen_t *blocks = priv; + + (*blocks) += rec->ar_blockcount; + return 0; +} + +/* Check agf_freeblks */ +static inline void +xfs_scrub_agf_xref_freeblks( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + xfs_extlen_t blocks = 0; + int error; + + if (!sc->sa.bno_cur) + return; + + error = xfs_alloc_query_all(sc->sa.bno_cur, + xfs_scrub_agf_record_bno_lengths, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + if (blocks != be32_to_cpu(agf->agf_freeblks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Cross reference the AGF with the cntbt (freespace by length btree) */ +static inline void +xfs_scrub_agf_xref_cntbt( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + xfs_agblock_t agbno; + xfs_extlen_t blocks; + int have; + int error; + + if (!sc->sa.cnt_cur) + return; + + /* Any freespace at all? */ + error = xfs_alloc_lookup_le(sc->sa.cnt_cur, 0, -1U, &have); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + if (!have) { + if (agf->agf_freeblks != be32_to_cpu(0)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); + return; + } + + /* Check agf_longest */ + error = xfs_alloc_get_rec(sc->sa.cnt_cur, &agbno, &blocks, &have); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + if (!have || blocks != be32_to_cpu(agf->agf_longest)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Check the btree block counts in the AGF against the btrees. */ +STATIC void +xfs_scrub_agf_xref_btreeblks( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_mount *mp = sc->mp; + xfs_agblock_t blocks; + xfs_agblock_t btreeblks; + int error; + + /* Check agf_rmap_blocks; set up for agf_btreeblks check */ + if (sc->sa.rmap_cur) { + error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + btreeblks = blocks - 1; + if (blocks != be32_to_cpu(agf->agf_rmap_blocks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); + } else { + btreeblks = 0; + } + + /* + * No rmap cursor; we can't xref if we have the rmapbt feature. + * We also can't do it if we're missing the free space btree cursors. + */ + if ((xfs_sb_version_hasrmapbt(&mp->m_sb) && !sc->sa.rmap_cur) || + !sc->sa.bno_cur || !sc->sa.cnt_cur) + return; + + /* Check agf_btreeblks */ + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + btreeblks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur)) + return; + btreeblks += blocks - 1; + + if (btreeblks != be32_to_cpu(agf->agf_btreeblks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Check agf_refcount_blocks against tree size */ +static inline void +xfs_scrub_agf_xref_refcblks( + struct xfs_scrub_context *sc) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + xfs_agblock_t blocks; + int error; + + if (!sc->sa.refc_cur) + return; + + error = xfs_btree_count_blocks(sc->sa.refc_cur, &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (blocks != be32_to_cpu(agf->agf_refcount_blocks)) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agf_xref( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGF_BLOCK(mp); + + error = xfs_scrub_ag_btcur_init(sc, &sc->sa); + if (error) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_agf_xref_freeblks(sc); + xfs_scrub_agf_xref_cntbt(sc); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_agf_xref_btreeblks(sc); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + xfs_scrub_agf_xref_refcblks(sc); + + /* scrub teardown will take care of sc->sa for us */ +} + /* Scrub the AGF. */ int xfs_scrub_agf( @@ -414,6 +611,7 @@ xfs_scrub_agf( &sc->sa.agf_bp, &sc->sa.agfl_bp); if (!xfs_scrub_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error)) goto out; + xfs_scrub_buffer_recheck(sc, sc->sa.agf_bp); agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); @@ -470,6 +668,7 @@ xfs_scrub_agf( if (agfl_count != 0 && fl_count != agfl_count) xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + xfs_scrub_agf_xref(sc); out: return error; } @@ -477,11 +676,28 @@ out: /* AGFL */ struct xfs_scrub_agfl_info { + struct xfs_owner_info oinfo; unsigned int sz_entries; unsigned int nr_entries; xfs_agblock_t *entries; }; +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agfl_block_xref( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + struct xfs_owner_info *oinfo) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); +} + /* Scrub an AGFL block. */ STATIC int xfs_scrub_agfl_block( @@ -499,6 +715,8 @@ xfs_scrub_agfl_block( else xfs_scrub_block_set_corrupt(sc, sc->sa.agfl_bp); + xfs_scrub_agfl_block_xref(sc, agbno, priv); + return 0; } @@ -513,6 +731,37 @@ xfs_scrub_agblock_cmp( return (int)*a - (int)*b; } +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agfl_xref( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGFL_BLOCK(mp); + + error = xfs_scrub_ag_btcur_init(sc, &sc->sa); + if (error) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + + /* + * Scrub teardown will take care of sc->sa for us. Leave sc->sa + * active so that the agfl block xref can use it too. + */ +} + /* Scrub the AGFL. */ int xfs_scrub_agfl( @@ -532,6 +781,12 @@ xfs_scrub_agfl( goto out; if (!sc->sa.agf_bp) return -EFSCORRUPTED; + xfs_scrub_buffer_recheck(sc, sc->sa.agfl_bp); + + xfs_scrub_agfl_xref(sc); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); @@ -548,6 +803,7 @@ xfs_scrub_agfl( } /* Check the blocks in the AGFL. */ + xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG); error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai); if (error) goto out_free; @@ -575,6 +831,56 @@ out: /* AGI */ +/* Check agi_count/agi_freecount */ +static inline void +xfs_scrub_agi_xref_icounts( + struct xfs_scrub_context *sc) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + xfs_agino_t icount; + xfs_agino_t freecount; + int error; + + if (!sc->sa.ino_cur) + return; + + error = xfs_ialloc_count_inodes(sc->sa.ino_cur, &icount, &freecount); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (be32_to_cpu(agi->agi_count) != icount || + be32_to_cpu(agi->agi_freecount) != freecount) + xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agi_bp); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_agi_xref( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agbno = XFS_AGI_BLOCK(mp); + + error = xfs_scrub_ag_btcur_init(sc, &sc->sa); + if (error) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1); + xfs_scrub_agi_xref_icounts(sc); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + + /* scrub teardown will take care of sc->sa for us */ +} + /* Scrub the AGI. */ int xfs_scrub_agi( @@ -598,6 +904,7 @@ xfs_scrub_agi( &sc->sa.agf_bp, &sc->sa.agfl_bp); if (!xfs_scrub_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error)) goto out; + xfs_scrub_buffer_recheck(sc, sc->sa.agi_bp); agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); @@ -653,6 +960,7 @@ xfs_scrub_agi( if (agi->agi_pad32 != cpu_to_be32(0)) xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + xfs_scrub_agi_xref(sc); out: return error; } diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 059663e13414..517c079d3f68 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -31,6 +31,7 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_alloc.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -49,6 +50,64 @@ xfs_scrub_setup_ag_allocbt( } /* Free space btree scrubber. */ +/* + * Ensure there's a corresponding cntbt/bnobt record matching this + * bnobt/cntbt record, respectively. + */ +STATIC void +xfs_scrub_allocbt_xref_other( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_btree_cur **pcur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int has_otherrec; + int error; + + if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) + pcur = &sc->sa.cnt_cur; + else + pcur = &sc->sa.bno_cur; + if (!*pcur) + return; + + error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec); + if (!xfs_scrub_should_check_xref(sc, &error, pcur)) + return; + if (!has_otherrec) { + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); + return; + } + + error = xfs_alloc_get_rec(*pcur, &fbno, &flen, &has_otherrec); + if (!xfs_scrub_should_check_xref(sc, &error, pcur)) + return; + if (!has_otherrec) { + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); + return; + } + + if (fbno != agbno || flen != len) + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_allocbt_xref( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_allocbt_xref_other(sc, agbno, len); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len); + xfs_scrub_xref_has_no_owner(sc, agbno, len); + xfs_scrub_xref_is_not_shared(sc, agbno, len); +} /* Scrub a bnobt/cntbt record. */ STATIC int @@ -70,6 +129,8 @@ xfs_scrub_allocbt_rec( !xfs_verify_agbno(mp, agno, bno + len - 1)) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + xfs_scrub_allocbt_xref(bs->sc, bno, len); + return error; } @@ -100,3 +161,23 @@ xfs_scrub_cntbt( { return xfs_scrub_allocbt(sc, XFS_BTNUM_CNT); } + +/* xref check that the extent is not free */ +void +xfs_scrub_xref_is_used_space( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + bool is_freesp; + int error; + + if (!sc->sa.bno_cur) + return; + + error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur)) + return; + if (is_freesp) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0); +} diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 42fec0bcd9e1..d00282130492 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -37,6 +37,7 @@ #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rmap.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -99,6 +100,201 @@ struct xfs_scrub_bmap_info { int whichfork; }; +/* Look for a corresponding rmap for this irec. */ +static inline bool +xfs_scrub_bmap_get_rmap( + struct xfs_scrub_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno, + uint64_t owner, + struct xfs_rmap_irec *rmap) +{ + xfs_fileoff_t offset; + unsigned int rflags = 0; + int has_rmap; + int error; + + if (info->whichfork == XFS_ATTR_FORK) + rflags |= XFS_RMAP_ATTR_FORK; + + /* + * CoW staging extents are owned (on disk) by the refcountbt, so + * their rmaps do not have offsets. + */ + if (info->whichfork == XFS_COW_FORK) + offset = 0; + else + offset = irec->br_startoff; + + /* + * If the caller thinks this could be a shared bmbt extent (IOWs, + * any data fork extent of a reflink inode) then we have to use the + * range rmap lookup to make sure we get the correct owner/offset. + */ + if (info->is_shared) { + error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); + if (!xfs_scrub_should_check_xref(info->sc, &error, + &info->sc->sa.rmap_cur)) + return false; + goto out; + } + + /* + * Otherwise, use the (faster) regular lookup. + */ + error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner, + offset, rflags, &has_rmap); + if (!xfs_scrub_should_check_xref(info->sc, &error, + &info->sc->sa.rmap_cur)) + return false; + if (!has_rmap) + goto out; + + error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap); + if (!xfs_scrub_should_check_xref(info->sc, &error, + &info->sc->sa.rmap_cur)) + return false; + +out: + if (!has_rmap) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return has_rmap; +} + +/* Make sure that we have rmapbt records for this extent. */ +STATIC void +xfs_scrub_bmap_xref_rmap( + struct xfs_scrub_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno) +{ + struct xfs_rmap_irec rmap; + unsigned long long rmap_end; + uint64_t owner; + + if (!info->sc->sa.rmap_cur) + return; + + if (info->whichfork == XFS_COW_FORK) + owner = XFS_RMAP_OWN_COW; + else + owner = info->sc->ip->i_ino; + + /* Find the rmap record for this irec. */ + if (!xfs_scrub_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + return; + + /* Check the rmap. */ + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (rmap.rm_startblock > agbno || + agbno + irec->br_blockcount > rmap_end) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * Check the logical offsets if applicable. CoW staging extents + * don't track logical offsets since the mappings only exist in + * memory. + */ + if (info->whichfork != XFS_COW_FORK) { + rmap_end = (unsigned long long)rmap.rm_offset + + rmap.rm_blockcount; + if (rmap.rm_offset > irec->br_startoff || + irec->br_startoff + irec->br_blockcount > rmap_end) + xfs_scrub_fblock_xref_set_corrupt(info->sc, + info->whichfork, irec->br_startoff); + } + + if (rmap.rm_owner != owner) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * Check for discrepancies between the unwritten flag in the irec and + * the rmap. Note that the (in-memory) CoW fork distinguishes between + * unwritten and written extents, but we don't track that in the rmap + * records because the blocks are owned (on-disk) by the refcountbt, + * which doesn't track unwritten state. + */ + if (owner != XFS_RMAP_OWN_COW && + irec->br_state == XFS_EXT_UNWRITTEN && + !(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (info->whichfork == XFS_ATTR_FORK && + !(rmap.rm_flags & XFS_RMAP_ATTR_FORK)) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) + xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + +/* Cross-reference a single rtdev extent record. */ +STATIC void +xfs_scrub_bmap_rt_extent_xref( + struct xfs_scrub_bmap_info *info, + struct xfs_inode *ip, + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *irec) +{ + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_rt_space(info->sc, irec->br_startblock, + irec->br_blockcount); +} + +/* Cross-reference a single datadev extent record. */ +STATIC void +xfs_scrub_bmap_extent_xref( + struct xfs_scrub_bmap_info *info, + struct xfs_inode *ip, + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t len; + int error; + + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); + len = irec->br_blockcount; + + error = xfs_scrub_ag_init(info->sc, agno, &info->sc->sa); + if (!xfs_scrub_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + return; + + xfs_scrub_xref_is_used_space(info->sc, agbno, len); + xfs_scrub_xref_is_not_inode_chunk(info->sc, agbno, len); + xfs_scrub_bmap_xref_rmap(info, irec, agbno); + switch (info->whichfork) { + case XFS_DATA_FORK: + if (xfs_is_reflink_inode(info->sc->ip)) + break; + /* fall through */ + case XFS_ATTR_FORK: + xfs_scrub_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + break; + case XFS_COW_FORK: + xfs_scrub_xref_is_cow_staging(info->sc, agbno, + irec->br_blockcount); + break; + } + + xfs_scrub_ag_free(info->sc, &info->sc->sa); +} + /* Scrub a single extent record. */ STATIC int xfs_scrub_bmap_extent( @@ -109,6 +305,7 @@ xfs_scrub_bmap_extent( { struct xfs_mount *mp = info->sc->mp; struct xfs_buf *bp = NULL; + xfs_filblks_t end; int error = 0; if (cur) @@ -136,19 +333,23 @@ xfs_scrub_bmap_extent( irec->br_startoff); /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > MAXEXTLEN) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock) xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + end = irec->br_startblock + irec->br_blockcount - 1; if (info->is_rt && (!xfs_verify_rtbno(mp, irec->br_startblock) || - !xfs_verify_rtbno(mp, irec->br_startblock + - irec->br_blockcount - 1))) + !xfs_verify_rtbno(mp, end))) xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (!info->is_rt && (!xfs_verify_fsbno(mp, irec->br_startblock) || - !xfs_verify_fsbno(mp, irec->br_startblock + - irec->br_blockcount - 1))) + !xfs_verify_fsbno(mp, end) || + XFS_FSB_TO_AGNO(mp, irec->br_startblock) != + XFS_FSB_TO_AGNO(mp, end))) xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -158,6 +359,11 @@ xfs_scrub_bmap_extent( xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + if (info->is_rt) + xfs_scrub_bmap_rt_extent_xref(info, ip, cur, irec); + else + xfs_scrub_bmap_extent_xref(info, ip, cur, irec); + info->lastoff = irec->br_startoff + irec->br_blockcount; return error; } @@ -235,7 +441,6 @@ xfs_scrub_bmap( struct xfs_ifork *ifp; xfs_fileoff_t endoff; struct xfs_iext_cursor icur; - bool found; int error = 0; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -314,9 +519,7 @@ xfs_scrub_bmap( /* Scrub extent records. */ info.lastoff = 0; ifp = XFS_IFORK_PTR(ip, whichfork); - for (found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &irec); - found != 0; - found = xfs_iext_next_extent(ifp, &icur, &irec)) { + for_each_xfs_iext(ifp, &icur, &irec) { if (xfs_scrub_should_terminate(sc, &error)) break; if (isnullstartblock(irec.br_startblock)) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index df0766132ace..54218168c8f9 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -42,12 +42,14 @@ * Check for btree operation errors. See the section about handling * operational errors in common.c. */ -bool -xfs_scrub_btree_process_error( +static bool +__xfs_scrub_btree_process_error( struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, int level, - int *error) + int *error, + __u32 errflag, + void *ret_ip) { if (*error == 0) return true; @@ -60,36 +62,80 @@ xfs_scrub_btree_process_error( case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; *error = 0; /* fall through */ default: if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) trace_xfs_scrub_ifork_btree_op_error(sc, cur, level, - *error, __return_address); + *error, ret_ip); else trace_xfs_scrub_btree_op_error(sc, cur, level, - *error, __return_address); + *error, ret_ip); break; } return false; } +bool +xfs_scrub_btree_process_error( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + return __xfs_scrub_btree_process_error(sc, cur, level, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xfs_scrub_btree_xref_process_error( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + return __xfs_scrub_btree_process_error(sc, cur, level, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + /* Record btree block corruption. */ -void -xfs_scrub_btree_set_corrupt( +static void +__xfs_scrub_btree_set_corrupt( struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, - int level) + int level, + __u32 errflag, + void *ret_ip) { - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) trace_xfs_scrub_ifork_btree_error(sc, cur, level, - __return_address); + ret_ip); else trace_xfs_scrub_btree_error(sc, cur, level, - __return_address); + ret_ip); +} + +void +xfs_scrub_btree_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xfs_scrub_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT, + __return_address); +} + +void +xfs_scrub_btree_xref_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xfs_scrub_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT, + __return_address); } /* @@ -268,6 +314,8 @@ xfs_scrub_btree_block_check_sibling( pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock); if (!xfs_scrub_btree_ptr_ok(bs, level + 1, pp)) goto out; + if (pbp) + xfs_scrub_buffer_recheck(bs->sc, pbp); if (xfs_btree_diff_two_ptrs(cur, pp, sibling)) xfs_scrub_btree_set_corrupt(bs->sc, cur, level); @@ -315,6 +363,97 @@ out: return error; } +struct check_owner { + struct list_head list; + xfs_daddr_t daddr; + int level; +}; + +/* + * Make sure this btree block isn't in the free list and that there's + * an rmap record for it. + */ +STATIC int +xfs_scrub_btree_check_block_owner( + struct xfs_scrub_btree *bs, + int level, + xfs_daddr_t daddr) +{ + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_btnum_t btnum; + bool init_sa; + int error = 0; + + if (!bs->cur) + return 0; + + btnum = bs->cur->bc_btnum; + agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); + agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); + + init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; + if (init_sa) { + error = xfs_scrub_ag_init(bs->sc, agno, &bs->sc->sa); + if (!xfs_scrub_btree_xref_process_error(bs->sc, bs->cur, + level, &error)) + return error; + } + + xfs_scrub_xref_is_used_space(bs->sc, agbno, 1); + /* + * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we + * have to nullify it (to shut down further block owner checks) if + * self-xref encounters problems. + */ + if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) + bs->cur = NULL; + + xfs_scrub_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo); + if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) + bs->cur = NULL; + + if (init_sa) + xfs_scrub_ag_free(bs->sc, &bs->sc->sa); + + return error; +} + +/* Check the owner of a btree block. */ +STATIC int +xfs_scrub_btree_check_owner( + struct xfs_scrub_btree *bs, + int level, + struct xfs_buf *bp) +{ + struct xfs_btree_cur *cur = bs->cur; + struct check_owner *co; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL) + return 0; + + /* + * We want to cross-reference each btree block with the bnobt + * and the rmapbt. We cannot cross-reference the bnobt or + * rmapbt while scanning the bnobt or rmapbt, respectively, + * because we cannot alter the cursor and we'd prefer not to + * duplicate cursors. Therefore, save the buffer daddr for + * later scanning. + */ + if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + co = kmem_alloc(sizeof(struct check_owner), + KM_MAYFAIL | KM_NOFS); + if (!co) + return -ENOMEM; + co->level = level; + co->daddr = XFS_BUF_ADDR(bp); + list_add_tail(&co->list, &bs->to_check); + return 0; + } + + return xfs_scrub_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp)); +} + /* * Grab and scrub a btree block given a btree pointer. Returns block * and buffer pointers (if applicable) if they're ok to use. @@ -349,6 +488,16 @@ xfs_scrub_btree_get_block( xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } + if (*pbp) + xfs_scrub_buffer_recheck(bs->sc, *pbp); + + /* + * Check the block's owner; this function absorbs error codes + * for us. + */ + error = xfs_scrub_btree_check_owner(bs, level, *pbp); + if (error) + return error; /* * Check the block's siblings; this function absorbs error codes @@ -421,6 +570,8 @@ xfs_scrub_btree( struct xfs_btree_block *block; int level; struct xfs_buf *bp; + struct check_owner *co; + struct check_owner *n; int i; int error = 0; @@ -512,5 +663,14 @@ xfs_scrub_btree( } out: + /* Process deferred owner checks on btree blocks. */ + list_for_each_entry_safe(co, n, &bs.to_check, list) { + if (!error && bs.cur) + error = xfs_scrub_btree_check_block_owner(&bs, + co->level, co->daddr); + list_del(&co->list); + kmem_free(co); + } + return error; } diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index 4de825a626d1..e2b868ede70b 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -26,10 +26,19 @@ bool xfs_scrub_btree_process_error(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, int level, int *error); +/* Check for btree xref operation errors. */ +bool xfs_scrub_btree_xref_process_error(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level, + int *error); + /* Check for btree corruption. */ void xfs_scrub_btree_set_corrupt(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, int level); +/* Check for btree xref discrepancies. */ +void xfs_scrub_btree_xref_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level); + struct xfs_scrub_btree; typedef int (*xfs_scrub_btree_rec_fn)( struct xfs_scrub_btree *bs, diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index ac95fe911d96..8033ab9d8f47 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -78,12 +78,14 @@ */ /* Check for operational errors. */ -bool -xfs_scrub_process_error( +static bool +__xfs_scrub_process_error( struct xfs_scrub_context *sc, xfs_agnumber_t agno, xfs_agblock_t bno, - int *error) + int *error, + __u32 errflag, + void *ret_ip) { switch (*error) { case 0: @@ -95,24 +97,48 @@ xfs_scrub_process_error( case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; *error = 0; /* fall through */ default: trace_xfs_scrub_op_error(sc, agno, bno, *error, - __return_address); + ret_ip); break; } return false; } -/* Check for operational errors for a file offset. */ bool -xfs_scrub_fblock_process_error( +xfs_scrub_process_error( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + return __xfs_scrub_process_error(sc, agno, bno, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xfs_scrub_xref_process_error( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + return __xfs_scrub_process_error(sc, agno, bno, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + +/* Check for operational errors for a file offset. */ +static bool +__xfs_scrub_fblock_process_error( struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset, - int *error) + int *error, + __u32 errflag, + void *ret_ip) { switch (*error) { case 0: @@ -124,17 +150,39 @@ xfs_scrub_fblock_process_error( case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= errflag; *error = 0; /* fall through */ default: trace_xfs_scrub_file_op_error(sc, whichfork, offset, *error, - __return_address); + ret_ip); break; } return false; } +bool +xfs_scrub_fblock_process_error( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + return __xfs_scrub_fblock_process_error(sc, whichfork, offset, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool +xfs_scrub_fblock_xref_process_error( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + return __xfs_scrub_fblock_process_error(sc, whichfork, offset, error, + XFS_SCRUB_OFLAG_XFAIL, __return_address); +} + /* * Handling scrub corruption/optimization/warning checks. * @@ -183,6 +231,16 @@ xfs_scrub_block_set_corrupt( trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address); } +/* Record a corruption while cross-referencing. */ +void +xfs_scrub_block_xref_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address); +} + /* * Record a corrupt inode. The trace data will include the block given * by bp if bp is given; otherwise it will use the block location of the @@ -198,6 +256,17 @@ xfs_scrub_ino_set_corrupt( trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); } +/* Record a corruption while cross-referencing with an inode. */ +void +xfs_scrub_ino_xref_set_corrupt( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); +} + /* Record corruption in a block indexed by a file fork. */ void xfs_scrub_fblock_set_corrupt( @@ -209,6 +278,17 @@ xfs_scrub_fblock_set_corrupt( trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address); } +/* Record a corruption while cross-referencing a fork block. */ +void +xfs_scrub_fblock_xref_set_corrupt( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; + trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address); +} + /* * Warn about inodes that need administrative review but is not * incorrect. @@ -245,6 +325,59 @@ xfs_scrub_set_incomplete( } /* + * rmap scrubbing -- compute the number of blocks with a given owner, + * at least according to the reverse mapping data. + */ + +struct xfs_scrub_rmap_ownedby_info { + struct xfs_owner_info *oinfo; + xfs_filblks_t *blocks; +}; + +STATIC int +xfs_scrub_count_rmap_ownedby_irec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_scrub_rmap_ownedby_info *sroi = priv; + bool irec_attr; + bool oinfo_attr; + + irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK; + oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK; + + if (rec->rm_owner != sroi->oinfo->oi_owner) + return 0; + + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr) + (*sroi->blocks) += rec->rm_blockcount; + + return 0; +} + +/* + * Calculate the number of blocks the rmap thinks are owned by something. + * The caller should pass us an rmapbt cursor. + */ +int +xfs_scrub_count_rmap_ownedby_ag( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + struct xfs_owner_info *oinfo, + xfs_filblks_t *blocks) +{ + struct xfs_scrub_rmap_ownedby_info sroi; + + sroi.oinfo = oinfo; + *blocks = 0; + sroi.blocks = blocks; + + return xfs_rmap_query_all(cur, xfs_scrub_count_rmap_ownedby_irec, + &sroi); +} + +/* * AG scrubbing * * These helpers facilitate locking an allocation group's header @@ -302,7 +435,7 @@ xfs_scrub_ag_read_headers( error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) goto out; - + error = 0; out: return error; } @@ -472,7 +605,7 @@ xfs_scrub_setup_ag_btree( return error; } - error = xfs_scrub_setup_ag_header(sc, ip); + error = xfs_scrub_setup_fs(sc, ip); if (error) return error; @@ -503,18 +636,11 @@ xfs_scrub_get_inode( struct xfs_scrub_context *sc, struct xfs_inode *ip_in) { + struct xfs_imap imap; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = NULL; int error; - /* - * If userspace passed us an AG number or a generation number - * without an inode number, they haven't got a clue so bail out - * immediately. - */ - if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) - return -EINVAL; - /* We want to scan the inode we already had opened. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { sc->ip = ip_in; @@ -526,10 +652,33 @@ xfs_scrub_get_inode( return -ENOENT; error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); - if (error == -ENOENT || error == -EINVAL) { - /* inode doesn't exist... */ - return -ENOENT; - } else if (error) { + switch (error) { + case -ENOENT: + /* Inode doesn't exist, just bail out. */ + return error; + case 0: + /* Got an inode, continue. */ + break; + case -EINVAL: + /* + * -EINVAL with IGET_UNTRUSTED could mean one of several + * things: userspace gave us an inode number that doesn't + * correspond to fs space, or doesn't have an inobt entry; + * or it could simply mean that the inode buffer failed the + * read verifiers. + * + * Try just the inode mapping lookup -- if it succeeds, then + * the inode buffer verifier failed and something needs fixing. + * Otherwise, we really couldn't find it so tell userspace + * that it no longer exists. + */ + error = xfs_imap(sc->mp, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); + if (error) + return -ENOENT; + error = -EFSCORRUPTED; + /* fall through */ + default: trace_xfs_scrub_op_error(sc, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), @@ -572,3 +721,61 @@ out: /* scrub teardown will unlock and release the inode for us */ return error; } + +/* + * Predicate that decides if we need to evaluate the cross-reference check. + * If there was an error accessing the cross-reference btree, just delete + * the cursor and skip the check. + */ +bool +xfs_scrub_should_check_xref( + struct xfs_scrub_context *sc, + int *error, + struct xfs_btree_cur **curpp) +{ + if (*error == 0) + return true; + + if (curpp) { + /* If we've already given up on xref, just bail out. */ + if (!*curpp) + return false; + + /* xref error, delete cursor and bail out. */ + xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR); + *curpp = NULL; + } + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; + trace_xfs_scrub_xref_error(sc, *error, __return_address); + + /* + * Errors encountered during cross-referencing with another + * data structure should not cause this scrubber to abort. + */ + *error = 0; + return false; +} + +/* Run the structure verifiers on in-memory buffers to detect bad memory. */ +void +xfs_scrub_buffer_recheck( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (bp->b_ops == NULL) { + xfs_scrub_block_set_corrupt(sc, bp); + return; + } + if (bp->b_ops->verify_struct == NULL) { + xfs_scrub_set_incomplete(sc); + return; + } + fa = bp->b_ops->verify_struct(bp); + if (!fa) + return; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_block_error(sc, bp->b_bn, fa); +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 5c043855570e..ddb65d22c76a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -56,6 +56,11 @@ bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno, bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset, int *error); +bool xfs_scrub_xref_process_error(struct xfs_scrub_context *sc, + xfs_agnumber_t agno, xfs_agblock_t bno, int *error); +bool xfs_scrub_fblock_xref_process_error(struct xfs_scrub_context *sc, + int whichfork, xfs_fileoff_t offset, int *error); + void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc, struct xfs_buf *bp); void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino, @@ -68,6 +73,13 @@ void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); +void xfs_scrub_block_xref_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_buf *bp); +void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); +void xfs_scrub_fblock_xref_set_corrupt(struct xfs_scrub_context *sc, + int whichfork, xfs_fileoff_t offset); + void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino, struct xfs_buf *bp); void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, @@ -76,10 +88,12 @@ void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc); int xfs_scrub_checkpoint_log(struct xfs_mount *mp); +/* Are we set up for a cross-referencing check? */ +bool xfs_scrub_should_check_xref(struct xfs_scrub_context *sc, int *error, + struct xfs_btree_cur **curpp); + /* Setup functions */ int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip); -int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc, - struct xfs_inode *ip); int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc, struct xfs_inode *ip); int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc, @@ -134,11 +148,16 @@ int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno, void *), void *priv); +int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + struct xfs_owner_info *oinfo, + xfs_filblks_t *blocks); int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc, struct xfs_inode *ip, bool force_log); int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in); int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc, struct xfs_inode *ip, unsigned int resblks); +void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index d94edd93cba8..bffdb7dc09bf 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -233,11 +233,28 @@ xfs_scrub_da_btree_write_verify( return; } } +static void * +xfs_scrub_da_btree_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + return bp->b_ops->verify_struct(bp); + default: + bp->b_ops = &xfs_da3_node_buf_ops; + return bp->b_ops->verify_struct(bp); + } +} static const struct xfs_buf_ops xfs_scrub_da_btree_buf_ops = { .name = "xfs_scrub_da_btree", .verify_read = xfs_scrub_da_btree_read_verify, .verify_write = xfs_scrub_da_btree_write_verify, + .verify_struct = xfs_scrub_da_btree_verify, }; /* Check a block's sibling. */ @@ -276,6 +293,9 @@ xfs_scrub_da_btree_block_check_sibling( xfs_scrub_da_set_corrupt(ds, level); return error; } + if (ds->state->altpath.blk[level].bp) + xfs_scrub_buffer_recheck(ds->sc, + ds->state->altpath.blk[level].bp); /* Compare upper level pointer to sibling pointer. */ if (ds->state->altpath.blk[level].blkno != sibling) @@ -358,6 +378,8 @@ xfs_scrub_da_btree_block( &xfs_scrub_da_btree_buf_ops); if (!xfs_scrub_da_process_error(ds, level, &error)) goto out_nobuf; + if (blk->bp) + xfs_scrub_buffer_recheck(ds->sc, blk->bp); /* * We didn't find a dir btree root block, which means that diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 69e1efdd4019..50b6a26b0299 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -92,7 +92,7 @@ xfs_scrub_dir_check_ftype( * inodes can trigger immediate inactive cleanup of the inode. */ error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip); - if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, + if (!xfs_scrub_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) goto out; @@ -200,6 +200,7 @@ xfs_scrub_dir_rec( struct xfs_inode *dp = ds->dargs.dp; struct xfs_dir2_data_entry *dent; struct xfs_buf *bp; + char *p, *endp; xfs_ino_t ino; xfs_dablk_t rec_bno; xfs_dir2_db_t db; @@ -237,9 +238,37 @@ xfs_scrub_dir_rec( xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); goto out; } + xfs_scrub_buffer_recheck(ds->sc, bp); - /* Retrieve the entry, sanity check it, and compare hashes. */ dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); + + /* Make sure we got a real directory entry. */ + p = (char *)mp->m_dir_inode_ops->data_entry_p(bp->b_addr); + endp = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); + if (!endp) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + while (p < endp) { + struct xfs_dir2_data_entry *dep; + struct xfs_dir2_data_unused *dup; + + dup = (struct xfs_dir2_data_unused *)p; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + p += be16_to_cpu(dup->length); + continue; + } + dep = (struct xfs_dir2_data_entry *)p; + if (dep == dent) + break; + p += mp->m_dir_inode_ops->data_entsize(dep->namelen); + } + if (p >= endp) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + + /* Retrieve the entry, sanity check it, and compare hashes. */ ino = be64_to_cpu(dent->inumber); hash = be32_to_cpu(ent->hashval); tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent)); @@ -324,6 +353,7 @@ xfs_scrub_directory_data_bestfree( } if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; + xfs_scrub_buffer_recheck(sc, bp); /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ @@ -361,13 +391,7 @@ xfs_scrub_directory_data_bestfree( /* Make sure the bestfrees are actually the best free spaces. */ ptr = (char *)d_ops->data_entry_p(bp->b_addr); - if (is_block) { - struct xfs_dir2_block_tail *btp; - - btp = xfs_dir2_block_tail_p(mp->m_dir_geo, bp->b_addr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); - } else - endptr = (char *)bp->b_addr + BBTOB(bp->b_length); + endptr = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); /* Iterate the entries, stopping when we hit or go past the end. */ while (ptr < endptr) { @@ -474,6 +498,7 @@ xfs_scrub_directory_leaf1_bestfree( error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; + xfs_scrub_buffer_recheck(sc, bp); leaf = bp->b_addr; d_ops->leaf_hdr_from_disk(&leafhdr, leaf); @@ -559,6 +584,7 @@ xfs_scrub_directory_free_bestfree( error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; + xfs_scrub_buffer_recheck(sc, bp); if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 496d6f2fbb9e..63ab3f98430d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -58,6 +58,56 @@ xfs_scrub_setup_ag_iallocbt( /* Inode btree scrubber. */ +/* + * If we're checking the finobt, cross-reference with the inobt. + * Otherwise we're checking the inobt; if there is an finobt, make sure + * we have a record or not depending on freecount. + */ +static inline void +xfs_scrub_iallocbt_chunk_xref_other( + struct xfs_scrub_context *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino) +{ + struct xfs_btree_cur **pcur; + bool has_irec; + int error; + + if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) + pcur = &sc->sa.ino_cur; + else + pcur = &sc->sa.fino_cur; + if (!(*pcur)) + return; + error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec); + if (!xfs_scrub_should_check_xref(sc, &error, pcur)) + return; + if (((irec->ir_freecount > 0 && !has_irec) || + (irec->ir_freecount == 0 && has_irec))) + xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_iallocbt_chunk_xref( + struct xfs_scrub_context *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_owner_info oinfo; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, len); + xfs_scrub_iallocbt_chunk_xref_other(sc, irec, agino); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + xfs_scrub_xref_is_owned_by(sc, agbno, len, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, len); +} + /* Is this chunk worth checking? */ STATIC bool xfs_scrub_iallocbt_chunk( @@ -76,6 +126,8 @@ xfs_scrub_iallocbt_chunk( !xfs_verify_agbno(mp, agno, bno + len - 1)) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + xfs_scrub_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); + return true; } @@ -190,8 +242,14 @@ xfs_scrub_iallocbt_check_freemask( } /* If any part of this is a hole, skip it. */ - if (ir_holemask) + if (ir_holemask) { + xfs_scrub_xref_is_not_owned_by(bs->sc, agbno, + blks_per_cluster, &oinfo); continue; + } + + xfs_scrub_xref_is_owned_by(bs->sc, agbno, blks_per_cluster, + &oinfo); /* Grab the inode cluster buffer. */ imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, @@ -227,6 +285,7 @@ xfs_scrub_iallocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; + xfs_filblks_t *inode_blocks = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; @@ -264,6 +323,9 @@ xfs_scrub_iallocbt_rec( (agbno & (xfs_icluster_size_fsb(mp) - 1))) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + *inode_blocks += XFS_B_TO_FSB(mp, + irec.ir_count * mp->m_sb.sb_inodesize); + /* Handle non-sparse inodes */ if (!xfs_inobt_issparse(irec.ir_holemask)) { len = XFS_B_TO_FSB(mp, @@ -308,6 +370,72 @@ out: return error; } +/* + * Make sure the inode btrees are as large as the rmap thinks they are. + * Don't bother if we're missing btree cursors, as we're already corrupt. + */ +STATIC void +xfs_scrub_iallocbt_xref_rmap_btreeblks( + struct xfs_scrub_context *sc, + int which) +{ + struct xfs_owner_info oinfo; + xfs_filblks_t blocks; + xfs_extlen_t inobt_blocks = 0; + xfs_extlen_t finobt_blocks = 0; + int error; + + if (!sc->sa.ino_cur || !sc->sa.rmap_cur || + (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur)) + return; + + /* Check that we saw as many inobt blocks as the rmap says. */ + error = xfs_btree_count_blocks(sc->sa.ino_cur, &inobt_blocks); + if (!xfs_scrub_process_error(sc, 0, 0, &error)) + return; + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &finobt_blocks); + if (!xfs_scrub_process_error(sc, 0, 0, &error)) + return; + } + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != inobt_blocks + finobt_blocks) + xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0); +} + +/* + * Make sure that the inobt records point to the same number of blocks as + * the rmap says are owned by inodes. + */ +STATIC void +xfs_scrub_iallocbt_xref_rmap_inodes( + struct xfs_scrub_context *sc, + int which, + xfs_filblks_t inode_blocks) +{ + struct xfs_owner_info oinfo; + xfs_filblks_t blocks; + int error; + + if (!sc->sa.rmap_cur) + return; + + /* Check that we saw as many inode blocks as the rmap knows about. */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != inode_blocks) + xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0); +} + /* Scrub the inode btrees for some AG. */ STATIC int xfs_scrub_iallocbt( @@ -316,10 +444,29 @@ xfs_scrub_iallocbt( { struct xfs_btree_cur *cur; struct xfs_owner_info oinfo; + xfs_filblks_t inode_blocks = 0; + int error; xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; - return xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, NULL); + error = xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, + &inode_blocks); + if (error) + return error; + + xfs_scrub_iallocbt_xref_rmap_btreeblks(sc, which); + + /* + * If we're scrubbing the inode btree, inode_blocks is the number of + * blocks pointed to by all the inode chunk records. Therefore, we + * should compare to the number of inode chunk blocks that the rmap + * knows about. We can't do this for the finobt since it only points + * to inode chunks with free inodes. + */ + if (which == XFS_BTNUM_INO) + xfs_scrub_iallocbt_xref_rmap_inodes(sc, which, inode_blocks); + + return error; } int @@ -335,3 +482,46 @@ xfs_scrub_finobt( { return xfs_scrub_iallocbt(sc, XFS_BTNUM_FINO); } + +/* See if an inode btree has (or doesn't have) an inode chunk record. */ +static inline void +xfs_scrub_xref_inode_check( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len, + struct xfs_btree_cur **icur, + bool should_have_inodes) +{ + bool has_inodes; + int error; + + if (!(*icur)) + return; + + error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); + if (!xfs_scrub_should_check_xref(sc, &error, icur)) + return; + if (has_inodes != should_have_inodes) + xfs_scrub_btree_xref_set_corrupt(sc, *icur, 0); +} + +/* xref check that the extent is not covered by inodes */ +void +xfs_scrub_xref_is_not_inode_chunk( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false); + xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false); +} + +/* xref check that the extent is covered by inodes */ +void +xfs_scrub_xref_is_inode_chunk( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true); +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index f120fb20452f..21297bef8df1 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -36,9 +36,13 @@ #include "xfs_ialloc.h" #include "xfs_da_format.h" #include "xfs_reflink.h" +#include "xfs_rmap.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/btree.h" #include "scrub/trace.h" /* @@ -64,7 +68,7 @@ xfs_scrub_setup_inode( break; case -EFSCORRUPTED: case -EFSBADCRC: - return 0; + return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); default: return error; } @@ -392,6 +396,14 @@ xfs_scrub_dinode( break; } + /* di_[amc]time.nsec */ + if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + /* * di_size. xfs_dinode_verify checks for things that screw up * the VFS such as the upper bit being set and zero-length @@ -495,6 +507,8 @@ xfs_scrub_dinode( } if (dip->di_version >= 3) { + if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC) + xfs_scrub_ino_set_corrupt(sc, ino, bp); xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2); xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags, flags2); @@ -546,7 +560,7 @@ xfs_scrub_inode_map_raw( */ bp->b_ops = &xfs_inode_buf_ops; dip = xfs_buf_offset(bp, imap.im_boffset); - if (!xfs_dinode_verify(mp, ino, dip) || + if (xfs_dinode_verify(mp, ino, dip) != NULL || !xfs_dinode_good_version(mp, dip->di_version)) { xfs_scrub_ino_set_corrupt(sc, ino, bp); goto out_buf; @@ -567,18 +581,155 @@ out_buf: return error; } +/* + * Make sure the finobt doesn't think this inode is free. + * We don't have to check the inobt ourselves because we got the inode via + * IGET_UNTRUSTED, which checks the inobt for us. + */ +static void +xfs_scrub_inode_xref_finobt( + struct xfs_scrub_context *sc, + xfs_ino_t ino) +{ + struct xfs_inobt_rec_incore rec; + xfs_agino_t agino; + int has_record; + int error; + + if (!sc->sa.fino_cur) + return; + + agino = XFS_INO_TO_AGINO(sc->mp, ino); + + /* + * Try to get the finobt record. If we can't get it, then we're + * in good shape. + */ + error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE, + &has_record); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.fino_cur) || + !has_record) + return; + + error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.fino_cur) || + !has_record) + return; + + /* + * Otherwise, make sure this record either doesn't cover this inode, + * or that it does but it's marked present. + */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + return; + + if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0); +} + +/* Cross reference the inode fields with the forks. */ +STATIC void +xfs_scrub_inode_xref_bmap( + struct xfs_scrub_context *sc, + struct xfs_dinode *dip) +{ + xfs_extnum_t nextents; + xfs_filblks_t count; + xfs_filblks_t acount; + int error; + + /* Walk all the extents to check nextents/naextents/nblocks. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (!xfs_scrub_should_check_xref(sc, &error, NULL)) + return; + if (nextents < be32_to_cpu(dip->di_nextents)) + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (!xfs_scrub_should_check_xref(sc, &error, NULL)) + return; + if (nextents != be16_to_cpu(dip->di_anextents)) + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + + /* Check nblocks against the inode. */ + if (count + acount != be64_to_cpu(dip->di_nblocks)) + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_inode_xref( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_dinode *dip) +{ + struct xfs_owner_info oinfo; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + agno = XFS_INO_TO_AGNO(sc->mp, ino); + agbno = XFS_INO_TO_AGBNO(sc->mp, ino); + + error = xfs_scrub_ag_init(sc, agno, &sc->sa); + if (!xfs_scrub_xref_process_error(sc, agno, agbno, &error)) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, 1); + xfs_scrub_inode_xref_finobt(sc, ino); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo); + xfs_scrub_xref_is_not_shared(sc, agbno, 1); + xfs_scrub_inode_xref_bmap(sc, dip); + + xfs_scrub_ag_free(sc, &sc->sa); +} + +/* + * If the reflink iflag disagrees with a scan for shared data fork extents, + * either flag an error (shared extents w/ no flag) or a preen (flag set w/o + * any shared extents). We already checked for reflink iflag set on a non + * reflink filesystem. + */ +static void +xfs_scrub_inode_check_reflink_iflag( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + bool has_shared; + int error; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return; + + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &has_shared); + if (!xfs_scrub_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + return; + if (xfs_is_reflink_inode(sc->ip) && !has_shared) + xfs_scrub_ino_set_preen(sc, ino, bp); + else if (!xfs_is_reflink_inode(sc->ip) && has_shared) + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + /* Scrub an inode. */ int xfs_scrub_inode( struct xfs_scrub_context *sc) { struct xfs_dinode di; - struct xfs_mount *mp = sc->mp; struct xfs_buf *bp = NULL; struct xfs_dinode *dip; xfs_ino_t ino; - - bool has_shared; int error = 0; /* Did we get the in-core inode, or are we doing this manually? */ @@ -603,19 +754,14 @@ xfs_scrub_inode( goto out; /* - * Does this inode have the reflink flag set but no shared extents? - * Set the preening flag if this is the case. + * Look for discrepancies between file's data blocks and the reflink + * iflag. We already checked the iflag against the file mode when + * we scrubbed the dinode. */ - if (xfs_is_reflink_inode(sc->ip)) { - error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, - &has_shared); - if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), - XFS_INO_TO_AGBNO(mp, ino), &error)) - goto out; - if (!has_shared) - xfs_scrub_ino_set_preen(sc, ino, bp); - } + if (S_ISREG(VFS_I(sc->ip)->i_mode)) + xfs_scrub_inode_check_reflink_iflag(sc, ino, bp); + xfs_scrub_inode_xref(sc, ino, dip); out: if (bp) xfs_trans_brelse(sc->tp, bp); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 63a25334fc83..0d3851410c74 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -169,9 +169,9 @@ xfs_scrub_parent_validate( * immediate inactive cleanup of the inode. */ error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; - if (dp == sc->ip) { + if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } @@ -185,7 +185,7 @@ xfs_scrub_parent_validate( */ if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) { error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out_unlock; if (nlink != expected_nlink) @@ -205,7 +205,7 @@ xfs_scrub_parent_validate( /* Go looking for our dentry. */ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out_unlock; /* Drop the parent lock, relock this inode. */ diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 3d9037eceaf1..51daa4ae2627 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -67,13 +67,6 @@ xfs_scrub_setup_quota( { uint dqtype; - /* - * If userspace gave us an AG number or inode data, they don't - * know what they're doing. Get out. - */ - if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) - return -EINVAL; - dqtype = xfs_scrub_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2f88a8d44bd0..400f1561cd3d 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -31,6 +31,7 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -50,6 +51,307 @@ xfs_scrub_setup_ag_refcountbt( /* Reference count btree scrubber. */ +/* + * Confirming Reference Counts via Reverse Mappings + * + * We want to count the reverse mappings overlapping a refcount record + * (bno, len, refcount), allowing for the possibility that some of the + * overlap may come from smaller adjoining reverse mappings, while some + * comes from single extents which overlap the range entirely. The + * outer loop is as follows: + * + * 1. For all reverse mappings overlapping the refcount extent, + * a. If a given rmap completely overlaps, mark it as seen. + * b. Otherwise, record the fragment (in agbno order) for later + * processing. + * + * Once we've seen all the rmaps, we know that for all blocks in the + * refcount record we want to find $refcount owners and we've already + * visited $seen extents that overlap all the blocks. Therefore, we + * need to find ($refcount - $seen) owners for every block in the + * extent; call that quantity $target_nr. Proceed as follows: + * + * 2. Pull the first $target_nr fragments from the list; all of them + * should start at or before the start of the extent. + * Call this subset of fragments the working set. + * 3. Until there are no more unprocessed fragments, + * a. Find the shortest fragments in the set and remove them. + * b. Note the block number of the end of these fragments. + * c. Pull the same number of fragments from the list. All of these + * fragments should start at the block number recorded in the + * previous step. + * d. Put those fragments in the set. + * 4. Check that there are $target_nr fragments remaining in the list, + * and that they all end at or beyond the end of the refcount extent. + * + * If the refcount is correct, all the check conditions in the algorithm + * should always hold true. If not, the refcount is incorrect. + */ +struct xfs_scrub_refcnt_frag { + struct list_head list; + struct xfs_rmap_irec rm; +}; + +struct xfs_scrub_refcnt_check { + struct xfs_scrub_context *sc; + struct list_head fragments; + + /* refcount extent we're examining */ + xfs_agblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + + /* number of owners seen */ + xfs_nlink_t seen; +}; + +/* + * Decide if the given rmap is large enough that we can redeem it + * towards refcount verification now, or if it's a fragment, in + * which case we'll hang onto it in the hopes that we'll later + * discover that we've collected exactly the correct number of + * fragments as the refcountbt says we should have. + */ +STATIC int +xfs_scrub_refcountbt_rmap_check( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_scrub_refcnt_check *refchk = priv; + struct xfs_scrub_refcnt_frag *frag; + xfs_agblock_t rm_last; + xfs_agblock_t rc_last; + int error = 0; + + if (xfs_scrub_should_terminate(refchk->sc, &error)) + return error; + + rm_last = rec->rm_startblock + rec->rm_blockcount - 1; + rc_last = refchk->bno + refchk->len - 1; + + /* Confirm that a single-owner refc extent is a CoW stage. */ + if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) { + xfs_scrub_btree_xref_set_corrupt(refchk->sc, cur, 0); + return 0; + } + + if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) { + /* + * The rmap overlaps the refcount record, so we can confirm + * one refcount owner seen. + */ + refchk->seen++; + } else { + /* + * This rmap covers only part of the refcount record, so + * save the fragment for later processing. If the rmapbt + * is healthy each rmap_irec we see will be in agbno order + * so we don't need insertion sort here. + */ + frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag), + KM_MAYFAIL | KM_NOFS); + if (!frag) + return -ENOMEM; + memcpy(&frag->rm, rec, sizeof(frag->rm)); + list_add_tail(&frag->list, &refchk->fragments); + } + + return 0; +} + +/* + * Given a bunch of rmap fragments, iterate through them, keeping + * a running tally of the refcount. If this ever deviates from + * what we expect (which is the refcountbt's refcount minus the + * number of extents that totally covered the refcountbt extent), + * we have a refcountbt error. + */ +STATIC void +xfs_scrub_refcountbt_process_rmap_fragments( + struct xfs_scrub_refcnt_check *refchk) +{ + struct list_head worklist; + struct xfs_scrub_refcnt_frag *frag; + struct xfs_scrub_refcnt_frag *n; + xfs_agblock_t bno; + xfs_agblock_t rbno; + xfs_agblock_t next_rbno; + xfs_nlink_t nr; + xfs_nlink_t target_nr; + + target_nr = refchk->refcount - refchk->seen; + if (target_nr == 0) + return; + + /* + * There are (refchk->rc.rc_refcount - refchk->nr refcount) + * references we haven't found yet. Pull that many off the + * fragment list and figure out where the smallest rmap ends + * (and therefore the next rmap should start). All the rmaps + * we pull off should start at or before the beginning of the + * refcount record's range. + */ + INIT_LIST_HEAD(&worklist); + rbno = NULLAGBLOCK; + nr = 1; + + /* Make sure the fragments actually /are/ in agbno order. */ + bno = 0; + list_for_each_entry(frag, &refchk->fragments, list) { + if (frag->rm.rm_startblock < bno) + goto done; + bno = frag->rm.rm_startblock; + } + + /* + * Find all the rmaps that start at or before the refc extent, + * and put them on the worklist. + */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + if (frag->rm.rm_startblock > refchk->bno) + goto done; + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno < rbno) + rbno = bno; + list_move_tail(&frag->list, &worklist); + if (nr == target_nr) + break; + nr++; + } + + /* + * We should have found exactly $target_nr rmap fragments starting + * at or before the refcount extent. + */ + if (nr != target_nr) + goto done; + + while (!list_empty(&refchk->fragments)) { + /* Discard any fragments ending at rbno from the worklist. */ + nr = 0; + next_rbno = NULLAGBLOCK; + list_for_each_entry_safe(frag, n, &worklist, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno != rbno) { + if (bno < next_rbno) + next_rbno = bno; + continue; + } + list_del(&frag->list); + kmem_free(frag); + nr++; + } + + /* Try to add nr rmaps starting at rbno to the worklist. */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (frag->rm.rm_startblock != rbno) + goto done; + list_move_tail(&frag->list, &worklist); + if (next_rbno > bno) + next_rbno = bno; + nr--; + if (nr == 0) + break; + } + + /* + * If we get here and nr > 0, this means that we added fewer + * items to the worklist than we discarded because the fragment + * list ran out of items. Therefore, we cannot maintain the + * required refcount. Something is wrong, so we're done. + */ + if (nr) + goto done; + + rbno = next_rbno; + } + + /* + * Make sure the last extent we processed ends at or beyond + * the end of the refcount extent. + */ + if (rbno < refchk->bno + refchk->len) + goto done; + + /* Actually record us having seen the remaining refcount. */ + refchk->seen = refchk->refcount; +done: + /* Delete fragments and work list. */ + list_for_each_entry_safe(frag, n, &worklist, list) { + list_del(&frag->list); + kmem_free(frag); + } + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + list_del(&frag->list); + kmem_free(frag); + } +} + +/* Use the rmap entries covering this extent to verify the refcount. */ +STATIC void +xfs_scrub_refcountbt_xref_rmap( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_nlink_t refcount) +{ + struct xfs_scrub_refcnt_check refchk = { + .sc = sc, + .bno = bno, + .len = len, + .refcount = refcount, + .seen = 0, + }; + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + struct xfs_scrub_refcnt_frag *frag; + struct xfs_scrub_refcnt_frag *n; + int error; + + if (!sc->sa.rmap_cur) + return; + + /* Cross-reference with the rmapbt to confirm the refcount. */ + memset(&low, 0, sizeof(low)); + low.rm_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno + len - 1; + + INIT_LIST_HEAD(&refchk.fragments); + error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, + &xfs_scrub_refcountbt_rmap_check, &refchk); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + goto out_free; + + xfs_scrub_refcountbt_process_rmap_fragments(&refchk); + if (refcount != refchk.seen) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + +out_free: + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { + list_del(&frag->list); + kmem_free(frag); + } +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_refcountbt_xref( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len, + xfs_nlink_t refcount) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, len); + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len); + xfs_scrub_refcountbt_xref_rmap(sc, agbno, len, refcount); +} + /* Scrub a refcountbt record. */ STATIC int xfs_scrub_refcountbt_rec( @@ -57,6 +359,7 @@ xfs_scrub_refcountbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agblock_t *cow_blocks = bs->private; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; @@ -72,6 +375,8 @@ xfs_scrub_refcountbt_rec( has_cowflag = (bno & XFS_REFC_COW_START); if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + if (has_cowflag) + (*cow_blocks) += len; /* Check the extent. */ bno &= ~XFS_REFC_COW_START; @@ -83,17 +388,128 @@ xfs_scrub_refcountbt_rec( if (refcount == 0) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + xfs_scrub_refcountbt_xref(bs->sc, bno, len, refcount); + return error; } +/* Make sure we have as many refc blocks as the rmap says. */ +STATIC void +xfs_scrub_refcount_xref_rmap( + struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, + xfs_filblks_t cow_blocks) +{ + xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t blocks; + int error; + + if (!sc->sa.rmap_cur) + return; + + /* Check that we saw as many refcbt blocks as the rmap knows about. */ + error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks); + if (!xfs_scrub_btree_process_error(sc, sc->sa.refc_cur, 0, &error)) + return; + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != refcbt_blocks) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + + /* Check that we saw as many cow blocks as the rmap knows about. */ + xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW); + error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, + &blocks); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != cow_blocks) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + /* Scrub the refcount btree for some AG. */ int xfs_scrub_refcountbt( struct xfs_scrub_context *sc) { struct xfs_owner_info oinfo; + xfs_agblock_t cow_blocks = 0; + int error; xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); - return xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec, - &oinfo, NULL); + error = xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec, + &oinfo, &cow_blocks); + if (error) + return error; + + xfs_scrub_refcount_xref_rmap(sc, &oinfo, cow_blocks); + + return 0; +} + +/* xref check that a cow staging extent is marked in the refcountbt. */ +void +xfs_scrub_xref_is_cow_staging( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_refcount_irec rc; + bool has_cowflag; + int has_refcount; + int error; + + if (!sc->sa.refc_cur) + return; + + /* Find the CoW staging extent. */ + error = xfs_refcount_lookup_le(sc->sa.refc_cur, + agbno + XFS_REFC_COW_START, &has_refcount); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (!has_refcount) { + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + return; + } + + error = xfs_refcount_get_rec(sc->sa.refc_cur, &rc, &has_refcount); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (!has_refcount) { + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + return; + } + + /* CoW flag must be set, refcount must be 1. */ + has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START); + if (!has_cowflag || rc.rc_refcount != 1) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + + /* Must be at least as long as what was passed in */ + if (rc.rc_blockcount < len) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* + * xref check that the extent is not shared. Only file data blocks + * can have multiple owners. + */ +void +xfs_scrub_xref_is_not_shared( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + bool shared; + int error; + + if (!sc->sa.refc_cur) + return; + + error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (shared) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); } diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 97846c424690..8f2a7c3ff455 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -32,6 +32,7 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_rmap.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -51,6 +52,61 @@ xfs_scrub_setup_ag_rmapbt( /* Reverse-mapping scrubber. */ +/* Cross-reference a rmap against the refcount btree. */ +STATIC void +xfs_scrub_rmapbt_xref_refc( + struct xfs_scrub_context *sc, + struct xfs_rmap_irec *irec) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + bool non_inode; + bool is_bmbt; + bool is_attr; + bool is_unwritten; + int error; + + if (!sc->sa.refc_cur) + return; + + non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + /* If this is shared, must be a data fork extent. */ + error = xfs_refcount_find_shared(sc->sa.refc_cur, irec->rm_startblock, + irec->rm_blockcount, &fbno, &flen, false); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (flen != 0 && (non_inode || is_attr || is_bmbt || is_unwritten)) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* Cross-reference with the other btrees. */ +STATIC void +xfs_scrub_rmapbt_xref( + struct xfs_scrub_context *sc, + struct xfs_rmap_irec *irec) +{ + xfs_agblock_t agbno = irec->rm_startblock; + xfs_extlen_t len = irec->rm_blockcount; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xfs_scrub_xref_is_used_space(sc, agbno, len); + if (irec->rm_owner == XFS_RMAP_OWN_INODES) + xfs_scrub_xref_is_inode_chunk(sc, agbno, len); + else + xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len); + if (irec->rm_owner == XFS_RMAP_OWN_COW) + xfs_scrub_xref_is_cow_staging(sc, irec->rm_startblock, + irec->rm_blockcount); + else + xfs_scrub_rmapbt_xref_refc(sc, irec); +} + /* Scrub an rmapbt record. */ STATIC int xfs_scrub_rmapbt_rec( @@ -121,6 +177,8 @@ xfs_scrub_rmapbt_rec( irec.rm_owner > XFS_RMAP_OWN_FS) xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); } + + xfs_scrub_rmapbt_xref(bs->sc, &irec); out: return error; } @@ -136,3 +194,68 @@ xfs_scrub_rmapbt( return xfs_scrub_btree(sc, sc->sa.rmap_cur, xfs_scrub_rmapbt_rec, &oinfo, NULL); } + +/* xref check that the extent is owned by a given owner */ +static inline void +xfs_scrub_xref_check_owner( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + bool should_have_rmap) +{ + bool has_rmap; + int error; + + if (!sc->sa.rmap_cur) + return; + + error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, + &has_rmap); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (has_rmap != should_have_rmap) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} + +/* xref check that the extent is owned by a given owner */ +void +xfs_scrub_xref_is_owned_by( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + xfs_scrub_xref_check_owner(sc, bno, len, oinfo, true); +} + +/* xref check that the extent is not owned by a given owner */ +void +xfs_scrub_xref_is_not_owned_by( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + xfs_scrub_xref_check_owner(sc, bno, len, oinfo, false); +} + +/* xref check that the extent has no reverse mapping at all */ +void +xfs_scrub_xref_has_no_owner( + struct xfs_scrub_context *sc, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + bool has_rmap; + int error; + + if (!sc->sa.rmap_cur) + return; + + error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); + if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (has_rmap) + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); +} diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index c6fedb698008..26390991369a 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -43,22 +43,14 @@ xfs_scrub_setup_rt( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; - int error = 0; - - /* - * If userspace gave us an AG number or inode data, they don't - * know what they're doing. Get out. - */ - if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) - return -EINVAL; + int error; error = xfs_scrub_setup_fs(sc, ip); if (error) return error; sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP; - sc->ip = mp->m_rbmip; + sc->ip = sc->mp->m_rbmip; xfs_ilock(sc->ip, sc->ilock_flags); return 0; @@ -106,3 +98,26 @@ xfs_scrub_rtsummary( /* XXX: implement this some day */ return -ENOENT; } + + +/* xref check that the extent is not free in the rtbitmap */ +void +xfs_scrub_xref_is_used_rt_space( + struct xfs_scrub_context *sc, + xfs_rtblock_t fsbno, + xfs_extlen_t len) +{ + bool is_free; + int error; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len, + &is_free); + if (!xfs_scrub_should_check_xref(sc, &error, NULL)) + goto out_unlock; + if (is_free) + xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino, + NULL); +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index ab3aef2ae823..26c75967a072 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -110,6 +110,16 @@ * structure itself is corrupt, the CORRUPT flag will be set. If * the metadata is correct but otherwise suboptimal, the PREEN flag * will be set. + * + * We perform secondary validation of filesystem metadata by + * cross-referencing every record with all other available metadata. + * For example, for block mapping extents, we verify that there are no + * records in the free space and inode btrees corresponding to that + * space extent and that there is a corresponding entry in the reverse + * mapping btree. Inconsistent metadata is noted by setting the + * XCORRUPT flag; btree query function errors are noted by setting the + * XFAIL flag and deleting the cursor to prevent further attempts to + * cross-reference with a defective btree. */ /* @@ -128,8 +138,6 @@ xfs_scrub_probe( { int error = 0; - if (sc->sm->sm_ino || sc->sm->sm_agno) - return -EINVAL; if (xfs_scrub_should_terminate(sc, &error)) return error; @@ -151,7 +159,8 @@ xfs_scrub_teardown( sc->tp = NULL; } if (sc->ip) { - xfs_iunlock(sc->ip, sc->ilock_flags); + if (sc->ilock_flags) + xfs_iunlock(sc->ip, sc->ilock_flags); if (sc->ip != ip_in && !xfs_internal_inum(sc->mp, sc->ip->i_ino)) iput(VFS_I(sc->ip)); @@ -167,106 +176,130 @@ xfs_scrub_teardown( /* Scrubbing dispatch. */ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { - { /* ioctl presence test */ + [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */ + .type = ST_NONE, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_probe, }, - { /* superblock */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_SB] = { /* superblock */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_superblock, }, - { /* agf */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_AGF] = { /* agf */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agf, }, - { /* agfl */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agfl, }, - { /* agi */ - .setup = xfs_scrub_setup_ag_header, + [XFS_SCRUB_TYPE_AGI] = { /* agi */ + .type = ST_PERAG, + .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agi, }, - { /* bnobt */ + [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_bnobt, }, - { /* cntbt */ + [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_cntbt, }, - { /* inobt */ + [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_inobt, }, - { /* finobt */ + [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_finobt, .has = xfs_sb_version_hasfinobt, }, - { /* rmapbt */ + [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_rmapbt, .scrub = xfs_scrub_rmapbt, .has = xfs_sb_version_hasrmapbt, }, - { /* refcountbt */ + [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ + .type = ST_PERAG, .setup = xfs_scrub_setup_ag_refcountbt, .scrub = xfs_scrub_refcountbt, .has = xfs_sb_version_hasreflink, }, - { /* inode record */ + [XFS_SCRUB_TYPE_INODE] = { /* inode record */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode, .scrub = xfs_scrub_inode, }, - { /* inode data fork */ + [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_data, }, - { /* inode attr fork */ + [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_attr, }, - { /* inode CoW fork */ + [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ + .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_cow, }, - { /* directory */ + [XFS_SCRUB_TYPE_DIR] = { /* directory */ + .type = ST_INODE, .setup = xfs_scrub_setup_directory, .scrub = xfs_scrub_directory, }, - { /* extended attributes */ + [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ + .type = ST_INODE, .setup = xfs_scrub_setup_xattr, .scrub = xfs_scrub_xattr, }, - { /* symbolic link */ + [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ + .type = ST_INODE, .setup = xfs_scrub_setup_symlink, .scrub = xfs_scrub_symlink, }, - { /* parent pointers */ + [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ + .type = ST_INODE, .setup = xfs_scrub_setup_parent, .scrub = xfs_scrub_parent, }, - { /* realtime bitmap */ + [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ + .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtbitmap, .has = xfs_sb_version_hasrealtime, }, - { /* realtime summary */ + [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ + .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtsummary, .has = xfs_sb_version_hasrealtime, }, - { /* user quota */ - .setup = xfs_scrub_setup_quota, - .scrub = xfs_scrub_quota, + [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ + .type = ST_FS, + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, }, - { /* group quota */ - .setup = xfs_scrub_setup_quota, - .scrub = xfs_scrub_quota, + [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ + .type = ST_FS, + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, }, - { /* project quota */ - .setup = xfs_scrub_setup_quota, - .scrub = xfs_scrub_quota, + [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ + .type = ST_FS, + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, }, }; @@ -284,44 +317,56 @@ xfs_scrub_experimental_warning( "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); } -/* Dispatch metadata scrubbing. */ -int -xfs_scrub_metadata( - struct xfs_inode *ip, +static int +xfs_scrub_validate_inputs( + struct xfs_mount *mp, struct xfs_scrub_metadata *sm) { - struct xfs_scrub_context sc; - struct xfs_mount *mp = ip->i_mount; + int error; const struct xfs_scrub_meta_ops *ops; - bool try_harder = false; - int error = 0; - - trace_xfs_scrub_start(ip, sm, error); - - /* Forbidden if we are shut down or mounted norecovery. */ - error = -ESHUTDOWN; - if (XFS_FORCED_SHUTDOWN(mp)) - goto out; - error = -ENOTRECOVERABLE; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) - goto out; - /* Check our inputs. */ error = -EINVAL; + /* Check our inputs. */ sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) goto out; + /* sm_reserved[] must be zero */ if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) goto out; - /* Do we know about this type of metadata? */ error = -ENOENT; + /* Do we know about this type of metadata? */ if (sm->sm_type >= XFS_SCRUB_TYPE_NR) goto out; ops = &meta_scrub_ops[sm->sm_type]; - if (ops->scrub == NULL) + if (ops->setup == NULL || ops->scrub == NULL) goto out; + /* Does this fs even support this type of metadata? */ + if (ops->has && !ops->has(&mp->m_sb)) + goto out; + + error = -EINVAL; + /* restricting fields must be appropriate for type */ + switch (ops->type) { + case ST_NONE: + case ST_FS: + if (sm->sm_ino || sm->sm_gen || sm->sm_agno) + goto out; + break; + case ST_PERAG: + if (sm->sm_ino || sm->sm_gen || + sm->sm_agno >= mp->m_sb.sb_agcount) + goto out; + break; + case ST_INODE: + if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) + goto out; + break; + default: + goto out; + } + error = -EOPNOTSUPP; /* * We won't scrub any filesystem that doesn't have the ability * to record unwritten extents. The option was made default in @@ -331,20 +376,46 @@ xfs_scrub_metadata( * We also don't support v1-v3 filesystems, which aren't * mountable. */ - error = -EOPNOTSUPP; if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) goto out; - /* Does this fs even support this type of metadata? */ - error = -ENOENT; - if (ops->has && !ops->has(&mp->m_sb)) - goto out; - /* We don't know how to repair anything yet. */ - error = -EOPNOTSUPP; if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) goto out; + error = 0; +out: + return error; +} + +/* Dispatch metadata scrubbing. */ +int +xfs_scrub_metadata( + struct xfs_inode *ip, + struct xfs_scrub_metadata *sm) +{ + struct xfs_scrub_context sc; + struct xfs_mount *mp = ip->i_mount; + bool try_harder = false; + int error = 0; + + BUILD_BUG_ON(sizeof(meta_scrub_ops) != + (sizeof(struct xfs_scrub_meta_ops) * XFS_SCRUB_TYPE_NR)); + + trace_xfs_scrub_start(ip, sm, error); + + /* Forbidden if we are shut down or mounted norecovery. */ + error = -ESHUTDOWN; + if (XFS_FORCED_SHUTDOWN(mp)) + goto out; + error = -ENOTRECOVERABLE; + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + goto out; + + error = xfs_scrub_validate_inputs(mp, sm); + if (error) + goto out; + xfs_scrub_experimental_warning(mp); retry_op: @@ -352,7 +423,7 @@ retry_op: memset(&sc, 0, sizeof(sc)); sc.mp = ip->i_mount; sc.sm = sm; - sc.ops = ops; + sc.ops = &meta_scrub_ops[sm->sm_type]; sc.try_harder = try_harder; sc.sa.agno = NULLAGNUMBER; error = sc.ops->setup(&sc, ip); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index e9ec041cf713..0d92af86f67a 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -22,6 +22,14 @@ struct xfs_scrub_context; +/* Type info and names for the scrub types. */ +enum xfs_scrub_type { + ST_NONE = 1, /* disabled */ + ST_PERAG, /* per-AG metadata */ + ST_FS, /* per-FS metadata */ + ST_INODE, /* per-inode metadata */ +}; + struct xfs_scrub_meta_ops { /* Acquire whatever resources are needed for the operation. */ int (*setup)(struct xfs_scrub_context *, @@ -32,6 +40,9 @@ struct xfs_scrub_meta_ops { /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_sb *); + + /* type describing required/allowed inputs */ + enum xfs_scrub_type type; }; /* Buffer pointers and btree cursors for an entire AG. */ @@ -112,4 +123,30 @@ xfs_scrub_quota(struct xfs_scrub_context *sc) } #endif +/* cross-referencing helpers */ +void xfs_scrub_xref_is_used_space(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_not_inode_chunk(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_inode_chunk(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_owned_by(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); +void xfs_scrub_xref_is_not_owned_by(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); +void xfs_scrub_xref_has_no_owner(struct xfs_scrub_context *sc, + xfs_agblock_t agbno, xfs_extlen_t len); +void xfs_scrub_xref_is_cow_staging(struct xfs_scrub_context *sc, + xfs_agblock_t bno, xfs_extlen_t len); +void xfs_scrub_xref_is_not_shared(struct xfs_scrub_context *sc, + xfs_agblock_t bno, xfs_extlen_t len); +#ifdef CONFIG_XFS_RT +void xfs_scrub_xref_is_used_rt_space(struct xfs_scrub_context *sc, + xfs_rtblock_t rtbno, xfs_extlen_t len); +#else +# define xfs_scrub_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) +#endif + #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c4ebfb5c1ee8..4dc896852bf0 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -50,7 +50,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_class, __entry->flags = sm->sm_flags; __entry->error = error; ), - TP_printk("dev %d:%d ino %llu type %u agno %u inum %llu gen %u flags 0x%x error %d", + TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->type, @@ -90,7 +90,7 @@ TRACE_EVENT(xfs_scrub_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pF", + TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -121,7 +121,7 @@ TRACE_EVENT(xfs_scrub_file_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu error %d ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -156,7 +156,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_block_error_class, __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -207,7 +207,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class, __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu type %u agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx type %u agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->type, @@ -246,7 +246,7 @@ DECLARE_EVENT_CLASS(xfs_scrub_fblock_error_class, __entry->offset = offset; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -277,7 +277,7 @@ TRACE_EVENT(xfs_scrub_incomplete, __entry->type = sc->sm->sm_type; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u ret_ip %pF", + TP_printk("dev %d:%d type %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->ret_ip) @@ -311,7 +311,7 @@ TRACE_EVENT(xfs_scrub_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->btnum, @@ -354,7 +354,7 @@ TRACE_EVENT(xfs_scrub_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -393,7 +393,7 @@ TRACE_EVENT(xfs_scrub_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->btnum, @@ -433,7 +433,7 @@ TRACE_EVENT(xfs_scrub_ifork_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, @@ -491,6 +491,28 @@ DEFINE_EVENT(xfs_scrub_sbtree_class, name, \ DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_rec); DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_key); +TRACE_EVENT(xfs_scrub_xref_error, + TP_PROTO(struct xfs_scrub_context *sc, int error, void *ret_ip), + TP_ARGS(sc, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u xref error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->error, + __entry->ret_ip) +); + #endif /* _TRACE_XFS_SCRUB_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 21e2d70884e1..9c6a830da0ee 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -390,6 +390,19 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; + /* + * Truncate can race with writeback since writeback doesn't take the + * iolock and truncate decreases the file size before it starts + * truncating the pages between new_size and old_size. Therefore, we + * can end up in the situation where writeback gets a CoW fork mapping + * but the truncate makes the mapping invalid and we end up in here + * trying to get a new mapping. Bail out here so that we simply never + * get a valid mapping and so we drop the write altogether. The page + * truncation will kill the contents anyway. + */ + if (type == XFS_IO_COW && offset > i_size_read(inode)) + return 0; + ASSERT(type != XFS_IO_COW); if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; @@ -399,7 +412,7 @@ xfs_map_blocks( (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); - if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - count) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -791,7 +804,7 @@ xfs_aops_discard_page( goto out_invalidate; xfs_alert(ip->i_mount, - "page discard on page %p, inode 0x%llx, offset %llu.", + "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", page, ip->i_ino, offset); xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1312,7 +1325,7 @@ xfs_get_blocks( lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); - if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - size) size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6d37ab43195f..c83f549dc17b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1872,7 +1872,7 @@ xfs_swap_extents( */ lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); lock_flags = XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { @@ -1919,7 +1919,7 @@ xfs_swap_extents( * Lock and join the inodes to the tansaction so that transaction commit * or cancel will unlock the inodes from this point onwards. */ - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL); lock_flags |= XFS_ILOCK_EXCL; xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tip, 0); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4c6e86d861fd..d1da2ee9e6db 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -236,6 +236,7 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); + INIT_LIST_HEAD(&bp->b_li_list); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -585,7 +586,7 @@ _xfs_buf_find( * returning a specific error on buffer lookup failures. */ xfs_alert(btp->bt_mount, - "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + "%s: daddr 0x%llx out of range, EOFS 0x%llx", __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; @@ -1180,13 +1181,14 @@ xfs_buf_ioend_async( } void -xfs_buf_ioerror( +__xfs_buf_ioerror( xfs_buf_t *bp, - int error) + int error, + xfs_failaddr_t failaddr) { ASSERT(error <= 0 && error >= -1000); bp->b_error = error; - trace_xfs_buf_ioerror(bp, error, _RET_IP_); + trace_xfs_buf_ioerror(bp, error, failaddr); } void @@ -1195,8 +1197,9 @@ xfs_buf_ioerror_alert( const char *func) { xfs_alert(bp->b_target->bt_mount, -"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", - (uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); +"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d", + func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, + -bp->b_error); } int @@ -1378,9 +1381,10 @@ _xfs_buf_ioapply( */ if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_warn(mp, - "%s: no ops on block 0x%llx/0x%x", + "%s: no buf ops on daddr 0x%llx len %d", __func__, bp->b_bn, bp->b_length); - xfs_hex_dump(bp->b_addr, 64); + xfs_hex_dump(bp->b_addr, + XFS_CORRUPTION_DUMP_LEN); dump_stack(); } } @@ -1671,7 +1675,7 @@ xfs_wait_buftarg( list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { xfs_alert(btp->bt_mount, -"Corruption Alert: Buffer at block 0x%llx had permanent write failures!", +"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)bp->b_bn); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index f873bb786824..2f4c91452861 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -140,6 +140,7 @@ struct xfs_buf_ops { char *name; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); + xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); }; typedef struct xfs_buf { @@ -175,7 +176,8 @@ typedef struct xfs_buf { struct workqueue_struct *b_ioend_wq; /* I/O completion wq */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ - void *b_fspriv; + void *b_log_item; + struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ @@ -315,7 +317,9 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void xfs_buf_ioend(struct xfs_buf *bp); -extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, + xfs_failaddr_t failaddr); +#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern void xfs_buf_submit(struct xfs_buf *bp); extern int xfs_buf_submit_wait(struct xfs_buf *bp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e0a0af0946f2..270ddb4d2313 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -61,14 +61,14 @@ xfs_buf_log_format_size( */ STATIC void xfs_buf_item_size_segment( - struct xfs_buf_log_item *bip, - struct xfs_buf_log_format *blfp, - int *nvecs, - int *nbytes) + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) { - struct xfs_buf *bp = bip->bli_buf; - int next_bit; - int last_bit; + struct xfs_buf *bp = bip->bli_buf; + int next_bit; + int last_bit; last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (last_bit == -1) @@ -218,12 +218,12 @@ xfs_buf_item_format_segment( uint offset, struct xfs_buf_log_format *blfp) { - struct xfs_buf *bp = bip->bli_buf; - uint base_size; - int first_bit; - int last_bit; - int next_bit; - uint nbits; + struct xfs_buf *bp = bip->bli_buf; + uint base_size; + int first_bit; + int last_bit; + int next_bit; + uint nbits; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -406,12 +406,12 @@ xfs_buf_item_unpin( int remove) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - xfs_buf_t *bp = bip->bli_buf; - struct xfs_ail *ailp = lip->li_ailp; - int stale = bip->bli_flags & XFS_BLI_STALE; - int freed; + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; + int stale = bip->bli_flags & XFS_BLI_STALE; + int freed; - ASSERT(bp->b_fspriv == bip); + ASSERT(bp->b_log_item == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_buf_item_unpin(bip); @@ -456,13 +456,14 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); - bp->b_fspriv = NULL; + bp->b_log_item = NULL; + list_del_init(&bp->b_li_list); bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - ASSERT(bp->b_fspriv == NULL); + ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); } else if (freed && remove) { @@ -722,18 +723,15 @@ xfs_buf_item_free_format( /* * Allocate a new buf log item to go with the given buffer. - * Set the buffer's b_fsprivate field to point to the new - * buf log item. If there are other item's attached to the - * buffer (see xfs_buf_attach_iodone() below), then put the - * buf log item at the front. + * Set the buffer's b_log_item field to point to the new + * buf log item. */ int xfs_buf_item_init( struct xfs_buf *bp, struct xfs_mount *mp) { - struct xfs_log_item *lip = bp->b_fspriv; - struct xfs_buf_log_item *bip; + struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; int error; @@ -741,13 +739,14 @@ xfs_buf_item_init( /* * Check to see if there is already a buf log item for - * this buffer. If there is, it is guaranteed to be - * the first. If we do already have one, there is + * this buffer. If we do already have one, there is * nothing to do here so return. */ ASSERT(bp->b_target->bt_mount == mp); - if (lip != NULL && lip->li_type == XFS_LI_BUF) + if (bip != NULL) { + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); return 0; + } bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); @@ -781,13 +780,7 @@ xfs_buf_item_init( bip->bli_formats[i].blf_map_size = map_size; } - /* - * Put the buf item into the list of items attached to the - * buffer at the front. - */ - if (bp->b_fspriv) - bip->bli_item.li_bio_list = bp->b_fspriv; - bp->b_fspriv = bip; + bp->b_log_item = bip; xfs_buf_hold(bp); return 0; } @@ -880,7 +873,7 @@ xfs_buf_item_log_segment( */ void xfs_buf_item_log( - xfs_buf_log_item_t *bip, + struct xfs_buf_log_item *bip, uint first, uint last) { @@ -943,7 +936,7 @@ xfs_buf_item_dirty_format( STATIC void xfs_buf_item_free( - xfs_buf_log_item_t *bip) + struct xfs_buf_log_item *bip) { xfs_buf_item_free_format(bip); kmem_free(bip->bli_item.li_lv_shadow); @@ -961,13 +954,13 @@ void xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); - bp->b_fspriv = bip->bli_item.li_bio_list; - if (bp->b_fspriv == NULL) + bp->b_log_item = NULL; + if (list_empty(&bp->b_li_list)) bp->b_iodone = NULL; xfs_buf_rele(bp); @@ -980,9 +973,7 @@ xfs_buf_item_relse( * to be called when the buffer's I/O completes. If it is not set * already, set the buffer's b_iodone() routine to be * xfs_buf_iodone_callbacks() and link the log item into the list of - * items rooted at b_fsprivate. Items are always added as the second - * entry in the list if there is a first, because the buf item code - * assumes that the buf log item is first. + * items rooted at b_li_list. */ void xfs_buf_attach_iodone( @@ -990,18 +981,10 @@ xfs_buf_attach_iodone( void (*cb)(xfs_buf_t *, xfs_log_item_t *), xfs_log_item_t *lip) { - xfs_log_item_t *head_lip; - ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; - head_lip = bp->b_fspriv; - if (head_lip) { - lip->li_bio_list = head_lip->li_bio_list; - head_lip->li_bio_list = lip; - } else { - bp->b_fspriv = lip; - } + list_add_tail(&lip->li_bio_list, &bp->b_li_list); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -1011,12 +994,12 @@ xfs_buf_attach_iodone( /* * We can have many callbacks on a buffer. Running the callbacks individually * can cause a lot of contention on the AIL lock, so we allow for a single - * callback to be able to scan the remaining lip->li_bio_list for other items - * of the same type and callback to be processed in the first call. + * callback to be able to scan the remaining items in bp->b_li_list for other + * items of the same type and callback to be processed in the first call. * * As a result, the loop walking the callback list below will also modify the * list. it removes the first item from the list and then runs the callback. - * The loop then restarts from the new head of the list. This allows the + * The loop then restarts from the new first item int the list. This allows the * callback to scan and modify the list attached to the buffer and we don't * have to care about maintaining a next item pointer. */ @@ -1024,18 +1007,26 @@ STATIC void xfs_buf_do_callbacks( struct xfs_buf *bp) { + struct xfs_buf_log_item *blip = bp->b_log_item; struct xfs_log_item *lip; - while ((lip = bp->b_fspriv) != NULL) { - bp->b_fspriv = lip->li_bio_list; - ASSERT(lip->li_cb != NULL); + /* If there is a buf_log_item attached, run its callback */ + if (blip) { + lip = &blip->bli_item; + lip->li_cb(bp, lip); + } + + while (!list_empty(&bp->b_li_list)) { + lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, + li_bio_list); + /* - * Clear the next pointer so we don't have any + * Remove the item from the list, so we don't have any * confusion if the item is added to another buf. * Don't touch the log item after calling its * callback, because it could have freed itself. */ - lip->li_bio_list = NULL; + list_del_init(&lip->li_bio_list); lip->li_cb(bp, lip); } } @@ -1052,13 +1043,22 @@ STATIC void xfs_buf_do_callbacks_fail( struct xfs_buf *bp) { - struct xfs_log_item *next; - struct xfs_log_item *lip = bp->b_fspriv; - struct xfs_ail *ailp = lip->li_ailp; + struct xfs_log_item *lip; + struct xfs_ail *ailp; + /* + * Buffer log item errors are handled directly by xfs_buf_item_push() + * and xfs_buf_iodone_callback_error, and they have no IO error + * callbacks. Check only for items in b_li_list. + */ + if (list_empty(&bp->b_li_list)) + return; + + lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, + li_bio_list); + ailp = lip->li_ailp; spin_lock(&ailp->xa_lock); - for (; lip; lip = next) { - next = lip->li_bio_list; + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_ops->iop_error) lip->li_ops->iop_error(lip, bp); } @@ -1069,13 +1069,23 @@ static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) { - struct xfs_log_item *lip = bp->b_fspriv; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_log_item *lip; + struct xfs_mount *mp; static ulong lasttime; static xfs_buftarg_t *lasttarg; struct xfs_error_cfg *cfg; /* + * The failed buffer might not have a buf_log_item attached or the + * log_item list might be empty. Get the mp from the available + * xfs_log_item + */ + lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item, + li_bio_list); + mp = lip ? lip->li_mountp : bip->bli_item.li_mountp; + + /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ @@ -1183,7 +1193,8 @@ xfs_buf_iodone_callbacks( bp->b_first_retry_time = 0; xfs_buf_do_callbacks(bp); - bp->b_fspriv = NULL; + bp->b_log_item = NULL; + list_del_init(&bp->b_li_list); bp->b_iodone = NULL; xfs_buf_ioend(bp); } @@ -1228,10 +1239,9 @@ xfs_buf_iodone( bool xfs_buf_resubmit_failed_buffers( struct xfs_buf *bp, - struct xfs_log_item *lip, struct list_head *buffer_list) { - struct xfs_log_item *next; + struct xfs_log_item *lip; /* * Clear XFS_LI_FAILED flag from all items before resubmit @@ -1239,10 +1249,8 @@ xfs_buf_resubmit_failed_buffers( * XFS_LI_FAILED set/clear is protected by xa_lock, caller this * function already have it acquired */ - for (; lip; lip = next) { - next = lip->li_bio_list; + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - } /* Add this buffer back to the delayed write list */ return xfs_buf_delwri_queue(bp, buffer_list); diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 9690ce62c9a7..643f53dcfe51 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -50,7 +50,7 @@ struct xfs_buf_log_item; * needed to log buffers. It tracks how many times the lock has been * locked, and which 128 byte chunks of the buffer are dirty. */ -typedef struct xfs_buf_log_item { +struct xfs_buf_log_item { xfs_log_item_t bli_item; /* common item structure */ struct xfs_buf *bli_buf; /* real buffer pointer */ unsigned int bli_flags; /* misc flags */ @@ -59,11 +59,11 @@ typedef struct xfs_buf_log_item { int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ struct xfs_buf_log_format __bli_format; /* embedded in-log header */ -} xfs_buf_log_item_t; +}; int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); -void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), @@ -71,7 +71,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, - struct xfs_log_item *, struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 0c58918bc0ad..b6ae3597bfb0 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -152,7 +152,6 @@ xfs_dir2_block_getdents( struct xfs_inode *dp = args->dp; /* incore directory inode */ xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ - xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_dir2_data_unused_t *dup; /* block unused entry */ char *endptr; /* end of the data entries */ @@ -185,9 +184,8 @@ xfs_dir2_block_getdents( /* * Set up values for the loop. */ - btp = xfs_dir2_block_tail_p(geo, hdr); ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); + endptr = xfs_dir3_data_endp(geo, hdr); /* * Loop over the data portion of the block. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index f248708c10ff..43572f8a1b8e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -399,52 +399,6 @@ error0: return error; } -STATIC int -xfs_qm_dqrepair( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_dquot *dqp, - xfs_dqid_t firstid, - struct xfs_buf **bpp) -{ - int error; - struct xfs_disk_dquot *ddq; - struct xfs_dqblk *d; - int i; - - /* - * Read the buffer without verification so we get the corrupted - * buffer returned to us. make sure we verify it on write, though. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0, bpp, NULL); - - if (error) { - ASSERT(*bpp == NULL); - return error; - } - (*bpp)->b_ops = &xfs_dquot_buf_ops; - - ASSERT(xfs_buf_islocked(*bpp)); - d = (struct xfs_dqblk *)(*bpp)->b_addr; - - /* Do the actual repair of dquots in this buffer */ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - ddq = &d[i].dd_diskdq; - error = xfs_dqcheck(mp, ddq, firstid + i, - dqp->dq_flags & XFS_DQ_ALLTYPES, - XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); - if (error) { - /* repair failed, we're screwed */ - xfs_trans_brelse(tp, *bpp); - return -EIO; - } - } - - return 0; -} - /* * Maps a dquot to the buffer containing its on-disk version. * This returns a ptr to the buffer containing the on-disk dquot @@ -526,14 +480,6 @@ xfs_qm_dqtobp( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - - if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { - xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * - mp->m_quotainfo->qi_dqperchunk; - ASSERT(bp == NULL); - error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); - } - if (error) { ASSERT(bp == NULL); return error; @@ -1010,6 +956,7 @@ xfs_qm_dqflush( struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; struct xfs_disk_dquot *ddqp; + xfs_failaddr_t fa; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -1056,9 +1003,10 @@ xfs_qm_dqflush( /* * A simple sanity check in case we got a corrupted dquot.. */ - error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, - XFS_QMOPT_DOWARN, "dqflush (incore copy)"); - if (error) { + fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", + be32_to_cpu(ddqp->d_id), fa); xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 664dea105e76..96eaa6933709 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -150,10 +150,7 @@ xfs_dquot_item_error( struct xfs_log_item *lip, struct xfs_buf *bp) { - struct xfs_dquot *dqp; - - dqp = DQUOT_ITEM(lip)->qli_dquot; - ASSERT(!completion_done(&dqp->q_flush)); + ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush)); xfs_set_li_failed(lip, bp); } @@ -179,7 +176,7 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -212,7 +209,7 @@ xfs_qm_dquot_logitem_push( error = xfs_qm_dqflush(dqp, &bp); if (error) { - xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT, __func__, error, dqp); } else { if (!xfs_buf_delwri_queue(bp, buffer_list)) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 4c9f35d983b2..ccf520f0b00d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -24,6 +24,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" +#include "xfs_inode.h" #ifdef DEBUG @@ -314,12 +315,12 @@ xfs_error_report( struct xfs_mount *mp, const char *filename, int linenum, - void *ra) + xfs_failaddr_t failaddr) { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, "Internal error %s at line %d of file %s. Caller %pS", - tag, linenum, filename, ra); + tag, linenum, filename, failaddr); xfs_stack_trace(); } @@ -333,11 +334,11 @@ xfs_corruption_error( void *p, const char *filename, int linenum, - void *ra) + xfs_failaddr_t failaddr) { if (level <= xfs_error_level) - xfs_hex_dump(p, 64); - xfs_error_report(tag, level, mp, filename, linenum, ra); + xfs_hex_dump(p, XFS_CORRUPTION_DUMP_LEN); + xfs_error_report(tag, level, mp, filename, linenum, failaddr); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } @@ -347,19 +348,62 @@ xfs_corruption_error( */ void xfs_verifier_error( - struct xfs_buf *bp) + struct xfs_buf *bp, + int error, + xfs_failaddr_t failaddr) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_failaddr_t fa; + + fa = failaddr ? failaddr : __return_address; + __xfs_buf_ioerror(bp, error, fa); xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - __return_address, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); if (xfs_error_level >= XFS_ERRLEVEL_LOW) { - xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); - xfs_hex_dump(xfs_buf_offset(bp, 0), 64); + xfs_alert(mp, "First %d bytes of corrupted metadata buffer:", + XFS_CORRUPTION_DUMP_LEN); + xfs_hex_dump(xfs_buf_offset(bp, 0), XFS_CORRUPTION_DUMP_LEN); + } + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} + +/* + * Warnings for inode corruption problems. Don't bother with the stack + * trace unless the error level is turned up high. + */ +void +xfs_inode_verifier_error( + struct xfs_inode *ip, + int error, + const char *name, + void *buf, + size_t bufsz, + xfs_failaddr_t failaddr) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_failaddr_t fa; + int sz; + + fa = failaddr ? failaddr : __return_address; + + xfs_alert(mp, "Metadata %s detected at %pS, inode 0x%llx %s", + error == -EFSBADCRC ? "CRC error" : "corruption", + fa, ip->i_ino, name); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (buf && xfs_error_level >= XFS_ERRLEVEL_LOW) { + sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz); + xfs_alert(mp, "First %d bytes of corrupted metadata buffer:", + sz); + xfs_hex_dump(buf, sz); } if (xfs_error_level >= XFS_ERRLEVEL_HIGH) diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index ea816c1bf8db..7e728c5a46b8 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -21,11 +21,16 @@ struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, void *ra); + const char *filename, int linenum, + xfs_failaddr_t failaddr); extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, - int linenum, void *ra); -extern void xfs_verifier_error(struct xfs_buf *bp); + int linenum, xfs_failaddr_t failaddr); +extern void xfs_verifier_error(struct xfs_buf *bp, int error, + xfs_failaddr_t failaddr); +extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, + const char *name, void *buf, size_t bufsz, + xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -37,6 +42,9 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRLEVEL_LOW 1 #define XFS_ERRLEVEL_HIGH 5 +/* Dump 128 bytes of any corrupt buffer */ +#define XFS_CORRUPTION_DUMP_LEN (128) + /* * Macros to set EFSCORRUPTED & return/branch. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 44f8c5451210..64da90655e95 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -538,7 +538,7 @@ xfs_efi_recover( return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xfs_rmap_skip_owner_update(&oinfo); + xfs_rmap_any_owner_update(&oinfo); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8601275cc5e6..9ea08326f876 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1048,7 +1048,7 @@ __xfs_filemap_fault( if (IS_DAX(inode)) { pfn_t pfn; - ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8f22fc579dbb..8b4545623e25 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -49,83 +49,6 @@ * File system operations */ -int -xfs_fs_geometry( - xfs_mount_t *mp, - xfs_fsop_geom_t *geo, - int new_version) -{ - - memset(geo, 0, sizeof(*geo)); - - geo->blocksize = mp->m_sb.sb_blocksize; - geo->rtextsize = mp->m_sb.sb_rextsize; - geo->agblocks = mp->m_sb.sb_agblocks; - geo->agcount = mp->m_sb.sb_agcount; - geo->logblocks = mp->m_sb.sb_logblocks; - geo->sectsize = mp->m_sb.sb_sectsize; - geo->inodesize = mp->m_sb.sb_inodesize; - geo->imaxpct = mp->m_sb.sb_imax_pct; - geo->datablocks = mp->m_sb.sb_dblocks; - geo->rtblocks = mp->m_sb.sb_rblocks; - geo->rtextents = mp->m_sb.sb_rextents; - geo->logstart = mp->m_sb.sb_logstart; - ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid)); - memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid)); - if (new_version >= 2) { - geo->sunit = mp->m_sb.sb_unit; - geo->swidth = mp->m_sb.sb_width; - } - if (new_version >= 3) { - geo->version = XFS_FSOP_GEOM_VERSION; - geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | - XFS_FSOP_GEOM_FLAGS_DIRV2 | - (xfs_sb_version_hasattr(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR : 0) | - (xfs_sb_version_hasquota(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | - (xfs_sb_version_hasalign(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | - (xfs_sb_version_hasdalign(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | - (xfs_sb_version_hasextflgbit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | - (xfs_sb_version_hassector(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | - (xfs_sb_version_hasasciici(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | - (xfs_sb_version_haslazysbcount(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | - (xfs_sb_version_hasattr2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | - (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | - (xfs_sb_version_hascrc(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_V5SB : 0) | - (xfs_sb_version_hasftype(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | - (xfs_sb_version_hasfinobt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | - (xfs_sb_version_hassparseinodes(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | - (xfs_sb_version_hasrmapbt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) | - (xfs_sb_version_hasreflink(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_REFLINK : 0); - geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? - mp->m_sb.sb_logsectsize : BBSIZE; - geo->rtsectsize = mp->m_sb.sb_blocksize; - geo->dirblocksize = mp->m_dir_geo->blksize; - } - if (new_version >= 4) { - geo->flags |= - (xfs_sb_version_haslogv2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); - geo->logsunit = mp->m_sb.sb_logsunit; - } - return 0; -} - static struct xfs_buf * xfs_growfs_get_hdr_buf( struct xfs_mount *mp, @@ -571,6 +494,11 @@ xfs_growfs_data_private( * this doesn't actually exist in the rmap btree. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, agno, + be32_to_cpu(agf->agf_length) - new, + new, &oinfo); + if (error) + goto error0; error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), @@ -950,7 +878,7 @@ xfs_do_force_shutdown( if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, __func__, flags, lnnum, fname, __return_address); } /* diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2954c13a3acd..20484ed5e919 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -18,7 +18,6 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 43005fbe8b1e..d53a316162d6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -37,6 +37,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/iversion.h> /* * Allocate and initialise an xfs_inode. @@ -293,15 +294,17 @@ xfs_reinit_inode( int error; uint32_t nlink = inode->i_nlink; uint32_t generation = inode->i_generation; - uint64_t version = inode->i_version; + uint64_t version = inode_peek_iversion(inode); umode_t mode = inode->i_mode; + dev_t dev = inode->i_rdev; error = inode_init_always(mp->m_super, inode); set_nlink(inode, nlink); inode->i_generation = generation; - inode->i_version = version; + inode_set_iversion_queried(inode, version); inode->i_mode = mode; + inode->i_rdev = dev; return error; } @@ -473,6 +476,11 @@ xfs_iget_cache_miss( if (error) goto out_destroy; + if (!xfs_inode_verify_forks(ip)) { + error = -EFSCORRUPTED; + goto out_destroy; + } + trace_xfs_iget_miss(ip); if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { @@ -870,7 +878,7 @@ xfs_eofblocks_worker( * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). * (We'll just piggyback on the post-EOF prealloc space workqueue.) */ -STATIC void +void xfs_queue_cowblocks( struct xfs_mount *mp) { @@ -1536,8 +1544,23 @@ xfs_inode_free_quota_eofblocks( return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); } +static inline unsigned long +xfs_iflag_for_tag( + int tag) +{ + switch (tag) { + case XFS_ICI_EOFBLOCKS_TAG: + return XFS_IEOFBLOCKS; + case XFS_ICI_COWBLOCKS_TAG: + return XFS_ICOWBLOCKS; + default: + ASSERT(0); + return 0; + } +} + static void -__xfs_inode_set_eofblocks_tag( +__xfs_inode_set_blocks_tag( xfs_inode_t *ip, void (*execute)(struct xfs_mount *mp), void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1552,10 +1575,10 @@ __xfs_inode_set_eofblocks_tag( * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set. */ - if (ip->i_flags & XFS_IEOFBLOCKS) + if (ip->i_flags & xfs_iflag_for_tag(tag)) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= XFS_IEOFBLOCKS; + ip->i_flags |= xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1587,13 +1610,13 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, trace_xfs_perag_set_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } static void -__xfs_inode_clear_eofblocks_tag( +__xfs_inode_clear_blocks_tag( xfs_inode_t *ip, void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, int error, unsigned long caller_ip), @@ -1603,7 +1626,7 @@ __xfs_inode_clear_eofblocks_tag( struct xfs_perag *pag; spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~XFS_IEOFBLOCKS; + ip->i_flags &= ~xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1630,33 +1653,20 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } /* - * Automatic CoW Reservation Freeing - * - * These functions automatically garbage collect leftover CoW reservations - * that were made on behalf of a cowextsize hint when we start to run out - * of quota or when the reservations sit around for too long. If the file - * has dirty pages or is undergoing writeback, its CoW reservations will - * be retained. - * - * The actual garbage collection piggybacks off the same code that runs - * the speculative EOF preallocation garbage collector. + * Set ourselves up to free CoW blocks from this file. If it's already clean + * then we can bail out quickly, but otherwise we must back off if the file + * is undergoing some kind of write. */ -STATIC int -xfs_inode_free_cowblocks( +static bool +xfs_prep_free_cowblocks( struct xfs_inode *ip, - int flags, - void *args) + struct xfs_ifork *ifp) { - int ret; - struct xfs_eofblocks *eofb = args; - int match; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1664,7 +1674,7 @@ xfs_inode_free_cowblocks( if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); - return 0; + return false; } /* @@ -1675,6 +1685,35 @@ xfs_inode_free_cowblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&VFS_I(ip)->i_dio_count)) + return false; + + return true; +} + +/* + * Automatic CoW Reservation Freeing + * + * These functions automatically garbage collect leftover CoW reservations + * that were made on behalf of a cowextsize hint when we start to run out + * of quota or when the reservations sit around for too long. If the file + * has dirty pages or is undergoing writeback, its CoW reservations will + * be retained. + * + * The actual garbage collection piggybacks off the same code that runs + * the speculative EOF preallocation garbage collector. + */ +STATIC int +xfs_inode_free_cowblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + struct xfs_eofblocks *eofb = args; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + int match; + int ret = 0; + + if (!xfs_prep_free_cowblocks(ip, ifp)) return 0; if (eofb) { @@ -1695,7 +1734,12 @@ xfs_inode_free_cowblocks( xfs_ilock(ip, XFS_IOLOCK_EXCL); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); + /* + * Check again, nobody else should be able to dirty blocks or change + * the reflink iflag now that we have the first two locks held. + */ + if (xfs_prep_free_cowblocks(ip, ifp)) + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -1724,7 +1768,7 @@ xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, trace_xfs_perag_set_cowblocks, XFS_ICI_COWBLOCKS_TAG); } @@ -1734,6 +1778,6 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index bff4d85e5498..d4a77588eca1 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -81,6 +81,7 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); +void xfs_queue_cowblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b41952a4ddd8..604ee384a00a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -16,6 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/log2.h> +#include <linux/iversion.h> #include "xfs.h" #include "xfs_fs.h" @@ -546,23 +547,36 @@ again: /* * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - - * the iolock, the mmaplock or the ilock, but not more than one at a time. If we - * lock more than one at a time, lockdep will report false positives saying we - * have violated locking orders. + * the mmaplock or the ilock, but not more than one type at a time. If we lock + * more than one at a time, lockdep will report false positives saying we have + * violated locking orders. The iolock must be double-locked separately since + * we use i_rwsem for that. We now support taking one lock EXCL and the other + * SHARED. */ void xfs_lock_two_inodes( - xfs_inode_t *ip0, - xfs_inode_t *ip1, - uint lock_mode) + struct xfs_inode *ip0, + uint ip0_mode, + struct xfs_inode *ip1, + uint ip1_mode) { - xfs_inode_t *temp; + struct xfs_inode *temp; + uint mode_temp; int attempts = 0; xfs_log_item_t *lp; - ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); - if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(hweight32(ip0_mode) == 1); + ASSERT(hweight32(ip1_mode) == 1); + ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || + !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -570,10 +584,13 @@ xfs_lock_two_inodes( temp = ip0; ip0 = ip1; ip1 = temp; + mode_temp = ip0_mode; + ip0_mode = ip1_mode; + ip1_mode = mode_temp; } again: - xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0)); /* * If the first lock we have locked is in the AIL, we must TRY to get @@ -582,18 +599,17 @@ xfs_lock_two_inodes( */ lp = (xfs_log_item_t *)ip0->i_itemp; if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { - xfs_iunlock(ip0, lock_mode); + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) { + xfs_iunlock(ip0, ip0_mode); if ((++attempts % 5) == 0) delay(1); /* Don't just spin the CPU */ goto again; } } else { - xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1)); } } - void __xfs_iflock( struct xfs_inode *ip) @@ -832,7 +848,7 @@ xfs_ialloc( ip->i_d.di_flags = 0; if (ip->i_d.di_version == 3) { - inode->i_version = 1; + inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; @@ -1421,7 +1437,7 @@ xfs_link( if (error) goto std_return; - xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); @@ -1487,6 +1503,24 @@ xfs_link( return error; } +/* Clear the reflink flag and the cowblocks tag if possible. */ +static void +xfs_itruncate_clear_reflink_flags( + struct xfs_inode *ip) +{ + struct xfs_ifork *dfork; + struct xfs_ifork *cfork; + + if (!xfs_is_reflink_inode(ip)) + return; + dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + if (dfork->if_bytes == 0 && cfork->if_bytes == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (cfork->if_bytes == 0) + xfs_inode_clear_cowblocks_tag(ip); +} + /* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and @@ -1583,15 +1617,7 @@ xfs_itruncate_extents( if (error) goto out; - /* - * Clear the reflink flag if there are no data fork blocks and - * there are no extents staged in the cow fork. - */ - if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { - if (ip->i_d.di_nblocks == 0) - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; - xfs_inode_clear_cowblocks_tag(ip); - } + xfs_itruncate_clear_reflink_flags(ip); /* * Always re-log the inode so that our permanent transaction can keep @@ -2204,7 +2230,7 @@ xfs_ifree_cluster( xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; - xfs_log_item_t *lip; + struct xfs_log_item *lip; struct xfs_perag *pag; xfs_ino_t inum; @@ -2262,8 +2288,7 @@ xfs_ifree_cluster( * stale first, we will not attempt to lock them in the loop * below as the XFS_ISTALE flag will be set. */ - lip = bp->b_fspriv; - while (lip) { + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { iip = (xfs_inode_log_item_t *)lip; ASSERT(iip->ili_logged == 1); @@ -2273,7 +2298,6 @@ xfs_ifree_cluster( &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); } - lip = lip->li_bio_list; } @@ -2441,6 +2465,7 @@ xfs_ifree( VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; + ip->i_d.di_flags2 = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; @@ -2576,7 +2601,7 @@ xfs_remove( goto std_return; } - xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -3469,6 +3494,36 @@ abort_out: return error; } +/* + * If there are inline format data / attr forks attached to this inode, + * make sure they're not corrupt. + */ +bool +xfs_inode_verify_forks( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp; + xfs_failaddr_t fa; + + fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops); + if (fa) { + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", + ifp->if_u1.if_data, ifp->if_bytes, fa); + return false; + } + + fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops); + if (fa) { + ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", + ifp ? ifp->if_u1.if_data : NULL, + ifp ? ifp->if_bytes : 0, fa); + return false; + } + return true; +} + STATIC int xfs_iflush_int( struct xfs_inode *ip, @@ -3491,7 +3546,7 @@ xfs_iflush_int( if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", + "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto corrupt_out; } @@ -3501,7 +3556,7 @@ xfs_iflush_int( (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad regular inode %Lu, ptr 0x%p", + "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto corrupt_out; } @@ -3512,7 +3567,7 @@ xfs_iflush_int( (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad directory inode %Lu, ptr 0x%p", + "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto corrupt_out; } @@ -3521,7 +3576,7 @@ xfs_iflush_int( ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " - "total extents = %d, nblocks = %Ld, ptr 0x%p", + "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_nextents + ip->i_d.di_anextents, ip->i_d.di_nblocks, ip); @@ -3530,7 +3585,7 @@ xfs_iflush_int( if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } @@ -3547,10 +3602,8 @@ xfs_iflush_int( if (ip->i_d.di_version < 3) ip->i_d.di_flushiter++; - /* Check the inline directory data. */ - if (S_ISDIR(VFS_I(ip)->i_mode) && - ip->i_d.di_format == XFS_DINODE_FMT_LOCAL && - xfs_dir2_sf_verify(ip)) + /* Check the inline fork data before we write out. */ + if (!xfs_inode_verify_forks(ip)) goto corrupt_out; /* @@ -3613,7 +3666,7 @@ xfs_iflush_int( /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); - ASSERT(bp->b_fspriv != NULL); + ASSERT(!list_empty(&bp->b_li_list)); ASSERT(bp->b_iodone != NULL); return 0; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b2136af9289f..3e8dc990d41c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -232,6 +232,7 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * log recovery to replay a bmap operation on the inode. */ #define XFS_IRECOVERY (1 << 11) +#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -422,7 +423,8 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); +void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, + struct xfs_inode *ip1, uint ip1_mode); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); @@ -490,4 +492,6 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 +bool xfs_inode_verify_forks(struct xfs_inode *ip); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6ee5c3bf19ad..d5037f060d6f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -30,6 +30,7 @@ #include "xfs_buf_item.h" #include "xfs_log.h" +#include <linux/iversion.h> kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -354,7 +355,7 @@ xfs_inode_to_log_dinode( to->di_next_unlinked = NULLAGINO; if (from->di_version == 3) { - to->di_changecount = inode->i_version; + to->di_changecount = inode_peek_iversion(inode); to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; @@ -521,7 +522,7 @@ xfs_inode_item_push( if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -712,37 +713,23 @@ xfs_iflush_done( struct xfs_log_item *lip) { struct xfs_inode_log_item *iip; - struct xfs_log_item *blip; - struct xfs_log_item *next; - struct xfs_log_item *prev; + struct xfs_log_item *blip, *n; struct xfs_ail *ailp = lip->li_ailp; int need_ail = 0; + LIST_HEAD(tmp); /* * Scan the buffer IO completions for other inodes being completed and * attach them to the current inode log item. */ - blip = bp->b_fspriv; - prev = NULL; - while (blip != NULL) { - if (blip->li_cb != xfs_iflush_done) { - prev = blip; - blip = blip->li_bio_list; - continue; - } - /* remove from list */ - next = blip->li_bio_list; - if (!prev) { - bp->b_fspriv = next; - } else { - prev->li_bio_list = next; - } + list_add_tail(&lip->li_bio_list, &tmp); - /* add to current list */ - blip->li_bio_list = lip->li_bio_list; - lip->li_bio_list = blip; + list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) { + if (lip->li_cb != xfs_iflush_done) + continue; + list_move_tail(&blip->li_bio_list, &tmp); /* * while we have the item, do the unlocked check for needing * the AIL lock. @@ -751,8 +738,6 @@ xfs_iflush_done( if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || (blip->li_flags & XFS_LI_FAILED)) need_ail++; - - blip = next; } /* make sure we capture the state of the initial inode. */ @@ -775,7 +760,7 @@ xfs_iflush_done( /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->xa_lock); - for (blip = lip; blip; blip = blip->li_bio_list) { + list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) mlip_changed |= xfs_ail_delete_one(ailp, blip); @@ -801,15 +786,14 @@ xfs_iflush_done( * ili_last_fields bits now that we know that the data corresponding to * them is safely on disk. */ - for (blip = lip; blip; blip = next) { - next = blip->li_bio_list; - blip->li_bio_list = NULL; - + list_for_each_entry_safe(blip, n, &tmp, li_bio_list) { + list_del_init(&blip->li_bio_list); iip = INODE_ITEM(blip); iip->ili_logged = 0; iip->ili_last_fields = 0; xfs_ifunlock(iip->ili_inode); } + list_del(&tmp); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 20dc65fef6a4..89fb1eb80aae 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -45,6 +45,7 @@ #include <linux/fsmap.h> #include "xfs_fsmap.h" #include "scrub/xfs_scrub.h" +#include "xfs_sb.h" #include <linux/capability.h> #include <linux/cred.h> @@ -809,7 +810,7 @@ xfs_ioc_fsgeometry_v1( xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, &fsgeo, 3); + error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); if (error) return error; @@ -831,7 +832,7 @@ xfs_ioc_fsgeometry( xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, &fsgeo, 4); + error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 4); if (error) return error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 35c79e246fde..10fbde359649 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -37,6 +37,7 @@ #include "xfs_ioctl.h" #include "xfs_ioctl32.h" #include "xfs_trace.h" +#include "xfs_sb.h" #define _NATIVE_IOC(cmd, type) \ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) @@ -66,7 +67,7 @@ xfs_compat_ioc_fsgeometry_v1( xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, &fsgeo, 3); + error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); if (error) return error; /* The 32-bit variant simply has some padding at the end */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7ab52a8bc0a9..66e1edbfb2b2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1006,7 +1006,7 @@ xfs_file_iomap_begin( } ASSERT(offset <= mp->m_super->s_maxbytes); - if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - length) length = mp->m_super->s_maxbytes - offset; offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 99562ec0de56..bee51a14a906 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -285,8 +285,22 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #define XFS_IS_REALTIME_INODE(ip) \ (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ (ip)->i_mount->m_rtdev_targp) +#define XFS_IS_REALTIME_MOUNT(mp) ((mp)->m_rtdev_targp ? 1 : 0) #else #define XFS_IS_REALTIME_INODE(ip) (0) +#define XFS_IS_REALTIME_MOUNT(mp) (0) +#endif + +/* + * Starting in Linux 4.15, the %p (raw pointer value) printk modifier + * prints a hashed version of the pointer to avoid leaking kernel + * pointers into dmesg. If we're trying to debug the kernel we want the + * raw values, so override this behavior as best we can. + */ +#ifdef DEBUG +# define PTR_FMT "%px" +#else +# define PTR_FMT "%p" #endif #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a503af96d780..3e5ba1ecc080 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1047,6 +1047,7 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_ail); INIT_LIST_HEAD(&item->li_cil); + INIT_LIST_HEAD(&item->li_bio_list); } /* @@ -1242,7 +1243,7 @@ xlog_space_left( static void xlog_iodone(xfs_buf_t *bp) { - struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog_in_core *iclog = bp->b_log_item; struct xlog *l = iclog->ic_log; int aborted = 0; @@ -1773,7 +1774,7 @@ STATIC int xlog_bdstrat( struct xfs_buf *bp) { - struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog_in_core *iclog = bp->b_log_item; xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { @@ -1919,7 +1920,7 @@ xlog_sync( } bp->b_io_length = BTOBB(count); - bp->b_fspriv = iclog; + bp->b_log_item = iclog; bp->b_flags &= ~XBF_FLUSH; bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); @@ -1958,7 +1959,7 @@ xlog_sync( XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); - bp->b_fspriv = iclog; + bp->b_log_item = iclog; bp->b_flags &= ~XBF_FLUSH; bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); @@ -2117,7 +2118,9 @@ xlog_print_trans( /* dump core transaction and ticket info */ xfs_warn(mp, "transaction summary:"); - xfs_warn(mp, " flags = 0x%x", tp->t_flags); + xfs_warn(mp, " log res = %d", tp->t_log_res); + xfs_warn(mp, " log count = %d", tp->t_log_count); + xfs_warn(mp, " flags = 0x%x", tp->t_flags); xlog_print_tic_res(mp, tp->t_ticket); @@ -2242,7 +2245,7 @@ xlog_write_setup_ophdr( break; default: xfs_warn(log->l_mp, - "Bad XFS transaction clientid 0x%x in ticket 0x%p", + "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, ophdr->oh_clientid, ticket); return NULL; } @@ -3924,7 +3927,7 @@ xlog_verify_iclog( } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) xfs_warn(log->l_mp, - "%s: invalid clientid %d op 0x%p offset 0x%lx", + "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", __func__, clientid, ophead, (unsigned long)field_offset); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 28d1abfe835e..00240c9ee72e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -400,9 +400,9 @@ xlog_recover_iodone( * On v5 supers, a bli could be attached to update the metadata LSN. * Clean it up. */ - if (bp->b_fspriv) + if (bp->b_log_item) xfs_buf_item_relse(bp); - ASSERT(bp->b_fspriv == NULL); + ASSERT(bp->b_log_item == NULL); bp->b_iodone = NULL; xfs_buf_ioend(bp); @@ -2218,7 +2218,7 @@ xlog_recover_do_inode_buffer( next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { xfs_alert(mp, - "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); XFS_ERROR_REPORT("xlog_recover_do_inode_buf", @@ -2630,7 +2630,7 @@ xlog_recover_validate_buf_type( ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); bp->b_iodone = xlog_recover_iodone; xfs_buf_item_init(bp, mp); - bip = bp->b_fspriv; + bip = bp->b_log_item; bip->bli_item.li_lsn = current_lsn; } } @@ -2652,7 +2652,7 @@ xlog_recover_do_reg_buffer( int i; int bit; int nbits; - int error; + xfs_failaddr_t fa; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); @@ -2687,7 +2687,7 @@ xlog_recover_do_reg_buffer( * the first dquot in the buffer should do. XXXThis is * probably a good thing to do for other buf types also. */ - error = 0; + fa = NULL; if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { @@ -2701,11 +2701,14 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, - -1, 0, XFS_QMOPT_DOWARN, - "dquot_buf_recover"); - if (error) + fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, + -1, 0, 0); + if (fa) { + xfs_alert(mp, + "dquot corrupt at %pS trying to replay into block 0x%llx", + fa, bp->b_bn); goto next; + } } memcpy(xfs_buf_offset(bp, @@ -2957,6 +2960,10 @@ xfs_recover_inode_owner_change( if (error) goto out_free_ip; + if (!xfs_inode_verify_forks(ip)) { + error = -EFSCORRUPTED; + goto out_free_ip; + } if (in_f->ilf_fields & XFS_ILOG_DOWNER) { ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); @@ -3042,7 +3049,7 @@ xlog_recover_inode_pass2( */ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { xfs_alert(mp, - "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); @@ -3052,7 +3059,7 @@ xlog_recover_inode_pass2( ldip = item->ri_buf[1].i_addr; if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, - "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); @@ -3110,8 +3117,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr 0x%p, " - "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + "%s: Bad regular inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -3123,8 +3130,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr 0x%p, " - "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + "%s: Bad dir inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -3134,8 +3141,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " - "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", __func__, item, dip, bp, in_f->ilf_ino, ldip->di_nextents + ldip->di_anextents, ldip->di_nblocks); @@ -3146,8 +3153,8 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " - "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; @@ -3157,7 +3164,7 @@ xlog_recover_inode_pass2( XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr 0x%p", + "%s: Bad inode log record length %d, rec ptr "PTR_FMT, __func__, item->ri_buf[1].i_len, item); error = -EFSCORRUPTED; goto out_release; @@ -3303,6 +3310,7 @@ xlog_recover_dquot_pass2( xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; + xfs_failaddr_t fa; int error; xfs_dq_logformat_t *dq_f; uint type; @@ -3345,10 +3353,12 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_dquot_pass2 (log copy)"); - if (error) + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", + dq_f->qlf_id, fa); return -EIO; + } ASSERT(dq_f->qlf_len == 1); /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c879b517cc94..98fd41cbb9e1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -162,6 +162,7 @@ xfs_free_perag( ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); xfs_buf_hash_destroy(pag); + mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -248,6 +249,7 @@ xfs_initialize_perag( out_hash_destroy: xfs_buf_hash_destroy(pag); out_free_pag: + mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ @@ -256,6 +258,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ec952dfad359..5b848f4b637f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -48,7 +48,7 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); - +STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it @@ -162,7 +162,7 @@ xfs_qm_dqpurge( */ error = xfs_qm_dqflush(dqp, &bp); if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", + xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed", __func__, dqp); } else { error = xfs_bwrite(bp); @@ -291,8 +291,7 @@ xfs_qm_dqattach_one( * exist on disk and we didn't ask it to allocate; ESRCH if quotas got * turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, - doalloc | XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp); if (error) return error; @@ -481,7 +480,7 @@ xfs_qm_dquot_isolate( error = xfs_qm_dqflush(dqp, &bp); if (error) { - xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed", __func__, dqp); goto out_unlock_dirty; } @@ -574,7 +573,7 @@ xfs_qm_set_defquota( struct xfs_def_quota *defq; int error; - error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqread(mp, 0, type, 0, &dqp); if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; @@ -652,7 +651,7 @@ xfs_qm_init_quotainfo( XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : XFS_DQ_PROJ), - XFS_QMOPT_DOWARN, &dqp); + 0, &dqp); if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; @@ -695,9 +694,17 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&qinf->qi_shrinker); + + error = register_shrinker(&qinf->qi_shrinker); + if (error) + goto out_free_inos; + return 0; +out_free_inos: + mutex_destroy(&qinf->qi_quotaofflock); + mutex_destroy(&qinf->qi_tree_lock); + xfs_qm_destroy_quotainos(qinf); out_free_lru: list_lru_destroy(&qinf->qi_lru); out_free_qinf: @@ -706,7 +713,6 @@ out_free_qinf: return error; } - /* * Gets called when unmounting a filesystem or when all quotas get * turned off. @@ -723,19 +729,8 @@ xfs_qm_destroy_quotainfo( unregister_shrinker(&qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); - - if (qi->qi_uquotaip) { - IRELE(qi->qi_uquotaip); - qi->qi_uquotaip = NULL; /* paranoia */ - } - if (qi->qi_gquotaip) { - IRELE(qi->qi_gquotaip); - qi->qi_gquotaip = NULL; - } - if (qi->qi_pquotaip) { - IRELE(qi->qi_pquotaip); - qi->qi_pquotaip = NULL; - } + xfs_qm_destroy_quotainos(qi); + mutex_destroy(&qi->qi_tree_lock); mutex_destroy(&qi->qi_quotaofflock); kmem_free(qi); mp->m_quotainfo = NULL; @@ -847,6 +842,7 @@ xfs_qm_reset_dqcounts( { struct xfs_dqblk *dqb; int j; + xfs_failaddr_t fa; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -868,10 +864,13 @@ xfs_qm_reset_dqcounts( /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to - * find uninitialised dquot blks. See comment in xfs_dqcheck. + * find uninitialised dquot blks. See comment in + * xfs_dquot_verify. */ - xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, - "xfs_quotacheck"); + fa = xfs_dquot_verify(mp, ddq, id + j, type, 0); + if (fa) + xfs_dquot_repair(mp, ddq, id + j, type); + /* * Reset type in case we are reusing group quota file for * project quotas or vice versa @@ -1078,8 +1077,7 @@ xfs_qm_quotacheck_dqadjust( struct xfs_dquot *dqp; int error; - error = xfs_qm_dqget(mp, ip, id, type, - XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp); if (error) { /* * Shouldn't be able to turn off quotas here. @@ -1600,6 +1598,24 @@ error_rele: } STATIC void +xfs_qm_destroy_quotainos( + xfs_quotainfo_t *qi) +{ + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + IRELE(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } +} + +STATIC void xfs_qm_dqfree_one( struct xfs_dquot *dqp) { @@ -1682,8 +1698,7 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, NULL, uid, XFS_DQ_USER, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, + XFS_QMOPT_DQALLOC, &uq); if (error) { ASSERT(error != -ENOENT); @@ -1709,8 +1724,7 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, NULL, gid, XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, + XFS_QMOPT_DQALLOC, &gq); if (error) { ASSERT(error != -ENOENT); @@ -1729,8 +1743,7 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, + XFS_QMOPT_DQALLOC, &pq); if (error) { ASSERT(error != -ENOENT); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cf7c8f81bebb..270246943a06 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -454,6 +454,8 @@ retry: if (error) goto out_bmap_cancel; + xfs_inode_set_cowblocks_tag(ip); + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops); if (error) @@ -462,6 +464,13 @@ retry: error = xfs_trans_commit(tp); if (error) return error; + + /* + * Allocation succeeded but the requested range was not even partially + * satisfied? Bail out! + */ + if (nimaps == 0) + return -ENOSPC; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb, &dfops); @@ -490,8 +499,9 @@ xfs_reflink_find_cow_mapping( struct xfs_iext_cursor icur; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!xfs_is_reflink_inode(ip)) + return false; offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return false; @@ -596,10 +606,6 @@ xfs_reflink_cancel_cow_blocks( del.br_startblock, del.br_blockcount, NULL); - /* Update quota accounting */ - xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, - -(long)del.br_blockcount); - /* Roll the transaction */ xfs_defer_ijoin(&dfops, ip); error = xfs_defer_finish(tpp, &dfops); @@ -610,6 +616,16 @@ xfs_reflink_cancel_cow_blocks( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); + + /* Remove the quota reservation */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -(long)del.br_blockcount, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + break; + } else { + /* Didn't do anything, push cursor back. */ + xfs_iext_prev(ifp, &icur); } next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) @@ -725,7 +741,7 @@ xfs_reflink_end_cow( (unsigned int)(end_fsb - offset_fsb), XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, 0, &tp); + resblks, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; @@ -789,6 +805,10 @@ xfs_reflink_end_cow( if (error) goto out_defer; + /* Charge this new data fork mapping to the on-disk quota. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, + (long)del.br_blockcount); + /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); @@ -938,7 +958,7 @@ xfs_reflink_set_inode_flag( if (src->i_ino == dest->i_ino) xfs_ilock(src, XFS_ILOCK_EXCL); else - xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); + xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL); if (!xfs_is_reflink_inode(src)) { trace_xfs_reflink_set_inode_flag(src); @@ -1196,13 +1216,16 @@ xfs_reflink_remap_blocks( /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ while (len) { + uint lock_mode; + trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, dest, destoff); + /* Read extent from the source file */ nimaps = 1; - xfs_ilock(src, XFS_ILOCK_EXCL); + lock_mode = xfs_ilock_data_map_shared(src); error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); - xfs_iunlock(src, XFS_ILOCK_EXCL); + xfs_iunlock(src, lock_mode); if (error) goto err; ASSERT(nimaps == 1); @@ -1239,6 +1262,50 @@ err: } /* + * Grab the exclusive iolock for a data copy from src to dest, making + * sure to abide vfs locking order (lowest pointer value goes first) and + * breaking the pnfs layout leases on dest before proceeding. The loop + * is needed because we cannot call the blocking break_layout() with the + * src iolock held, and therefore have to back out both locks. + */ +static int +xfs_iolock_two_inodes_and_break_layout( + struct inode *src, + struct inode *dest) +{ + int error; + +retry: + if (src < dest) { + inode_lock_shared(src); + inode_lock_nested(dest, I_MUTEX_NONDIR2); + } else { + /* src >= dest */ + inode_lock(dest); + } + + error = break_layout(dest, false); + if (error == -EWOULDBLOCK) { + inode_unlock(dest); + if (src < dest) + inode_unlock_shared(src); + error = break_layout(dest, true); + if (error) + return error; + goto retry; + } + if (error) { + inode_unlock(dest); + if (src < dest) + inode_unlock_shared(src); + return error; + } + if (src > dest) + inode_lock_shared_nested(src, I_MUTEX_NONDIR2); + return 0; +} + +/* * Link a range of blocks from one file to another. */ int @@ -1268,11 +1335,14 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - lock_two_nondirectories(inode_in, inode_out); + ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); + if (ret) + return ret; if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); else - xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); + xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest, + XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; @@ -1289,8 +1359,24 @@ xfs_reflink_remap_range( if (ret <= 0) goto out_unlock; + /* Attach dquots to dest inode before changing block map */ + ret = xfs_qm_dqattach(dest, 0); + if (ret) + goto out_unlock; + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + /* + * Clear out post-eof preallocations because we don't have page cache + * backing the delayed allocations and they'll never get freed on + * their own. + */ + if (xfs_can_free_eofblocks(dest, true)) { + ret = xfs_free_eofblocks(dest); + if (ret) + goto out_unlock; + } + /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) @@ -1324,10 +1410,12 @@ xfs_reflink_remap_range( is_dedupe); out_unlock: - xfs_iunlock(src, XFS_MMAPLOCK_EXCL); + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + inode_unlock(inode_out); if (!same_inode) - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - unlock_two_nondirectories(inode_in, inode_out); + inode_unlock_shared(inode_in); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 3f30f846d7f2..dfee3c991155 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -139,6 +139,9 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); +int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, + bool *is_free); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) @@ -148,6 +151,7 @@ bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); # define xfs_rtalloc_query_all(t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) +# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5122d3021117..7aba628dc527 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1153,6 +1153,14 @@ xfs_fs_statfs( ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); + + if (XFS_IS_REALTIME_MOUNT(mp) && + (ip->i_d.di_flags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + statp->f_blocks = sbp->sb_rblocks; + statp->f_bavail = statp->f_bfree = + sbp->sb_frextents * sbp->sb_rextsize; + } + return 0; } @@ -1360,6 +1368,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } + xfs_queue_cowblocks(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1369,6 +1378,14 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { + /* Get rid of any leftover CoW reservations... */ + cancel_delayed_work_sync(&mp->m_cowblocks_work); + error = xfs_icache_free_cowblocks(mp, NULL); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { @@ -1649,9 +1666,12 @@ xfs_fs_fill_super( "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_sb_version_hasreflink(&mp->m_sb)) { xfs_alert(mp, - "DAX and reflink have not been tested together!"); + "DAX and reflink cannot be used together!"); + error = -EINVAL; + goto out_filestream_unmount; + } } if (mp->m_flags & XFS_MOUNT_DISCARD) { @@ -1664,20 +1684,19 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - if (mp->m_sb.sb_rblocks) { - xfs_alert(mp, - "EXPERIMENTAL reverse mapping btree not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; - } + if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { xfs_alert(mp, - "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); + "reflink not compatible with realtime device!"); + error = -EINVAL; + goto out_filestream_unmount; } - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { xfs_alert(mp, - "EXPERIMENTAL reflink feature enabled. Use at your own risk!"); + "reverse mapping btree not compatible with realtime device!"); + error = -EINVAL; + goto out_filestream_unmount; + } error = xfs_mountfs(mp); if (error) diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index fcc5dfc70aa0..8cee8e8050e3 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -44,6 +44,12 @@ extern void xfs_qm_exit(void); # define XFS_REALTIME_STRING #endif +#ifdef CONFIG_XFS_ONLINE_SCRUB +# define XFS_SCRUB_STRING "scrub, " +#else +# define XFS_SCRUB_STRING +#endif + #ifdef DEBUG # define XFS_DBG_STRING "debug" #else @@ -54,6 +60,7 @@ extern void xfs_qm_exit(void); #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ + XFS_SCRUB_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d718a10c2271..945de08af7ba 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, __entry->flags = ctx->flags; ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist 0x%p size %u count %u firstu %u flags %d %s", + "alist %p size %u count %u firstu %u flags %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->hashval, @@ -119,7 +119,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %ps", + TP_printk("dev %d:%d agno %u refcount %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -200,7 +200,7 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->bt_before = be32_to_cpu(btree->before); ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist 0x%p size %u count %u firstu %u flags %d %s " + "alist %p size %u count %u firstu %u flags %d %s " "node hashval %u, node before %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -251,8 +251,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->bmap_state = state; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx state %s cur 0x%p/%d " - "offset %lld block %lld count %lld flag %d caller %ps", + TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d " + "offset %lld block %lld count %lld flag %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -301,7 +301,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " - "lock %d flags %s caller %ps", + "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -370,7 +370,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d flags %s caller %ps", + "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -390,7 +390,7 @@ DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); TRACE_EVENT(xfs_buf_ioerror, - TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip), TP_ARGS(bp, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) @@ -401,7 +401,7 @@ TRACE_EVENT(xfs_buf_ioerror, __field(int, pincount) __field(unsigned, lockval) __field(int, error) - __field(unsigned long, caller_ip) + __field(xfs_failaddr_t, caller_ip) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; @@ -415,7 +415,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d error %d flags %s caller %ps", + "lock %d error %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -460,7 +460,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " - "lidesc 0x%p liflags %s", + "lidesc %p liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->buf_bno, __entry->buf_len, @@ -579,7 +579,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class, __entry->lock_flags = lock_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), @@ -697,7 +697,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, @@ -1028,7 +1028,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __entry->flags = lip->li_flags; __entry->lsn = lip->li_lsn; ), - TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + TP_printk("dev %d:%d lip %p lsn %d/%d type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lip, CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), @@ -1049,7 +1049,7 @@ TRACE_EVENT(xfs_log_force, __entry->lsn = lsn; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d lsn 0x%llx caller %ps", + TP_printk("dev %d:%d lsn 0x%llx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lsn, (void *)__entry->caller_ip) ) @@ -1082,7 +1082,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __entry->old_lsn = old_lsn; __entry->new_lsn = new_lsn; ), - TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s", + TP_printk("dev %d:%d lip %p old lsn %d/%d new lsn %d/%d type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lip, CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), @@ -1403,7 +1403,7 @@ TRACE_EVENT(xfs_bunmap, __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" - "flags %s caller %ps", + "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1517,7 +1517,7 @@ TRACE_EVENT(xfs_agf, ), TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " - "freeblks %u longest %u caller %ps", + "freeblks %u longest %u caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), @@ -2014,7 +2014,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __entry->count = item->ri_cnt; __entry->total = item->ri_total; ), - TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, " + TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item %p, " "item type %s item region count/total %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, @@ -2486,7 +2486,7 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u error %d caller %ps", + TP_printk("dev %d:%d agno %u error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->error, @@ -2977,7 +2977,7 @@ DECLARE_EVENT_CLASS(xfs_inode_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino %llx error %d caller %ps", + TP_printk("dev %d:%d ino %llx error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->error, @@ -3313,6 +3313,32 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); +TRACE_EVENT(xfs_trans_resv_calc, + TP_PROTO(struct xfs_mount *mp, unsigned int type, + struct xfs_trans_res *res), + TP_ARGS(mp, type, res), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(uint, logres) + __field(int, logcount) + __field(int, logflags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->type = type; + __entry->logres = res->tr_logres; + __entry->logcount = res->tr_logcount; + __entry->logflags = res->tr_logflags; + ), + TP_printk("dev %d:%d type %d logres %u logcount %d flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->logres, + __entry->logcount, + __entry->logflags) +); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index a87f657f59c9..86f92df32c42 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -35,6 +35,27 @@ kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; +#if defined(CONFIG_TRACEPOINTS) +static void +xfs_trans_trace_reservations( + struct xfs_mount *mp) +{ + struct xfs_trans_res resv; + struct xfs_trans_res *res; + struct xfs_trans_res *end_res; + int i; + + res = (struct xfs_trans_res *)M_RES(mp); + end_res = (struct xfs_trans_res *)(M_RES(mp) + 1); + for (i = 0; res < end_res; i++, res++) + trace_xfs_trans_resv_calc(mp, i, res); + xfs_log_get_max_trans_res(mp, &resv); + trace_xfs_trans_resv_calc(mp, -1, &resv); +} +#else +# define xfs_trans_trace_reservations(mp) +#endif + /* * Initialize the precomputed transaction reservation values * in the mount structure. @@ -44,6 +65,7 @@ xfs_trans_init( struct xfs_mount *mp) { xfs_trans_resv_calc(mp, M_RES(mp)); + xfs_trans_trace_reservations(mp); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 815b53d20e26..9d542dfe0052 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -50,7 +50,7 @@ typedef struct xfs_log_item { uint li_type; /* item type */ uint li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ - struct xfs_log_item *li_bio_list; /* buffer item list */ + struct list_head li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); /* buffer item iodone */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3ba7a96a8abd..653ce379d36b 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -82,12 +82,12 @@ _xfs_trans_bjoin( ASSERT(bp->b_transp == NULL); /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * The xfs_buf_log_item pointer is stored in b_log_item. If * it doesn't have one yet, then allocate one and initialize it. * The checks to see if one is there are in xfs_buf_item_init(). */ xfs_buf_item_init(bp, tp->t_mountp); - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); @@ -118,7 +118,7 @@ xfs_trans_bjoin( struct xfs_buf *bp) { _xfs_trans_bjoin(tp, bp, 0); - trace_xfs_trans_bjoin(bp->b_fspriv); + trace_xfs_trans_bjoin(bp->b_log_item); } /* @@ -139,7 +139,7 @@ xfs_trans_get_buf_map( xfs_buf_flags_t flags) { xfs_buf_t *bp; - xfs_buf_log_item_t *bip; + struct xfs_buf_log_item *bip; if (!tp) return xfs_buf_get_map(target, map, nmaps, flags); @@ -159,7 +159,7 @@ xfs_trans_get_buf_map( } ASSERT(bp->b_transp == tp); - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; @@ -175,7 +175,7 @@ xfs_trans_get_buf_map( ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_get_buf(bp->b_fspriv); + trace_xfs_trans_get_buf(bp->b_log_item); return bp; } @@ -188,12 +188,13 @@ xfs_trans_get_buf_map( * mount structure. */ xfs_buf_t * -xfs_trans_getsb(xfs_trans_t *tp, - struct xfs_mount *mp, - int flags) +xfs_trans_getsb( + xfs_trans_t *tp, + struct xfs_mount *mp, + int flags) { xfs_buf_t *bp; - xfs_buf_log_item_t *bip; + struct xfs_buf_log_item *bip; /* * Default to just trying to lock the superblock buffer @@ -210,7 +211,7 @@ xfs_trans_getsb(xfs_trans_t *tp, */ bp = mp->m_sb_bp; if (bp->b_transp == tp) { - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; @@ -223,7 +224,7 @@ xfs_trans_getsb(xfs_trans_t *tp, return NULL; _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_getsb(bp->b_fspriv); + trace_xfs_trans_getsb(bp->b_log_item); return bp; } @@ -266,7 +267,7 @@ xfs_trans_read_buf_map( if (bp) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); - ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_log_item != NULL); ASSERT(!bp->b_error); ASSERT(bp->b_flags & XBF_DONE); @@ -279,7 +280,7 @@ xfs_trans_read_buf_map( return -EIO; } - bip = bp->b_fspriv; + bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -329,7 +330,7 @@ xfs_trans_read_buf_map( if (tp) { _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_read_buf(bp->b_fspriv); + trace_xfs_trans_read_buf(bp->b_log_item); } *bpp = bp; return 0; @@ -352,10 +353,11 @@ xfs_trans_read_buf_map( * brelse() call. */ void -xfs_trans_brelse(xfs_trans_t *tp, - xfs_buf_t *bp) +xfs_trans_brelse( + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + struct xfs_buf_log_item *bip; int freed; /* @@ -368,7 +370,7 @@ xfs_trans_brelse(xfs_trans_t *tp, } ASSERT(bp->b_transp == tp); - bip = bp->b_fspriv; + bip = bp->b_log_item; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); @@ -456,10 +458,11 @@ xfs_trans_brelse(xfs_trans_t *tp, */ /* ARGSUSED */ void -xfs_trans_bhold(xfs_trans_t *tp, - xfs_buf_t *bp) +xfs_trans_bhold( + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -476,10 +479,11 @@ xfs_trans_bhold(xfs_trans_t *tp, * for this transaction. */ void -xfs_trans_bhold_release(xfs_trans_t *tp, - xfs_buf_t *bp) +xfs_trans_bhold_release( + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -500,7 +504,7 @@ xfs_trans_dirty_buf( struct xfs_trans *tp, struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -557,7 +561,7 @@ xfs_trans_log_buf( uint first, uint last) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); @@ -600,10 +604,10 @@ xfs_trans_log_buf( */ void xfs_trans_binval( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; int i; ASSERT(bp->b_transp == tp); @@ -655,10 +659,10 @@ xfs_trans_binval( */ void xfs_trans_inode_buf( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -679,10 +683,10 @@ xfs_trans_inode_buf( */ void xfs_trans_stale_inode_buf( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -704,10 +708,10 @@ xfs_trans_stale_inode_buf( /* ARGSUSED */ void xfs_trans_inode_alloc_buf( - xfs_trans_t *tp, - xfs_buf_t *bp) + xfs_trans_t *tp, + xfs_buf_t *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -729,7 +733,7 @@ xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -759,7 +763,7 @@ xfs_trans_buf_set_type( struct xfs_buf *bp, enum xfs_blft type) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; if (!tp) return; @@ -776,8 +780,8 @@ xfs_trans_buf_copy_type( struct xfs_buf *dst_bp, struct xfs_buf *src_bp) { - struct xfs_buf_log_item *sbip = src_bp->b_fspriv; - struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + struct xfs_buf_log_item *sbip = src_bp->b_log_item; + struct xfs_buf_log_item *dbip = dst_bp->b_log_item; enum xfs_blft type; type = xfs_blft_from_flags(&sbip->__bli_format); @@ -797,11 +801,11 @@ xfs_trans_buf_copy_type( /* ARGSUSED */ void xfs_trans_dquot_buf( - xfs_trans_t *tp, - xfs_buf_t *bp, - uint type) + xfs_trans_t *tp, + xfs_buf_t *bp, + uint type) { - struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index daa7615497f9..4a89da4b6fe7 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -28,6 +28,8 @@ #include "xfs_inode_item.h" #include "xfs_trace.h" +#include <linux/iversion.h> + /* * Add a locked inode to the transaction. * @@ -110,15 +112,17 @@ xfs_trans_log_inode( /* * First time we log the inode in a transaction, bump the inode change - * counter if it is configured for this to occur. We don't use - * inode_inc_version() because there is no need for extra locking around - * i_version as we already hold the inode locked exclusively for - * metadata modification. + * counter if it is configured for this to occur. While we have the + * inode locked exclusively for metadata modification, we can usually + * avoid setting XFS_ILOG_CORE if no one has queried the value since + * the last time it was incremented. If we have XFS_ILOG_CORE already + * set however, then go ahead and bump the i_version counter + * unconditionally. */ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && IS_I_VERSION(VFS_I(ip))) { - VFS_I(ip)->i_version++; - flags |= XFS_ILOG_CORE; + if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) + flags |= XFS_ILOG_CORE; } tp->t_flags |= XFS_TRANS_DIRTY; |