Diffstat (limited to 'fs'): 364 files changed, 16280 insertions, 7727 deletions
diff --git a/fs/Makefile b/fs/Makefile index 1148c555c4d3..98be354fdb61 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -37,7 +37,7 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index b7e844d2f321..699c4fa8b78b 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -26,14 +26,13 @@ static inline u16 adfs_filetype(u32 loadaddr) #define ADFS_NDA_PUBLIC_READ (1 << 5) #define ADFS_NDA_PUBLIC_WRITE (1 << 6) -#include "dir_f.h" - /* * adfs file system inode data in memory */ struct adfs_inode_info { loff_t mmu_private; __u32 parent_id; /* parent indirect disc address */ + __u32 indaddr; /* object indirect disc address */ __u32 loadaddr; /* RISC OS load address */ __u32 execaddr; /* RISC OS exec address */ unsigned int attr; /* RISC OS permissions */ @@ -93,15 +92,19 @@ struct adfs_dir { int nr_buffers; struct buffer_head *bh[4]; - - /* big directories need allocated buffers */ - struct buffer_head **bh_fplus; + struct buffer_head **bhs; unsigned int pos; __u32 parent_id; - struct adfs_dirheader dirhead; - union adfs_dirtail dirtail; + union { + struct adfs_dirheader *dirhead; + struct adfs_bigdirheader *bighead; + }; + union { + struct adfs_newdirtail *newtail; + struct adfs_bigdirtail *bigtail; + }; }; /* @@ -122,13 +125,13 @@ struct object_info { struct adfs_dir_ops { int (*read)(struct super_block *sb, unsigned int indaddr, unsigned int size, struct adfs_dir *dir); + int (*iterate)(struct adfs_dir *dir, struct dir_context *ctx); int (*setpos)(struct adfs_dir *dir, unsigned int fpos); int (*getnext)(struct adfs_dir *dir, struct object_info *obj); int (*update)(struct adfs_dir *dir, struct object_info *obj); int (*create)(struct adfs_dir *dir, struct object_info *obj); int (*remove)(struct adfs_dir *dir, struct object_info *obj); - int (*sync)(struct adfs_dir *dir); - void (*free)(struct adfs_dir *dir); + int (*commit)(struct adfs_dir *dir); }; struct adfs_discmap { @@ -145,7 +148,9 @@ int adfs_notify_change(struct dentry *dentry, struct iattr *attr); /* map.c */ int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset); -extern unsigned int adfs_map_free(struct super_block *sb); +void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf); +struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr); +void adfs_free_map(struct super_block *sb); /* Misc */ __printf(3, 4) @@ -167,6 +172,13 @@ extern const struct dentry_operations adfs_dentry_operations; extern const struct adfs_dir_ops adfs_f_dir_ops; extern const struct adfs_dir_ops adfs_fplus_dir_ops; +int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset, + size_t len); +int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src, + size_t len); +void adfs_dir_relse(struct adfs_dir *dir); +int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir); void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj); extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait); diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index a54c53244992..77fbd196008f 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -6,12 +6,196 @@ * * Common 
directory handling for ADFS */ +#include <linux/slab.h> #include "adfs.h" /* * For future. This should probably be per-directory. */ -static DEFINE_RWLOCK(adfs_dir_lock); +static DECLARE_RWSEM(adfs_dir_rwsem); + +int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset, + size_t len) +{ + struct super_block *sb = dir->sb; + unsigned int index, remain; + + index = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + remain = sb->s_blocksize - offset; + if (index + (remain < len) >= dir->nr_buffers) + return -EINVAL; + + if (remain < len) { + memcpy(dst, dir->bhs[index]->b_data + offset, remain); + dst += remain; + len -= remain; + index += 1; + offset = 0; + } + + memcpy(dst, dir->bhs[index]->b_data + offset, len); + + return 0; +} + +int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src, + size_t len) +{ + struct super_block *sb = dir->sb; + unsigned int index, remain; + + index = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + remain = sb->s_blocksize - offset; + if (index + (remain < len) >= dir->nr_buffers) + return -EINVAL; + + if (remain < len) { + memcpy(dir->bhs[index]->b_data + offset, src, remain); + src += remain; + len -= remain; + index += 1; + offset = 0; + } + + memcpy(dir->bhs[index]->b_data + offset, src, len); + + return 0; +} + +static void __adfs_dir_cleanup(struct adfs_dir *dir) +{ + dir->nr_buffers = 0; + + if (dir->bhs != dir->bh) + kfree(dir->bhs); + dir->bhs = NULL; + dir->sb = NULL; +} + +void adfs_dir_relse(struct adfs_dir *dir) +{ + unsigned int i; + + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bhs[i]); + + __adfs_dir_cleanup(dir); +} + +static void adfs_dir_forget(struct adfs_dir *dir) +{ + unsigned int i; + + for (i = 0; i < dir->nr_buffers; i++) + bforget(dir->bhs[i]); + + __adfs_dir_cleanup(dir); +} + +int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + struct buffer_head **bhs; + unsigned int i, num; + int block; + + num = ALIGN(size, sb->s_blocksize) >> sb->s_blocksize_bits; + if (num > ARRAY_SIZE(dir->bh)) { + /* We only allow one extension */ + if (dir->bhs != dir->bh) + return -EINVAL; + + bhs = kcalloc(num, sizeof(*bhs), GFP_KERNEL); + if (!bhs) + return -ENOMEM; + + if (dir->nr_buffers) + memcpy(bhs, dir->bhs, dir->nr_buffers * sizeof(*bhs)); + + dir->bhs = bhs; + } + + for (i = dir->nr_buffers; i < num; i++) { + block = __adfs_block_map(sb, indaddr, i); + if (!block) { + adfs_error(sb, "dir %06x has a hole at offset %u", + indaddr, i); + goto error; + } + + dir->bhs[i] = sb_bread(sb, block); + if (!dir->bhs[i]) { + adfs_error(sb, + "dir %06x failed read at offset %u, mapped block 0x%08x", + indaddr, i, block); + goto error; + } + + dir->nr_buffers++; + } + return 0; + +error: + adfs_dir_relse(dir); + + return -EIO; +} + +static int adfs_dir_read(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + dir->sb = sb; + dir->bhs = dir->bh; + dir->nr_buffers = 0; + + return ADFS_SB(sb)->s_dir->read(sb, indaddr, size, dir); +} + +static int adfs_dir_read_inode(struct super_block *sb, struct inode *inode, + struct adfs_dir *dir) +{ + int ret; + + ret = adfs_dir_read(sb, ADFS_I(inode)->indaddr, inode->i_size, dir); + if (ret) + return ret; + + if (ADFS_I(inode)->parent_id != dir->parent_id) { + adfs_error(sb, + "parent directory id changed under me! 
(%06x but got %06x)\n", + ADFS_I(inode)->parent_id, dir->parent_id); + adfs_dir_relse(dir); + ret = -EIO; + } + + return ret; +} + +static void adfs_dir_mark_dirty(struct adfs_dir *dir) +{ + unsigned int i; + + /* Mark the buffers dirty */ + for (i = 0; i < dir->nr_buffers; i++) + mark_buffer_dirty(dir->bhs[i]); +} + +static int adfs_dir_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bhs[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj) { @@ -51,87 +235,90 @@ void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj) } } -static int -adfs_readdir(struct file *file, struct dir_context *ctx) +static int adfs_iterate(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; - struct object_info obj; struct adfs_dir dir; - int ret = 0; - - if (ctx->pos >> 32) - return 0; + int ret; - ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + down_read(&adfs_dir_rwsem); + ret = adfs_dir_read_inode(sb, inode, &dir); if (ret) - return ret; + goto unlock; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) - goto free_out; + goto unlock_relse; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR)) - goto free_out; + goto unlock_relse; ctx->pos = 2; } - read_lock(&adfs_dir_lock); + ret = ops->iterate(&dir, ctx); - ret = ops->setpos(&dir, ctx->pos - 2); - if (ret) - goto unlock_out; - while (ops->getnext(&dir, &obj) == 0) { - if (!dir_emit(ctx, obj.name, obj.name_len, - obj.indaddr, DT_UNKNOWN)) - break; - ctx->pos++; - } - -unlock_out: - read_unlock(&adfs_dir_lock); +unlock_relse: + up_read(&adfs_dir_rwsem); + adfs_dir_relse(&dir); + return ret; -free_out: - ops->free(&dir); +unlock: + up_read(&adfs_dir_rwsem); return ret; } int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) { - int ret = -EINVAL; -#ifdef CONFIG_ADFS_FS_RW const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct adfs_dir dir; + int ret; - printk(KERN_INFO "adfs_dir_update: object %06x in dir %06x\n", - obj->indaddr, obj->parent_id); + if (!IS_ENABLED(CONFIG_ADFS_FS_RW)) + return -EINVAL; - if (!ops->update) { - ret = -EINVAL; - goto out; - } + if (!ops->update) + return -EINVAL; - ret = ops->read(sb, obj->parent_id, 0, &dir); + down_write(&adfs_dir_rwsem); + ret = adfs_dir_read(sb, obj->parent_id, 0, &dir); if (ret) - goto out; + goto unlock; - write_lock(&adfs_dir_lock); ret = ops->update(&dir, obj); - write_unlock(&adfs_dir_lock); + if (ret) + goto forget; - if (wait) { - int err = ops->sync(&dir); - if (!ret) - ret = err; - } + ret = ops->commit(&dir); + if (ret) + goto forget; + up_write(&adfs_dir_rwsem); + + adfs_dir_mark_dirty(&dir); + + if (wait) + ret = adfs_dir_sync(&dir); + + adfs_dir_relse(&dir); + return ret; + + /* + * If the updated failed because the entry wasn't found, we can + * just release the buffers. If it was any other error, forget + * the dirtied buffers so they aren't written back to the media. 
+ */ +forget: + if (ret == -ENOENT) + adfs_dir_relse(&dir); + else + adfs_dir_forget(&dir); +unlock: + up_write(&adfs_dir_rwsem); - ops->free(&dir); -out: -#endif return ret; } @@ -167,25 +354,14 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr, u32 name_len; int ret; - ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + down_read(&adfs_dir_rwsem); + ret = adfs_dir_read_inode(sb, inode, &dir); if (ret) - goto out; - - if (ADFS_I(inode)->parent_id != dir.parent_id) { - adfs_error(sb, - "parent directory changed under me! (%06x but got %06x)\n", - ADFS_I(inode)->parent_id, dir.parent_id); - ret = -EIO; - goto free_out; - } - - obj->parent_id = inode->i_ino; - - read_lock(&adfs_dir_lock); + goto unlock; ret = ops->setpos(&dir, 0); if (ret) - goto unlock_out; + goto unlock_relse; ret = -ENOENT; name = qstr->name; @@ -196,20 +372,22 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr, break; } } + obj->parent_id = ADFS_I(inode)->indaddr; -unlock_out: - read_unlock(&adfs_dir_lock); +unlock_relse: + up_read(&adfs_dir_rwsem); + adfs_dir_relse(&dir); + return ret; -free_out: - ops->free(&dir); -out: +unlock: + up_read(&adfs_dir_rwsem); return ret; } const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .iterate = adfs_readdir, + .iterate_shared = adfs_iterate, .fsync = generic_file_fsync, }; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index c1a950c7400a..30d526fecc3f 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -9,8 +9,6 @@ #include "adfs.h" #include "dir_f.h" -static void adfs_f_free(struct adfs_dir *dir); - /* * Read an (unaligned) value of length 1..4 bytes */ @@ -60,7 +58,7 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) #define bufoff(_bh,_idx) \ ({ int _buf = _idx >> blocksize_bits; \ int _off = _idx - (_buf << blocksize_bits);\ - (u8 *)(_bh[_buf]->b_data + _off); \ + (void *)(_bh[_buf]->b_data + _off); \ }) /* @@ -123,65 +121,49 @@ adfs_dir_checkbyte(const struct adfs_dir *dir) return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff; } -/* Read and check that a directory is valid */ -static int adfs_dir_read(struct super_block *sb, u32 indaddr, - unsigned int size, struct adfs_dir *dir) +static int adfs_f_validate(struct adfs_dir *dir) { - const unsigned int blocksize_bits = sb->s_blocksize_bits; - int blk = 0; - - /* - * Directories which are not a multiple of 2048 bytes - * are considered bad v2 [3.6] - */ - if (size & 2047) - goto bad_dir; - - size >>= blocksize_bits; - - dir->nr_buffers = 0; - dir->sb = sb; - - for (blk = 0; blk < size; blk++) { - int phys; + struct adfs_dirheader *head = dir->dirhead; + struct adfs_newdirtail *tail = dir->newtail; + + if (head->startmasseq != tail->endmasseq || + tail->dirlastmask || tail->reserved[0] || tail->reserved[1] || + (memcmp(&head->startname, "Nick", 4) && + memcmp(&head->startname, "Hugo", 4)) || + memcmp(&head->startname, &tail->endname, 4) || + adfs_dir_checkbyte(dir) != tail->dircheckbyte) + return -EIO; - phys = __adfs_block_map(sb, indaddr, blk); - if (!phys) { - adfs_error(sb, "dir %06x has a hole at offset %d", - indaddr, blk); - goto release_buffers; - } + return 0; +} - dir->bh[blk] = sb_bread(sb, phys); - if (!dir->bh[blk]) - goto release_buffers; - } +/* Read and check that a directory is valid */ +static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size, + struct adfs_dir *dir) +{ + const unsigned int blocksize_bits 
= sb->s_blocksize_bits; + int ret; - memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); - memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + if (size && size != ADFS_NEWDIR_SIZE) + return -EIO; - if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || - memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) - goto bad_dir; + ret = adfs_dir_read_buffers(sb, indaddr, ADFS_NEWDIR_SIZE, dir); + if (ret) + return ret; - if (memcmp(&dir->dirhead.startname, "Nick", 4) && - memcmp(&dir->dirhead.startname, "Hugo", 4)) - goto bad_dir; + dir->dirhead = bufoff(dir->bh, 0); + dir->newtail = bufoff(dir->bh, 2007); - if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + if (adfs_f_validate(dir)) goto bad_dir; - dir->nr_buffers = blk; + dir->parent_id = adfs_readval(dir->newtail->dirparent, 3); return 0; bad_dir: adfs_error(sb, "dir %06x is corrupted", indaddr); -release_buffers: - for (blk -= 1; blk >= 0; blk -= 1) - brelse(dir->bh[blk]); - - dir->sb = NULL; + adfs_dir_relse(dir); return -EIO; } @@ -232,24 +214,12 @@ adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj) static int __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) { - struct super_block *sb = dir->sb; struct adfs_direntry de; - int thissize, buffer, offset; - - buffer = pos >> sb->s_blocksize_bits; - - if (buffer > dir->nr_buffers) - return -EINVAL; - - offset = pos & (sb->s_blocksize - 1); - thissize = sb->s_blocksize - offset; - if (thissize > 26) - thissize = 26; + int ret; - memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); - if (thissize != 26) - memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, - 26 - thissize); + ret = adfs_dir_copyfrom(&de, dir, pos, 26); + if (ret) + return ret; if (!de.dirobname[0]) return -ENOENT; @@ -260,89 +230,6 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) } static int -__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj) -{ - struct super_block *sb = dir->sb; - struct adfs_direntry de; - int thissize, buffer, offset; - - buffer = pos >> sb->s_blocksize_bits; - - if (buffer > dir->nr_buffers) - return -EINVAL; - - offset = pos & (sb->s_blocksize - 1); - thissize = sb->s_blocksize - offset; - if (thissize > 26) - thissize = 26; - - /* - * Get the entry in total - */ - memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); - if (thissize != 26) - memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, - 26 - thissize); - - /* - * update it - */ - adfs_obj2dir(&de, obj); - - /* - * Put the new entry back - */ - memcpy(dir->bh[buffer]->b_data + offset, &de, thissize); - if (thissize != 26) - memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize, - 26 - thissize); - - return 0; -} - -/* - * the caller is responsible for holding the necessary - * locks. 
- */ -static int adfs_dir_find_entry(struct adfs_dir *dir, u32 indaddr) -{ - int pos, ret; - - ret = -ENOENT; - - for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) { - struct object_info obj; - - if (!__adfs_dir_get(dir, pos, &obj)) - break; - - if (obj.indaddr == indaddr) { - ret = pos; - break; - } - } - - return ret; -} - -static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size, - struct adfs_dir *dir) -{ - int ret; - - if (size != ADFS_NEWDIR_SIZE) - return -EIO; - - ret = adfs_dir_read(sb, indaddr, size, dir); - if (ret) - adfs_error(sb, "unable to read directory"); - else - dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3); - - return ret; -} - -static int adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos) { if (fpos >= ADFS_NUM_DIR_ENTRIES) @@ -364,99 +251,74 @@ adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj) return ret; } -static int -adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +static int adfs_f_iterate(struct adfs_dir *dir, struct dir_context *ctx) { - struct super_block *sb = dir->sb; - int ret, i; + struct object_info obj; + int pos = 5 + (ctx->pos - 2) * 26; - ret = adfs_dir_find_entry(dir, obj->indaddr); - if (ret < 0) { - adfs_error(dir->sb, "unable to locate entry to update"); - goto out; + while (ctx->pos < 2 + ADFS_NUM_DIR_ENTRIES) { + if (__adfs_dir_get(dir, pos, &obj)) + break; + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.indaddr, DT_UNKNOWN)) + break; + pos += 26; + ctx->pos++; } + return 0; +} - __adfs_dir_put(dir, ret, obj); - - /* - * Increment directory sequence number - */ - dir->bh[0]->b_data[0] += 1; - dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1; - - ret = adfs_dir_checkbyte(dir); - /* - * Update directory check byte - */ - dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret; - -#if 1 - { - const unsigned int blocksize_bits = sb->s_blocksize_bits; - - memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); - memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); +static int adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +{ + struct adfs_direntry de; + int offset, ret; - if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || - memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) - goto bad_dir; + offset = 5 - (int)sizeof(de); - if (memcmp(&dir->dirhead.startname, "Nick", 4) && - memcmp(&dir->dirhead.startname, "Hugo", 4)) - goto bad_dir; + do { + offset += sizeof(de); + ret = adfs_dir_copyfrom(&de, dir, offset, sizeof(de)); + if (ret) { + adfs_error(dir->sb, "error reading directory entry"); + return -ENOENT; + } + if (!de.dirobname[0]) { + adfs_error(dir->sb, "unable to locate entry to update"); + return -ENOENT; + } + } while (adfs_readval(de.dirinddiscadd, 3) != obj->indaddr); - if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) - goto bad_dir; - } -#endif - for (i = dir->nr_buffers - 1; i >= 0; i--) - mark_buffer_dirty(dir->bh[i]); + /* Update the directory entry with the new object state */ + adfs_obj2dir(&de, obj); - ret = 0; -out: - return ret; -#if 1 -bad_dir: - adfs_error(dir->sb, "whoops! 
I broke a directory!"); - return -EIO; -#endif + /* Write the directory entry back to the directory */ + return adfs_dir_copyto(dir, offset, &de, 26); } -static int -adfs_f_sync(struct adfs_dir *dir) +static int adfs_f_commit(struct adfs_dir *dir) { - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } + int ret; - return err; -} + /* Increment directory sequence number */ + dir->dirhead->startmasseq += 1; + dir->newtail->endmasseq += 1; -static void -adfs_f_free(struct adfs_dir *dir) -{ - int i; + /* Update directory check byte */ + dir->newtail->dircheckbyte = adfs_dir_checkbyte(dir); - for (i = dir->nr_buffers - 1; i >= 0; i--) { - brelse(dir->bh[i]); - dir->bh[i] = NULL; - } + /* Make sure the directory still validates correctly */ + ret = adfs_f_validate(dir); + if (ret) + adfs_msg(dir->sb, KERN_ERR, "error: update broke directory"); - dir->nr_buffers = 0; - dir->sb = NULL; + return ret; } const struct adfs_dir_ops adfs_f_dir_ops = { .read = adfs_f_read, + .iterate = adfs_f_iterate, .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, .update = adfs_f_update, - .sync = adfs_f_sync, - .free = adfs_f_free + .commit = adfs_f_commit, }; diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h index 5aec332b90f5..a5393e6cf9f4 100644 --- a/fs/adfs/dir_f.h +++ b/fs/adfs/dir_f.h @@ -13,9 +13,9 @@ * Directory header */ struct adfs_dirheader { - unsigned char startmasseq; - unsigned char startname[4]; -}; + __u8 startmasseq; + __u8 startname[4]; +} __attribute__((packed)); #define ADFS_NEWDIR_SIZE 2048 #define ADFS_NUM_DIR_ENTRIES 77 @@ -31,32 +31,36 @@ struct adfs_direntry { __u8 dirlen[4]; __u8 dirinddiscadd[3]; __u8 newdiratts; -}; +} __attribute__((packed)); /* * Directory tail */ +struct adfs_olddirtail { + __u8 dirlastmask; + char dirname[10]; + __u8 dirparent[3]; + char dirtitle[19]; + __u8 reserved[14]; + __u8 endmasseq; + __u8 endname[4]; + __u8 dircheckbyte; +} __attribute__((packed)); + +struct adfs_newdirtail { + __u8 dirlastmask; + __u8 reserved[2]; + __u8 dirparent[3]; + char dirtitle[19]; + char dirname[10]; + __u8 endmasseq; + __u8 endname[4]; + __u8 dircheckbyte; +} __attribute__((packed)); + union adfs_dirtail { - struct { - unsigned char dirlastmask; - char dirname[10]; - unsigned char dirparent[3]; - char dirtitle[19]; - unsigned char reserved[14]; - unsigned char endmasseq; - unsigned char endname[4]; - unsigned char dircheckbyte; - } old; - struct { - unsigned char dirlastmask; - unsigned char reserved[2]; - unsigned char dirparent[3]; - char dirtitle[19]; - char dirname[10]; - unsigned char endmasseq; - unsigned char endname[4]; - unsigned char dircheckbyte; - } new; + struct adfs_olddirtail old; + struct adfs_newdirtail new; }; #endif diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index d56924c11b17..4a15924014da 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -4,123 +4,163 @@ * * Copyright (C) 1997-1999 Russell King */ -#include <linux/slab.h> #include "adfs.h" #include "dir_fplus.h" -static int -adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +/* Return the byte offset to directory entry pos */ +static unsigned int adfs_fplus_offset(const struct adfs_bigdirheader *h, + unsigned int pos) { - struct adfs_bigdirheader *h; - struct adfs_bigdirtail *t; - unsigned long block; - unsigned int blk, size; - int i, ret = -EIO; + return offsetof(struct adfs_bigdirheader, 
bigdirname) + + ALIGN(le32_to_cpu(h->bigdirnamelen), 4) + + pos * sizeof(struct adfs_bigdirentry); +} - dir->nr_buffers = 0; +static int adfs_fplus_validate_header(const struct adfs_bigdirheader *h) +{ + unsigned int size = le32_to_cpu(h->bigdirsize); + unsigned int len; - /* start off using fixed bh set - only alloc for big dirs */ - dir->bh_fplus = &dir->bh[0]; + if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || + h->bigdirversion[2] != 0 || + h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME) || + !size || size & 2047 || size > SZ_4M) + return -EIO; - block = __adfs_block_map(sb, id, 0); - if (!block) { - adfs_error(sb, "dir object %X has a hole at offset 0", id); - goto out; - } + size -= sizeof(struct adfs_bigdirtail) + + offsetof(struct adfs_bigdirheader, bigdirname); - dir->bh_fplus[0] = sb_bread(sb, block); - if (!dir->bh_fplus[0]) - goto out; - dir->nr_buffers += 1; + /* Check that bigdirnamelen fits within the directory */ + len = ALIGN(le32_to_cpu(h->bigdirnamelen), 4); + if (len > size) + return -EIO; - h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data; - size = le32_to_cpu(h->bigdirsize); - if (size != sz) { - adfs_msg(sb, KERN_WARNING, - "directory header size %X does not match directory size %X", - size, sz); + size -= len; + + /* Check that bigdirnamesize fits within the directory */ + len = le32_to_cpu(h->bigdirnamesize); + if (len > size) + return -EIO; + + size -= len; + + /* + * Avoid division, we know that absolute maximum number of entries + * can not be so large to cause overflow of the multiplication below. + */ + len = le32_to_cpu(h->bigdirentries); + if (len > SZ_4M / sizeof(struct adfs_bigdirentry) || + len * sizeof(struct adfs_bigdirentry) > size) + return -EIO; + + return 0; +} + +static int adfs_fplus_validate_tail(const struct adfs_bigdirheader *h, + const struct adfs_bigdirtail *t) +{ + if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || + t->bigdirendmasseq != h->startmasseq || + t->reserved[0] != 0 || t->reserved[1] != 0) + return -EIO; + + return 0; +} + +static u8 adfs_fplus_checkbyte(struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h = dir->bighead; + struct adfs_bigdirtail *t = dir->bigtail; + unsigned int end, bs, bi, i; + __le32 *bp; + u32 dircheck; + + end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)) + + le32_to_cpu(h->bigdirnamesize); + + /* Accumulate the contents of the header, entries and names */ + for (dircheck = 0, bi = 0; end; bi++) { + bp = (void *)dir->bhs[bi]->b_data; + bs = dir->bhs[bi]->b_size; + if (bs > end) + bs = end; + + for (i = 0; i < bs; i += sizeof(u32)) + dircheck = ror32(dircheck, 13) ^ le32_to_cpup(bp++); + + end -= bs; } - if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || - h->bigdirversion[2] != 0 || size & 2047 || - h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) { - adfs_error(sb, "dir %06x has malformed header", id); + /* Accumulate the contents of the tail except for the check byte */ + dircheck = ror32(dircheck, 13) ^ le32_to_cpu(t->bigdirendname); + dircheck = ror32(dircheck, 13) ^ t->bigdirendmasseq; + dircheck = ror32(dircheck, 13) ^ t->reserved[0]; + dircheck = ror32(dircheck, 13) ^ t->reserved[1]; + + return dircheck ^ dircheck >> 8 ^ dircheck >> 16 ^ dircheck >> 24; +} + +static int adfs_fplus_read(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h; + struct adfs_bigdirtail *t; + unsigned int dirsize; + int ret; + + /* Read first buffer */ + ret = adfs_dir_read_buffers(sb, indaddr, sb->s_blocksize, dir); + 
if (ret) + return ret; + + dir->bighead = h = (void *)dir->bhs[0]->b_data; + ret = adfs_fplus_validate_header(h); + if (ret) { + adfs_error(sb, "dir %06x has malformed header", indaddr); goto out; } - size >>= sb->s_blocksize_bits; - if (size > ARRAY_SIZE(dir->bh)) { - /* this directory is too big for fixed bh set, must allocate */ - struct buffer_head **bh_fplus = - kcalloc(size, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!bh_fplus) { - adfs_msg(sb, KERN_ERR, - "not enough memory for dir object %X (%d blocks)", - id, size); - ret = -ENOMEM; - goto out; - } - dir->bh_fplus = bh_fplus; - /* copy over the pointer to the block that we've already read */ - dir->bh_fplus[0] = dir->bh[0]; + dirsize = le32_to_cpu(h->bigdirsize); + if (size && dirsize != size) { + adfs_msg(sb, KERN_WARNING, + "dir %06x header size %X does not match directory size %X", + indaddr, dirsize, size); } - for (blk = 1; blk < size; blk++) { - block = __adfs_block_map(sb, id, blk); - if (!block) { - adfs_error(sb, "dir object %X has a hole at offset %d", id, blk); - goto out; - } + /* Read remaining buffers */ + ret = adfs_dir_read_buffers(sb, indaddr, dirsize, dir); + if (ret) + return ret; - dir->bh_fplus[blk] = sb_bread(sb, block); - if (!dir->bh_fplus[blk]) { - adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX", - id, blk, block); - goto out; - } + dir->bigtail = t = (struct adfs_bigdirtail *) + (dir->bhs[dir->nr_buffers - 1]->b_data + (sb->s_blocksize - 8)); - dir->nr_buffers += 1; + ret = adfs_fplus_validate_tail(h, t); + if (ret) { + adfs_error(sb, "dir %06x has malformed tail", indaddr); + goto out; } - t = (struct adfs_bigdirtail *) - (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8)); - - if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || - t->bigdirendmasseq != h->startmasseq || - t->reserved[0] != 0 || t->reserved[1] != 0) { - adfs_error(sb, "dir %06x has malformed tail", id); + if (adfs_fplus_checkbyte(dir) != t->bigdircheckbyte) { + adfs_error(sb, "dir %06x checkbyte mismatch\n", indaddr); goto out; } dir->parent_id = le32_to_cpu(h->bigdirparent); - dir->sb = sb; return 0; out: - if (dir->bh_fplus) { - for (i = 0; i < dir->nr_buffers; i++) - brelse(dir->bh_fplus[i]); - - if (&dir->bh[0] != dir->bh_fplus) - kfree(dir->bh_fplus); + adfs_dir_relse(dir); - dir->bh_fplus = NULL; - } - - dir->nr_buffers = 0; - dir->sb = NULL; return ret; } static int adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) { - struct adfs_bigdirheader *h = - (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; int ret = -ENOENT; - if (fpos <= le32_to_cpu(h->bigdirentries)) { + if (fpos <= le32_to_cpu(dir->bighead->bigdirentries)) { dir->pos = fpos; ret = 0; } @@ -128,51 +168,23 @@ adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) return ret; } -static void -dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len) -{ - struct super_block *sb = dir->sb; - unsigned int buffer, partial, remainder; - - buffer = offset >> sb->s_blocksize_bits; - offset &= sb->s_blocksize - 1; - - partial = sb->s_blocksize - offset; - - if (partial >= len) - memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len); - else { - char *c = (char *)to; - - remainder = len - partial; - - memcpy(c, - dir->bh_fplus[buffer]->b_data + offset, - partial); - - memcpy(c + partial, - dir->bh_fplus[buffer + 1]->b_data, - remainder); - } -} - static int adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) { - struct adfs_bigdirheader *h = - (struct adfs_bigdirheader *) 
dir->bh_fplus[0]->b_data; + struct adfs_bigdirheader *h = dir->bighead; struct adfs_bigdirentry bde; unsigned int offset; - int ret = -ENOENT; + int ret; if (dir->pos >= le32_to_cpu(h->bigdirentries)) - goto out; + return -ENOENT; - offset = offsetof(struct adfs_bigdirheader, bigdirname); - offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); - offset += dir->pos * sizeof(struct adfs_bigdirentry); + offset = adfs_fplus_offset(h, dir->pos); - dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry)); + ret = adfs_dir_copyfrom(&bde, dir, offset, + sizeof(struct adfs_bigdirentry)); + if (ret) + return ret; obj->loadaddr = le32_to_cpu(bde.bigdirload); obj->execaddr = le32_to_cpu(bde.bigdirexec); @@ -181,59 +193,95 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) obj->attr = le32_to_cpu(bde.bigdirattr); obj->name_len = le32_to_cpu(bde.bigdirobnamelen); - offset = offsetof(struct adfs_bigdirheader, bigdirname); - offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); - offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry); + offset = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)); offset += le32_to_cpu(bde.bigdirobnameptr); - dir_memcpy(dir, offset, obj->name, obj->name_len); + ret = adfs_dir_copyfrom(obj->name, dir, offset, obj->name_len); + if (ret) + return ret; + adfs_object_fixup(dir, obj); dir->pos += 1; - ret = 0; -out: - return ret; + + return 0; } -static int -adfs_fplus_sync(struct adfs_dir *dir) +static int adfs_fplus_iterate(struct adfs_dir *dir, struct dir_context *ctx) { - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh_fplus[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; + struct object_info obj; + + if ((ctx->pos - 2) >> 32) + return 0; + + if (adfs_fplus_setpos(dir, ctx->pos - 2)) + return 0; + + while (!adfs_fplus_getnext(dir, &obj)) { + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.indaddr, DT_UNKNOWN)) + break; + ctx->pos++; } - return err; + return 0; } -static void -adfs_fplus_free(struct adfs_dir *dir) +static int adfs_fplus_update(struct adfs_dir *dir, struct object_info *obj) { - int i; + struct adfs_bigdirheader *h = dir->bighead; + struct adfs_bigdirentry bde; + int offset, end, ret; - if (dir->bh_fplus) { - for (i = 0; i < dir->nr_buffers; i++) - brelse(dir->bh_fplus[i]); + offset = adfs_fplus_offset(h, 0) - sizeof(bde); + end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)); - if (&dir->bh[0] != dir->bh_fplus) - kfree(dir->bh_fplus); + do { + offset += sizeof(bde); + if (offset >= end) { + adfs_error(dir->sb, "unable to locate entry to update"); + return -ENOENT; + } + ret = adfs_dir_copyfrom(&bde, dir, offset, sizeof(bde)); + if (ret) { + adfs_error(dir->sb, "error reading directory entry"); + return -ENOENT; + } + } while (le32_to_cpu(bde.bigdirindaddr) != obj->indaddr); - dir->bh_fplus = NULL; - } + bde.bigdirload = cpu_to_le32(obj->loadaddr); + bde.bigdirexec = cpu_to_le32(obj->execaddr); + bde.bigdirlen = cpu_to_le32(obj->size); + bde.bigdirindaddr = cpu_to_le32(obj->indaddr); + bde.bigdirattr = cpu_to_le32(obj->attr); + + return adfs_dir_copyto(dir, offset, &bde, sizeof(bde)); +} + +static int adfs_fplus_commit(struct adfs_dir *dir) +{ + int ret; - dir->nr_buffers = 0; - dir->sb = NULL; + /* Increment directory sequence number */ + dir->bighead->startmasseq += 1; + dir->bigtail->bigdirendmasseq += 1; + + /* Update directory check byte */ + dir->bigtail->bigdircheckbyte = adfs_fplus_checkbyte(dir); + + /* 
Make sure the directory still validates correctly */ + ret = adfs_fplus_validate_header(dir->bighead); + if (ret == 0) + ret = adfs_fplus_validate_tail(dir->bighead, dir->bigtail); + + return ret; } const struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, + .iterate = adfs_fplus_iterate, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, - .sync = adfs_fplus_sync, - .free = adfs_fplus_free + .update = adfs_fplus_update, + .commit = adfs_fplus_commit, }; diff --git a/fs/adfs/dir_fplus.h b/fs/adfs/dir_fplus.h index 4ec0931e36ad..d729b1591e5e 100644 --- a/fs/adfs/dir_fplus.h +++ b/fs/adfs/dir_fplus.h @@ -22,7 +22,7 @@ struct adfs_bigdirheader { __le32 bigdirnamesize; __le32 bigdirparent; char bigdirname[1]; -}; +} __attribute__((packed, aligned(4))); struct adfs_bigdirentry { __le32 bigdirload; @@ -32,11 +32,11 @@ struct adfs_bigdirentry { __le32 bigdirattr; __le32 bigdirobnamelen; __le32 bigdirobnameptr; -}; +} __attribute__((packed, aligned(4))); struct adfs_bigdirtail { __le32 bigdirendname; __u8 bigdirendmasseq; __u8 reserved[2]; __u8 bigdircheckbyte; -}; +} __attribute__((packed, aligned(4))); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 124de75413a5..32620f4a7623 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -20,7 +20,8 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, if (block >= inode->i_blocks) goto abort_toobig; - block = __adfs_block_map(inode->i_sb, inode->i_ino, block); + block = __adfs_block_map(inode->i_sb, ADFS_I(inode)->indaddr, + block); if (block) map_bh(bh, inode->i_sb, block); return 0; @@ -126,29 +127,29 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode) * Convert Linux permission to ADFS attribute. We try to do the reverse * of atts2mode, but there is not a 1:1 translation. */ -static int -adfs_mode2atts(struct super_block *sb, struct inode *inode) +static int adfs_mode2atts(struct super_block *sb, struct inode *inode, + umode_t ia_mode) { + struct adfs_sb_info *asb = ADFS_SB(sb); umode_t mode; int attr; - struct adfs_sb_info *asb = ADFS_SB(sb); /* FIXME: should we be able to alter a link? */ if (S_ISLNK(inode->i_mode)) return ADFS_I(inode)->attr; + /* Directories do not have read/write permissions on the media */ if (S_ISDIR(inode->i_mode)) - attr = ADFS_NDA_DIRECTORY; - else - attr = 0; + return ADFS_NDA_DIRECTORY; - mode = inode->i_mode & asb->s_owner_mask; + attr = 0; + mode = ia_mode & asb->s_owner_mask; if (mode & S_IRUGO) attr |= ADFS_NDA_OWNER_READ; if (mode & S_IWUGO) attr |= ADFS_NDA_OWNER_WRITE; - mode = inode->i_mode & asb->s_other_mask; + mode = ia_mode & asb->s_other_mask; mode &= ~asb->s_owner_mask; if (mode & S_IRUGO) attr |= ADFS_NDA_PUBLIC_READ; @@ -158,6 +159,8 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode) return attr; } +static const s64 nsec_unix_epoch_diff_risc_os_epoch = 2208988800000000000LL; + /* * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds @@ -170,8 +173,6 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since * 01 Jan 1900 00:00:00 (RISC OS epoch) */ - static const s64 nsec_unix_epoch_diff_risc_os_epoch = - 2208988800000000000LL; s64 nsec; if (!adfs_inode_is_stamped(inode)) @@ -204,24 +205,23 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) return; } -/* - * Convert an Unix time to ADFS time. 
We only do this if the entry has a - * time/date stamp already. - */ -static void -adfs_unix2adfs_time(struct inode *inode, unsigned int secs) +/* Convert an Unix time to ADFS time for an entry that is already stamped. */ +static void adfs_unix2adfs_time(struct inode *inode, + const struct timespec64 *ts) { - unsigned int high, low; + s64 cs, nsec = timespec64_to_ns(ts); - if (adfs_inode_is_stamped(inode)) { - /* convert 32-bit seconds to 40-bit centi-seconds */ - low = (secs & 255) * 100; - high = (secs / 256) * 100 + (low >> 8) + 0x336e996a; + /* convert from Unix to RISC OS epoch */ + nsec += nsec_unix_epoch_diff_risc_os_epoch; - ADFS_I(inode)->loadaddr = (high >> 24) | - (ADFS_I(inode)->loadaddr & ~0xff); - ADFS_I(inode)->execaddr = (low & 255) | (high << 8); - } + /* convert from nanoseconds to centiseconds */ + cs = div_s64(nsec, 10000000); + + cs = clamp_t(s64, cs, 0, 0xffffffffff); + + ADFS_I(inode)->loadaddr &= ~0xff; + ADFS_I(inode)->loadaddr |= (cs >> 32) & 0xff; + ADFS_I(inode)->execaddr = cs; } /* @@ -260,6 +260,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) * for cross-directory renames. */ ADFS_I(inode)->parent_id = obj->parent_id; + ADFS_I(inode)->indaddr = obj->indaddr; ADFS_I(inode)->loadaddr = obj->loadaddr; ADFS_I(inode)->execaddr = obj->execaddr; ADFS_I(inode)->attr = obj->attr; @@ -315,10 +316,11 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) truncate_setsize(inode, attr->ia_size); - if (ia_valid & ATTR_MTIME) { - inode->i_mtime = attr->ia_mtime; - adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec); + if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) { + adfs_unix2adfs_time(inode, &attr->ia_mtime); + adfs_adfs2unix_time(&inode->i_mtime, inode); } + /* * FIXME: should we make these == to i_mtime since we don't * have the ability to represent them in our filesystem? @@ -328,7 +330,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_CTIME) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { - ADFS_I(inode)->attr = adfs_mode2atts(sb, inode); + ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode); inode->i_mode = adfs_atts2mode(sb, inode); } @@ -353,7 +355,7 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct object_info obj; int ret; - obj.indaddr = inode->i_ino; + obj.indaddr = ADFS_I(inode)->indaddr; obj.name_len = 0; obj.parent_id = ADFS_I(inode)->parent_id; obj.loadaddr = ADFS_I(inode)->loadaddr; diff --git a/fs/adfs/map.c b/fs/adfs/map.c index f44d12cef5be..a81de80c45c1 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c @@ -4,6 +4,8 @@ * * Copyright (C) 1997-2002 Russell King */ +#include <linux/slab.h> +#include <linux/statfs.h> #include <asm/unaligned.h> #include "adfs.h" @@ -66,54 +68,41 @@ static DEFINE_RWLOCK(adfs_map_lock); static int lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen, const u32 frag_id, unsigned int *offset) { - const unsigned int mapsize = dm->dm_endbit; + const unsigned int endbit = dm->dm_endbit; const u32 idmask = (1 << idlen) - 1; - unsigned char *map = dm->dm_bh->b_data + 4; + unsigned char *map = dm->dm_bh->b_data; unsigned int start = dm->dm_startbit; - unsigned int mapptr; + unsigned int freelink, fragend; u32 frag; + frag = GET_FRAG_ID(map, 8, idmask & 0x7fff); + freelink = frag ? 
8 + frag : 0; + do { frag = GET_FRAG_ID(map, start, idmask); - mapptr = start + idlen; - - /* - * find end of fragment - */ - { - __le32 *_map = (__le32 *)map; - u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); - while (v == 0) { - mapptr = (mapptr & ~31) + 32; - if (mapptr >= mapsize) - goto error; - v = le32_to_cpu(_map[mapptr >> 5]); - } - - mapptr += 1 + ffz(~v); + + fragend = find_next_bit_le(map, endbit, start + idlen); + if (fragend >= endbit) + goto error; + + if (start == freelink) { + freelink += frag & 0x7fff; + } else if (frag == frag_id) { + unsigned int length = fragend + 1 - start; + + if (*offset < length) + return start + *offset; + *offset -= length; } - if (frag == frag_id) - goto found; -again: - start = mapptr; - } while (mapptr < mapsize); + start = fragend + 1; + } while (start < endbit); return -1; error: printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n", - frag, start, mapptr); + frag, start, fragend); return -1; - -found: - { - int length = mapptr - start; - if (*offset >= length) { - *offset -= length; - goto again; - } - } - return start + *offset; } /* @@ -125,12 +114,12 @@ found: static unsigned int scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) { - const unsigned int mapsize = dm->dm_endbit + 32; + const unsigned int endbit = dm->dm_endbit; const unsigned int idlen = asb->s_idlen; const unsigned int frag_idlen = idlen <= 15 ? idlen : 15; const u32 idmask = (1 << frag_idlen) - 1; unsigned char *map = dm->dm_bh->b_data; - unsigned int start = 8, mapptr; + unsigned int start = 8, fragend; u32 frag; unsigned long total = 0; @@ -149,29 +138,13 @@ scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) do { start += frag; - /* - * get fragment id - */ frag = GET_FRAG_ID(map, start, idmask); - mapptr = start + idlen; - - /* - * find end of fragment - */ - { - __le32 *_map = (__le32 *)map; - u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); - while (v == 0) { - mapptr = (mapptr & ~31) + 32; - if (mapptr >= mapsize) - goto error; - v = le32_to_cpu(_map[mapptr >> 5]); - } - - mapptr += 1 + ffz(~v); - } - total += mapptr - start; + fragend = find_next_bit_le(map, endbit, start + idlen); + if (fragend >= endbit) + goto error; + + total += fragend + 1 - start; } while (frag >= idlen + 1); if (frag != 0) @@ -220,10 +193,10 @@ found: * total_free = E(free_in_zone_n) * nzones */ -unsigned int -adfs_map_free(struct super_block *sb) +void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf) { struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discrecord *dr = adfs_map_discrecord(asb->s_map); struct adfs_discmap *dm; unsigned int total = 0; unsigned int zone; @@ -235,7 +208,10 @@ adfs_map_free(struct super_block *sb) total += scan_free_map(asb, dm++); } while (--zone > 0); - return signed_asl(total, asb->s_map2blk); + buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits; + buf->f_files = asb->s_ids_per_zone * asb->s_map_size; + buf->f_bavail = + buf->f_bfree = signed_asl(total, asb->s_map2blk); } int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset) @@ -280,3 +256,152 @@ bad_fragment: frag_id, zone, asb->s_map_size); return 0; } + +static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) +{ + unsigned int v0, v1, v2, v3; + int i; + + v0 = v1 = v2 = v3 = 0; + for (i = sb->s_blocksize - 4; i; i -= 4) { + v0 += map[i] + (v3 >> 8); + v3 &= 0xff; + v1 += map[i + 1] + (v0 >> 8); + v0 &= 0xff; + v2 += map[i + 2] + (v1 >> 8); + v1 &= 0xff; + v3 += map[i + 3] + (v2 
>> 8); + v2 &= 0xff; + } + v0 += v3 >> 8; + v1 += map[1] + (v0 >> 8); + v2 += map[2] + (v1 >> 8); + v3 += map[3] + (v2 >> 8); + + return v0 ^ v1 ^ v2 ^ v3; +} + +static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) +{ + unsigned char crosscheck = 0, zonecheck = 1; + int i; + + for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { + unsigned char *map; + + map = dm[i].dm_bh->b_data; + + if (adfs_calczonecheck(sb, map) != map[0]) { + adfs_error(sb, "zone %d fails zonecheck", i); + zonecheck = 0; + } + crosscheck ^= map[3]; + } + if (crosscheck != 0xff) + adfs_error(sb, "crosscheck != 0xff"); + return crosscheck == 0xff && zonecheck; +} + +/* + * Layout the map - the first zone contains a copy of the disc record, + * and the last zone must be limited to the size of the filesystem. + */ +static void adfs_map_layout(struct adfs_discmap *dm, unsigned int nzones, + struct adfs_discrecord *dr) +{ + unsigned int zone, zone_size; + u64 size; + + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + + dm[0].dm_bh = NULL; + dm[0].dm_startblk = 0; + dm[0].dm_startbit = 32 + ADFS_DR_SIZE_BITS; + dm[0].dm_endbit = 32 + zone_size; + + for (zone = 1; zone < nzones; zone++) { + dm[zone].dm_bh = NULL; + dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; + dm[zone].dm_startbit = 32; + dm[zone].dm_endbit = 32 + zone_size; + } + + size = adfs_disc_size(dr) >> dr->log2bpmb; + size -= (nzones - 1) * zone_size - ADFS_DR_SIZE_BITS; + dm[nzones - 1].dm_endbit = 32 + size; +} + +static int adfs_map_read(struct adfs_discmap *dm, struct super_block *sb, + unsigned int map_addr, unsigned int nzones) +{ + unsigned int zone; + + for (zone = 0; zone < nzones; zone++) { + dm[zone].dm_bh = sb_bread(sb, map_addr + zone); + if (!dm[zone].dm_bh) + return -EIO; + } + + return 0; +} + +static void adfs_map_relse(struct adfs_discmap *dm, unsigned int nzones) +{ + unsigned int zone; + + for (zone = 0; zone < nzones; zone++) + brelse(dm[zone].dm_bh); +} + +struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discmap *dm; + unsigned int map_addr, zone_size, nzones; + int ret; + + nzones = dr->nzones | dr->nzones_high << 8; + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + + asb->s_idlen = dr->idlen; + asb->s_map_size = nzones; + asb->s_map2blk = dr->log2bpmb - dr->log2secsize; + asb->s_log2sharesize = dr->log2sharesize; + asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + + map_addr = (nzones >> 1) * zone_size - + ((nzones > 1) ? 
ADFS_DR_SIZE_BITS : 0); + map_addr = signed_asl(map_addr, asb->s_map2blk); + + dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL); + if (dm == NULL) { + adfs_error(sb, "not enough memory"); + return ERR_PTR(-ENOMEM); + } + + adfs_map_layout(dm, nzones, dr); + + ret = adfs_map_read(dm, sb, map_addr, nzones); + if (ret) { + adfs_error(sb, "unable to read map"); + goto error_free; + } + + if (adfs_checkmap(sb, dm)) + return dm; + + adfs_error(sb, "map corrupted"); + +error_free: + adfs_map_relse(dm, nzones); + kfree(dm); + return ERR_PTR(-EIO); +} + +void adfs_free_map(struct super_block *sb) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + + adfs_map_relse(asb->s_map, asb->s_map_size); + kfree(asb->s_map); +} diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 65b04ebb51c3..a3cc8ecb50da 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -88,59 +88,11 @@ static int adfs_checkdiscrecord(struct adfs_discrecord *dr) return 0; } -static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) -{ - unsigned int v0, v1, v2, v3; - int i; - - v0 = v1 = v2 = v3 = 0; - for (i = sb->s_blocksize - 4; i; i -= 4) { - v0 += map[i] + (v3 >> 8); - v3 &= 0xff; - v1 += map[i + 1] + (v0 >> 8); - v0 &= 0xff; - v2 += map[i + 2] + (v1 >> 8); - v1 &= 0xff; - v3 += map[i + 3] + (v2 >> 8); - v2 &= 0xff; - } - v0 += v3 >> 8; - v1 += map[1] + (v0 >> 8); - v2 += map[2] + (v1 >> 8); - v3 += map[3] + (v2 >> 8); - - return v0 ^ v1 ^ v2 ^ v3; -} - -static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) -{ - unsigned char crosscheck = 0, zonecheck = 1; - int i; - - for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { - unsigned char *map; - - map = dm[i].dm_bh->b_data; - - if (adfs_calczonecheck(sb, map) != map[0]) { - adfs_error(sb, "zone %d fails zonecheck", i); - zonecheck = 0; - } - crosscheck ^= map[3]; - } - if (crosscheck != 0xff) - adfs_error(sb, "crosscheck != 0xff"); - return crosscheck == 0xff && zonecheck; -} - static void adfs_put_super(struct super_block *sb) { - int i; struct adfs_sb_info *asb = ADFS_SB(sb); - for (i = 0; i < asb->s_map_size; i++) - brelse(asb->s_map[i].dm_bh); - kfree(asb->s_map); + adfs_free_map(sb); kfree_rcu(asb, rcu); } @@ -249,16 +201,13 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct adfs_sb_info *sbi = ADFS_SB(sb); - struct adfs_discrecord *dr = adfs_map_discrecord(sbi->s_map); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + adfs_map_statfs(sb, buf); + buf->f_type = ADFS_SUPER_MAGIC; buf->f_namelen = sbi->s_namelen; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits; - buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; - buf->f_bavail = - buf->f_bfree = adfs_map_free(sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -282,6 +231,12 @@ static void adfs_free_inode(struct inode *inode) kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } +static int adfs_drop_inode(struct inode *inode) +{ + /* always drop inodes if we are read-only */ + return !IS_ENABLED(CONFIG_ADFS_FS_RW) || IS_RDONLY(inode); +} + static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; @@ -314,7 +269,7 @@ static void destroy_inodecache(void) static const struct super_operations adfs_sops = { .alloc_inode = adfs_alloc_inode, .free_inode = adfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = adfs_drop_inode, 
.write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, @@ -322,66 +277,94 @@ static const struct super_operations adfs_sops = { .show_options = adfs_show_options, }; -static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +static int adfs_probe(struct super_block *sb, unsigned int offset, int silent, + int (*validate)(struct super_block *sb, + struct buffer_head *bh, + struct adfs_discrecord **bhp)) { - struct adfs_discmap *dm; - unsigned int map_addr, zone_size, nzones; - int i, zone; struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discrecord *dr; + struct buffer_head *bh; + unsigned int blocksize = BLOCK_SIZE; + int ret, try; + + for (try = 0; try < 2; try++) { + /* try to set the requested block size */ + if (sb->s_blocksize != blocksize && + !sb_set_blocksize(sb, blocksize)) { + if (!silent) + adfs_msg(sb, KERN_ERR, + "error: unsupported blocksize"); + return -EINVAL; + } - nzones = asb->s_map_size; - zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); - map_addr = (nzones >> 1) * zone_size - - ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0); - map_addr = signed_asl(map_addr, asb->s_map2blk); + /* read the buffer */ + bh = sb_bread(sb, offset >> sb->s_blocksize_bits); + if (!bh) { + adfs_msg(sb, KERN_ERR, + "error: unable to read block %u, try %d", + offset >> sb->s_blocksize_bits, try); + return -EIO; + } + + /* validate it */ + ret = validate(sb, bh, &dr); + if (ret) { + brelse(bh); + return ret; + } - asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + /* does the block size match the filesystem block size? */ + blocksize = 1 << dr->log2secsize; + if (sb->s_blocksize == blocksize) { + asb->s_map = adfs_read_map(sb, dr); + brelse(bh); + return PTR_ERR_OR_ZERO(asb->s_map); + } - dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL); - if (dm == NULL) { - adfs_error(sb, "not enough memory"); - return ERR_PTR(-ENOMEM); + brelse(bh); } - for (zone = 0; zone < nzones; zone++, map_addr++) { - dm[zone].dm_startbit = 0; - dm[zone].dm_endbit = zone_size; - dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; - dm[zone].dm_bh = sb_bread(sb, map_addr); + return -EIO; +} - if (!dm[zone].dm_bh) { - adfs_error(sb, "unable to read map"); - goto error_free; - } - } +static int adfs_validate_bblk(struct super_block *sb, struct buffer_head *bh, + struct adfs_discrecord **drp) +{ + struct adfs_discrecord *dr; + unsigned char *b_data; - /* adjust the limits for the first and last map zones */ - i = zone - 1; - dm[0].dm_startblk = 0; - dm[0].dm_startbit = ADFS_DR_SIZE_BITS; - dm[i].dm_endbit = (adfs_disc_size(dr) >> dr->log2bpmb) + - (ADFS_DR_SIZE_BITS - i * zone_size); + b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); + if (adfs_checkbblk(b_data)) + return -EILSEQ; - if (adfs_checkmap(sb, dm)) - return dm; + /* Do some sanity checks on the ADFS disc record */ + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + if (adfs_checkdiscrecord(dr)) + return -EILSEQ; + + *drp = dr; + return 0; +} - adfs_error(sb, "map corrupted"); +static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh, + struct adfs_discrecord **drp) +{ + struct adfs_discrecord *dr; -error_free: - while (--zone >= 0) - brelse(dm[zone].dm_bh); + /* Do some sanity checks on the ADFS disc record */ + dr = (struct adfs_discrecord *)(bh->b_data + 4); + if (adfs_checkdiscrecord(dr) || dr->nzones_high || dr->nzones != 1) + return -EILSEQ; - kfree(dm); - return ERR_PTR(-EIO); + *drp = dr; + return 0; } static int 
adfs_fill_super(struct super_block *sb, void *data, int silent) { struct adfs_discrecord *dr; - struct buffer_head *bh; struct object_info root_obj; - unsigned char *b_data; - unsigned int blocksize; struct adfs_sb_info *asb; struct inode *root; int ret = -EINVAL; @@ -391,7 +374,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) return -ENOMEM; + sb->s_fs_info = asb; + sb->s_magic = ADFS_SUPER_MAGIC; + sb->s_time_gran = 10000000; /* set default options */ asb->s_uid = GLOBAL_ROOT_UID; @@ -403,78 +389,21 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) if (parse_options(sb, asb, data)) goto error; - sb_set_blocksize(sb, BLOCK_SIZE); - if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { - adfs_msg(sb, KERN_ERR, "error: unable to read superblock"); - ret = -EIO; - goto error; - } - - b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE); - - if (adfs_checkbblk(b_data)) { - ret = -EINVAL; - goto error_badfs; - } - - dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); - - /* - * Do some sanity checks on the ADFS disc record - */ - if (adfs_checkdiscrecord(dr)) { - ret = -EINVAL; - goto error_badfs; - } - - blocksize = 1 << dr->log2secsize; - brelse(bh); - - if (sb_set_blocksize(sb, blocksize)) { - bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize); - if (!bh) { - adfs_msg(sb, KERN_ERR, - "error: couldn't read superblock on 2nd try."); - ret = -EIO; - goto error; - } - b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); - if (adfs_checkbblk(b_data)) { - adfs_msg(sb, KERN_ERR, - "error: disc record mismatch, very weird!"); - ret = -EINVAL; - goto error_free_bh; - } - dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); - } else { + /* Try to probe the filesystem boot block */ + ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk); + if (ret == -EILSEQ) + ret = adfs_probe(sb, 0, silent, adfs_validate_dr0); + if (ret == -EILSEQ) { if (!silent) adfs_msg(sb, KERN_ERR, - "error: unsupported blocksize"); + "error: can't find an ADFS filesystem on dev %s.", + sb->s_id); ret = -EINVAL; - goto error; } + if (ret) + goto error; - /* - * blocksize on this device should now be set to the ADFS log2secsize - */ - - sb->s_magic = ADFS_SUPER_MAGIC; - asb->s_idlen = dr->idlen; - asb->s_map_size = dr->nzones | (dr->nzones_high << 8); - asb->s_map2blk = dr->log2bpmb - dr->log2secsize; - asb->s_log2sharesize = dr->log2sharesize; - - asb->s_map = adfs_read_map(sb, dr); - if (IS_ERR(asb->s_map)) { - ret = PTR_ERR(asb->s_map); - goto error_free_bh; - } - - brelse(bh); - - /* - * set up enough so that we can read an inode - */ + /* set up enough so that we can read an inode */ sb->s_op = &adfs_sops; dr = adfs_map_discrecord(asb->s_map); @@ -511,23 +440,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) root = adfs_iget(sb, &root_obj); sb->s_root = d_make_root(root); if (!sb->s_root) { - int i; - for (i = 0; i < asb->s_map_size; i++) - brelse(asb->s_map[i].dm_bh); - kfree(asb->s_map); + adfs_free_map(sb); adfs_error(sb, "get root inode failed\n"); ret = -EIO; goto error; } return 0; -error_badfs: - if (!silent) - adfs_msg(sb, KERN_ERR, - "error: can't find an ADFS filesystem on dev %s.", - sb->s_id); -error_free_bh: - brelse(bh); error: sb->s_fs_info = NULL; kfree(asb); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index fd5133e26a38..78ba5f932287 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -134,8 +134,17 @@ static struct afs_cell 
*afs_alloc_cell(struct afs_net *net, _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } - if (namelen == 5 && memcmp(name, "@cell", 5) == 0) + + /* Prohibit cell names that contain unprintable chars, '/' and '@' or + * that begin with a dot. This also precludes "@cell". + */ + if (name[0] == '.') return ERR_PTR(-EINVAL); + for (i = 0; i < namelen; i++) { + char ch = name[i]; + if (!isprint(ch) || ch == '/' || ch == '@') + return ERR_PTR(-EINVAL); + } _enter("%*.*s,%s", namelen, namelen, name, addresses); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 497f979018c2..5c794f4b051a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -908,6 +908,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid fid = {}; struct inode *inode; struct dentry *d; struct key *key; @@ -951,21 +952,18 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, afs_stat_v(dvnode, n_lookup); inode = afs_do_lookup(dir, dentry, key); key_put(key); - if (inode == ERR_PTR(-ENOENT)) { + if (inode == ERR_PTR(-ENOENT)) inode = afs_try_auto_mntpt(dentry, dir); - } else { - dentry->d_fsdata = - (void *)(unsigned long)dvnode->status.data_version; - } + + if (!IS_ERR_OR_NULL(inode)) + fid = AFS_FS_I(inode)->fid; + d = d_splice_alias(inode, dentry); if (!IS_ERR_OR_NULL(d)) { d->d_fsdata = dentry->d_fsdata; - trace_afs_lookup(dvnode, &d->d_name, - inode ? AFS_FS_I(inode) : NULL); + trace_afs_lookup(dvnode, &d->d_name, &fid); } else { - trace_afs_lookup(dvnode, &dentry->d_name, - IS_ERR_OR_NULL(inode) ? NULL - : AFS_FS_I(inode)); + trace_afs_lookup(dvnode, &dentry->d_name, &fid); } return d; } diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 4150280509ff..7503899c0a1b 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -136,6 +136,9 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr ASSERTCMP(d_inode(dentry), ==, NULL); + if (flags & LOOKUP_CREATE) + return ERR_PTR(-EOPNOTSUPP); + if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index f532d6d3bd28..79bc5f1338ed 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -126,7 +126,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (src_as->cell) ctx->cell = afs_get_cell(src_as->cell); - if (size > PAGE_SIZE - 1) + if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); @@ -140,7 +140,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) } buf = kmap(page); - ret = vfs_parse_fs_string(fc, "source", buf, size); + ret = -EINVAL; + if (buf[size - 1] == '.') + ret = vfs_parse_fs_string(fc, "source", buf, size - 1); kunmap(page); put_page(page); if (ret < 0) diff --git a/fs/afs/proc.c b/fs/afs/proc.c index fba2ec3a3a9c..468e1713bce1 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -213,13 +213,14 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) /* Display header on line 1 */ if (v == &cell->proc_volumes) { - seq_puts(m, "USE VID TY\n"); + seq_puts(m, "USE VID TY NAME\n"); return 0; } - seq_printf(m, "%3d %08llx %s\n", + seq_printf(m, "%3d %08llx %s %s\n", atomic_read(&vol->usage), vol->vid, - afs_vol_types[vol->type]); + afs_vol_types[vol->type], + vol->name); return 0; } diff --git a/fs/afs/server.c b/fs/afs/server.c index 1686bf188ccd..b7f3cb2130ca 100644 --- a/fs/afs/server.c +++ 
b/fs/afs/server.c @@ -32,18 +32,11 @@ static void afs_dec_servers_outstanding(struct afs_net *net) struct afs_server *afs_find_server(struct afs_net *net, const struct sockaddr_rxrpc *srx) { - const struct sockaddr_in6 *a = &srx->transport.sin6, *b; const struct afs_addr_list *alist; struct afs_server *server = NULL; unsigned int i; - bool ipv6 = true; int seq = 0, diff; - if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 || - srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 || - srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff)) - ipv6 = false; - rcu_read_lock(); do { @@ -52,7 +45,8 @@ struct afs_server *afs_find_server(struct afs_net *net, server = NULL; read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - if (ipv6) { + if (srx->transport.family == AF_INET6) { + const struct sockaddr_in6 *a = &srx->transport.sin6, *b; hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { alist = rcu_dereference(server->addresses); for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { @@ -68,15 +62,16 @@ struct afs_server *afs_find_server(struct afs_net *net, } } } else { + const struct sockaddr_in *a = &srx->transport.sin, *b; hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { alist = rcu_dereference(server->addresses); for (i = 0; i < alist->nr_ipv4; i++) { - b = &alist->addrs[i].transport.sin6; - diff = ((u16 __force)a->sin6_port - - (u16 __force)b->sin6_port); + b = &alist->addrs[i].transport.sin; + diff = ((u16 __force)a->sin_port - + (u16 __force)b->sin_port); if (diff == 0) - diff = ((u32 __force)a->sin6_addr.s6_addr32[3] - - (u32 __force)b->sin6_addr.s6_addr32[3]); + diff = ((u32 __force)a->sin_addr.s_addr - + (u32 __force)b->sin_addr.s_addr); if (diff == 0) goto found; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 488641b1a418..7f8a9b3137bf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -404,6 +404,7 @@ static int afs_test_super(struct super_block *sb, struct fs_context *fc) return (as->net_ns == fc->net_ns && as->volume && as->volume->vid == ctx->volume->vid && + as->cell == ctx->cell && !as->dyn_root); } @@ -448,7 +449,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) /* allocate the root inode and dentry */ if (as->dyn_root) { inode = afs_iget_pseudo_dir(sb, true); - sb->s_flags |= SB_RDONLY; } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ecd8d2698515..f4713ea76e82 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -97,7 +97,7 @@ static struct linux_binfmt elf_format = { .min_coredump = ELF_EXEC_PAGESIZE, }; -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) +#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) static int set_brk(unsigned long start, unsigned long end, int prot) { @@ -161,9 +161,11 @@ static int padzero(unsigned long elf_bss) #endif static int -create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr) +create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, + unsigned long load_addr, unsigned long interp_load_addr, + unsigned long e_entry) { + struct mm_struct *mm = current->mm; unsigned long p = bprm->p; int argc = bprm->argc; int envc = bprm->envc; @@ -176,7 +178,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; - int ei_index = 0; + int ei_index; const struct cred *cred = current_cred(); struct vm_area_struct *vma; @@ 
-226,12 +228,12 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Create the ELF interpreter info */ - elf_info = (elf_addr_t *)current->mm->saved_auxv; + elf_info = (elf_addr_t *)mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ #define NEW_AUX_ENT(id, val) \ do { \ - elf_info[ei_index++] = id; \ - elf_info[ei_index++] = val; \ + *elf_info++ = id; \ + *elf_info++ = val; \ } while (0) #ifdef ARCH_DLINFO @@ -251,7 +253,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); - NEW_AUX_ENT(AT_ENTRY, exec->e_entry); + NEW_AUX_ENT(AT_ENTRY, e_entry); NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); @@ -275,12 +277,13 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ - memset(&elf_info[ei_index], 0, - sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]); + memset(elf_info, 0, (char *)mm->saved_auxv + + sizeof(mm->saved_auxv) - (char *)elf_info); /* And advance past the AT_NULL entry. */ - ei_index += 2; + elf_info += 2; + ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; sp = STACK_ADD(p, ei_index); items = (argc + 1) + (envc + 1) + 1; @@ -299,7 +302,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ - vma = find_extend_vma(current->mm, bprm->p); + vma = find_extend_vma(mm, bprm->p); if (!vma) return -EFAULT; @@ -308,7 +311,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Populate list of argv pointers back to argv strings. */ - p = current->mm->arg_end = current->mm->arg_start; + p = mm->arg_end = mm->arg_start; while (argc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -320,10 +323,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->arg_end = p; + mm->arg_end = p; /* Populate list of envp pointers back to envp strings. */ - current->mm->env_end = current->mm->env_start = p; + mm->env_end = mm->env_start = p; while (envc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -335,10 +338,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->env_end = p; + mm->env_end = p; /* Put the elf_info on the stack in the right place. 
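The NEW_AUX_ENT() rework above fills the auxiliary vector through a walking pointer into mm->saved_auxv instead of an ei_index counter; ei_index is recomputed afterwards only to size the later copy to the user stack. A minimal standalone sketch of that pattern, with hypothetical names (not kernel code):

    /* Sketch: fill an auxv-style buffer with a walking pointer, then derive
     * the word count from pointer arithmetic (hypothetical standalone example). */
    #include <stddef.h>

    #define PUT_AUX_ENT(p, id, val) do { *(p)++ = (id); *(p)++ = (val); } while (0)

    static size_t fill_auxv(unsigned long *auxv)
    {
        unsigned long *p = auxv;

        PUT_AUX_ENT(p, 6, 4096);   /* e.g. AT_PAGESZ */
        PUT_AUX_ENT(p, 0, 0);      /* AT_NULL terminator */

        return p - auxv;           /* word count, analogous to ei_index */
    }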
*/ - if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t))) + if (copy_to_user(sp, mm->saved_auxv, ei_index * sizeof(elf_addr_t))) return -EFAULT; return 0; } @@ -689,15 +692,17 @@ static int load_elf_binary(struct linux_binprm *bprm) int bss_prot = 0; int retval, i; unsigned long elf_entry; + unsigned long e_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; + struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; struct { - struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; + struct mm_struct *mm; struct pt_regs *regs; loc = kmalloc(sizeof(*loc), GFP_KERNEL); @@ -705,30 +710,27 @@ static int load_elf_binary(struct linux_binprm *bprm) retval = -ENOMEM; goto out_ret; } - - /* Get the exec-header */ - loc->elf_ex = *((struct elfhdr *)bprm->buf); retval = -ENOEXEC; /* First of all, some simple consistency checks */ - if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out; - if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) + if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN) goto out; - if (!elf_check_arch(&loc->elf_ex)) + if (!elf_check_arch(elf_ex)) goto out; - if (elf_check_fdpic(&loc->elf_ex)) + if (elf_check_fdpic(elf_ex)) goto out; if (!bprm->file->f_op->mmap) goto out; - elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); + elf_phdata = load_elf_phdrs(elf_ex, bprm->file); if (!elf_phdata) goto out; elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; if (elf_ppnt->p_type != PT_INTERP) @@ -782,7 +784,7 @@ out_free_interp: } elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) @@ -792,7 +794,7 @@ out_free_interp: break; case PT_LOPROC ... PT_HIPROC: - retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, + retval = arch_elf_pt_proc(elf_ex, elf_ppnt, bprm->file, false, &arch_state); if (retval) @@ -836,7 +838,7 @@ out_free_interp: * still possible to return an error to the code that invoked * the exec syscall. */ - retval = arch_check_elf(&loc->elf_ex, + retval = arch_check_elf(elf_ex, !!interpreter, &loc->interp_elf_ex, &arch_state); if (retval) @@ -849,8 +851,8 @@ out_free_interp: /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY2(loc->elf_ex, &arch_state); - if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + SET_PERSONALITY2(*elf_ex, &arch_state); + if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) @@ -877,7 +879,7 @@ out_free_interp: /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; - i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + i < elf_ex->e_phnum; i++, elf_ppnt++) { int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -921,9 +923,9 @@ out_free_interp: * If we are loading ET_EXEC or we have already performed * the ET_DYN load_addr calculations, proceed normally. 
*/ - if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { + if (elf_ex->e_type == ET_EXEC || load_addr_set) { elf_flags |= MAP_FIXED; - } else if (loc->elf_ex.e_type == ET_DYN) { + } else if (elf_ex->e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program * Header for ET_DYN binaries to calculate the @@ -972,7 +974,7 @@ out_free_interp: load_bias = ELF_PAGESTART(load_bias - vaddr); total_size = total_mapping_size(elf_phdata, - loc->elf_ex.e_phnum); + elf_ex->e_phnum); if (!total_size) { retval = -EINVAL; goto out_free_dentry; @@ -990,7 +992,7 @@ out_free_interp: if (!load_addr_set) { load_addr_set = 1; load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); - if (loc->elf_ex.e_type == ET_DYN) { + if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); load_addr += load_bias; @@ -998,7 +1000,7 @@ out_free_interp: } } k = elf_ppnt->p_vaddr; - if (k < start_code) + if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; if (start_data < k) start_data = k; @@ -1031,7 +1033,7 @@ out_free_interp: } } - loc->elf_ex.e_entry += load_bias; + e_entry = elf_ex->e_entry + load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1074,7 +1076,7 @@ out_free_interp: allow_write_access(interpreter); fput(interpreter); } else { - elf_entry = loc->elf_ex.e_entry; + elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { retval = -EINVAL; goto out_free_dentry; @@ -1092,15 +1094,17 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, &loc->elf_ex, - load_addr, interp_load_addr); + retval = create_elf_tables(bprm, elf_ex, + load_addr, interp_load_addr, e_entry); if (retval < 0) goto out; - current->mm->end_code = end_code; - current->mm->start_code = start_code; - current->mm->start_data = start_data; - current->mm->end_data = end_data; - current->mm->start_stack = bprm->p; + + mm = current->mm; + mm->end_code = end_code; + mm->start_code = start_code; + mm->start_data = start_data; + mm->end_data = end_data; + mm->start_stack = bprm->p; if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { /* @@ -1111,12 +1115,11 @@ out_free_interp: * growing down), and into the unused ELF_ET_DYN_BASE region. */ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - loc->elf_ex.e_type == ET_DYN && !interpreter) - current->mm->brk = current->mm->start_brk = - ELF_ET_DYN_BASE; + elf_ex->e_type == ET_DYN && !interpreter) { + mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } - current->mm->brk = current->mm->start_brk = - arch_randomize_brk(current->mm); + mm->brk = mm->start_brk = arch_randomize_brk(mm); #ifdef compat_brk_randomized current->brk_randomized = 1; #endif @@ -1574,6 +1577,7 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, */ static int fill_files_note(struct memelfnote *note) { + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; @@ -1581,7 +1585,7 @@ static int fill_files_note(struct memelfnote *note) char *name_base, *name_curpos; /* *Estimated* file count and total data size needed */ - count = current->mm->map_count; + count = mm->map_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1591,6 +1595,10 @@ static int fill_files_note(struct memelfnote *note) if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ return -EINVAL; size = round_up(size, PAGE_SIZE); + /* + * "size" can be 0 here legitimately. 
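To make the e_entry handling above concrete with illustrative values: instead of patching loc->elf_ex.e_entry in place, the loader now computes a local e_entry = elf_ex->e_entry + load_bias. For an ET_DYN (PIE) binary whose header records e_entry = 0x1040 and which is mapped with load_bias = 0x555555554000, the runtime entry point becomes 0x555555554000 + 0x1040 = 0x555555555040, and that is the value later exposed to the program via AT_ENTRY.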
+ * Let it ENOMEM and omit NT_FILE section which will be empty anyway. + */ data = kvmalloc(size, GFP_KERNEL); if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; @@ -1599,7 +1607,7 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { struct file *file; const char *filename; @@ -1633,10 +1641,10 @@ static int fill_files_note(struct memelfnote *note) data[0] = count; data[1] = PAGE_SIZE; /* - * Count usually is less than current->mm->map_count, + * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = current->mm->map_count - count; + n = mm->map_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -2182,7 +2190,7 @@ static int elf_core_dump(struct coredump_params *cprm) int segs, i; size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; - struct elfhdr *elf = NULL; + struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; struct elf_phdr *phdr4note = NULL; @@ -2203,10 +2211,6 @@ static int elf_core_dump(struct coredump_params *cprm) * exists while dumping the mm->vm_next areas to the core file. */ - /* alloc memory for large data structures: too large to be on stack */ - elf = kmalloc(sizeof(*elf), GFP_KERNEL); - if (!elf) - goto out; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. @@ -2230,7 +2234,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2238,7 +2242,7 @@ static int elf_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - offset += sizeof(*elf); /* Elf header */ + offset += sizeof(elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ @@ -2257,11 +2261,13 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) - goto end_coredump; + /* + * Zero vma process will get ZERO_SIZE_PTR here. + * Let coredump continue for register state at least. 
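A quick worked example for the shift in fill_files_note() above, using illustrative numbers: the NT_FILE payload starts with two words (count and page size) followed by three user_long_t words per recorded mapping (start, end, file offset in pages), so if mm->map_count estimated 10 VMAs but only count = 7 turn out to be file-backed, then n = 3 and shift_bytes = 3 * 3 * sizeof(user_long_t) = 72 bytes on a 64-bit build, which is exactly how far the filename block is moved down to close the gap left by the unused triples.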
+ */ vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), GFP_KERNEL); - if (ZERO_OR_NULL_PTR(vma_filesz)) + if (!vma_filesz) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2281,12 +2287,12 @@ static int elf_core_dump(struct coredump_params *cprm) shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); if (!shdr4extnum) goto end_coredump; - fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + fill_extnum_info(&elf, shdr4extnum, e_shoff, segs); } offset = dataoff; - if (!dump_emit(cprm, elf, sizeof(*elf))) + if (!dump_emit(cprm, &elf, sizeof(elf))) goto end_coredump; if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) @@ -2370,8 +2376,6 @@ cleanup: kfree(shdr4extnum); kvfree(vma_filesz); kfree(phdr4note); - kfree(elf); -out: return has_dumped; } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 75b6d10c9845..575636f6491e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -7,6 +7,7 @@ config BTRFS_FS select LIBCRC32C select CRYPTO_XXHASH select CRYPTO_SHA256 + select CRYPTO_BLAKE2B select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 82200dbca5ac..9a0ff3384381 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o + block-rsv.o delalloc-space.o block-group.o discard.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6934a5b8708f..404e050ce8ee 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -14,6 +14,8 @@ #include "sysfs.h" #include "tree-log.h" #include "delalloc-space.h" +#include "discard.h" +#include "raid56.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) return extended_to_chunk(flags | allowed); } -static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) { unsigned seq; u64 flags; @@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return btrfs_reduce_alloc_profile(fs_info, flags); } -u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) -{ - return get_alloc_profile(fs_info, orig_flags); -} - void btrfs_get_block_group(struct btrfs_block_group *cache) { atomic_inc(&cache->count); @@ -132,6 +129,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) WARN_ON(cache->reserved > 0); /* + * A block_group shouldn't be on the discard_list anymore. + * Remove the block_group from the discard_list to prevent us + * from causing a panic due to NULL pointer dereference. + */ + if (WARN_ON(!list_empty(&cache->discard_list))) + btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, + cache); + + /* * If not empty, someone is still holding mutex of * full_stripe_lock, which can only be released by caller. 
* And it will definitely cause use-after-free when caller @@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end } else if (extent_start > start && extent_start < end) { size = extent_start - start; total_added += size; - ret = btrfs_add_free_space(block_group, start, - size); + ret = btrfs_add_free_space_async_trimmed(block_group, + start, size); BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { @@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end if (start < end) { size = end - start; total_added += size; - ret = btrfs_add_free_space(block_group, start, size); + ret = btrfs_add_free_space_async_trimmed(block_group, start, + size); BUG_ON(ret); /* -ENOMEM or logic error */ } @@ -1184,22 +1191,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; - u64 sinfo_used; - u64 min_allocable_bytes; int ret = -ENOSPC; - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. - */ - if ((sinfo->flags & - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = SZ_1M; - else - min_allocable_bytes = 0; - spin_lock(&sinfo->lock); spin_lock(&cache->lock); @@ -1211,20 +1204,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) num_bytes = cache->length - cache->reserved - cache->pinned - cache->bytes_super - cache->used; - sinfo_used = btrfs_space_info_used(sinfo, true); /* - * sinfo_used + num_bytes should always <= sinfo->total_bytes. - * - * Here we make sure if we mark this bg RO, we still have enough - * free space as buffer (if min_allocable_bytes is not 0). + * Data never overcommits, even in mixed mode, so do just the straight + * check of left over space in how much we have allocated. */ - if (sinfo_used + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + if (force) { + ret = 0; + } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { + u64 sinfo_used = btrfs_space_info_used(sinfo, true); + + /* + * Here we make sure if we mark this bg RO, we still have enough + * free space as buffer. + */ + if (sinfo_used + num_bytes <= sinfo->total_bytes) + ret = 0; + } else { + /* + * We overcommit metadata, so we need to do the + * btrfs_can_overcommit check here, and we need to pass in + * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of + * leeway to allow us to mark this block group as read only. 
+ */ + if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, + BTRFS_RESERVE_NO_FLUSH)) + ret = 0; + } + + if (!ret) { sinfo->bytes_readonly += num_bytes; cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); - ret = 0; } out: spin_unlock(&cache->lock); @@ -1232,9 +1243,6 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", - sinfo_used, num_bytes, min_allocable_bytes); btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; @@ -1249,6 +1257,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; + const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC); int ret = 0; if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) @@ -1272,10 +1281,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + mutex_lock(&fs_info->delete_unused_bgs_mutex); /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); + + /* + * Async discard moves the final block group discard to be prior + * to the unused_bgs code path. Therefore, if it's not fully + * trimmed, punt it back to the async discard lists. + */ + if (btrfs_test_opt(fs_info, DISCARD_ASYNC) && + !btrfs_is_free_space_trimmed(block_group)) { + trace_btrfs_skip_unused_block_group(block_group); + up_write(&space_info->groups_sem); + /* Requeue if we failed because of async discard */ + btrfs_discard_queue_work(&fs_info->discard_ctl, + block_group); + goto next; + } + spin_lock(&block_group->lock); if (block_group->reserved || block_group->pinned || block_group->used || block_group->ro || @@ -1347,6 +1374,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_info->unused_bg_unpin_mutex); + /* + * At this point, the block_group is read only and should fail + * new allocations. However, btrfs_finish_extent_commit() can + * cause this block_group to be placed back on the discard + * lists because now the block_group isn't fully discarded. + * Bail here and try again later after discarding everything. + */ + spin_lock(&fs_info->discard_ctl.lock); + if (!list_empty(&block_group->discard_list)) { + spin_unlock(&fs_info->discard_ctl.lock); + btrfs_dec_block_group_ro(block_group); + btrfs_discard_queue_work(&fs_info->discard_ctl, + block_group); + goto end_trans; + } + spin_unlock(&fs_info->discard_ctl.lock); + /* Reset pinned so btrfs_put_block_group doesn't complain */ spin_lock(&space_info->lock); spin_lock(&block_group->lock); @@ -1362,8 +1406,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); + /* + * The normal path here is an unused block group is passed here, + * then trimming is handled in the transaction commit path. + * Async discard interposes before this to do the trimming + * before coming down the unused block group path as trimming + * will no longer be done later in the transaction commit path. 
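Condensing the rewritten inc_block_group_ro() decision above into a sketch (illustrative helper, not the kernel function; the real code operates on btrfs_space_info under sinfo->lock and cache->lock): forced callers always succeed, data block groups use a plain space check because data never overcommits, and metadata/system block groups defer to the overcommit check.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative decision helper mirroring the three cases above. */
    static int ro_check(bool force, bool is_data, uint64_t sinfo_used,
                        uint64_t num_bytes, uint64_t total_bytes,
                        bool can_overcommit)
    {
        if (force)
            return 0;                 /* always allowed when forced */
        if (is_data)                  /* data never overcommits */
            return (sinfo_used + num_bytes <= total_bytes) ? 0 : -ENOSPC;
        return can_overcommit ? 0 : -ENOSPC;   /* metadata/system */
    }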
+ */ + if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) + goto flip_async; + /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD); + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC); /* Implicit trim during transaction commit. */ if (trimming) @@ -1406,6 +1460,13 @@ next: spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); + return; + +flip_async: + btrfs_end_transaction(trans); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_put_block_group(block_group); + btrfs_discard_punt_unused_bgs_list(fs_info); } void btrfs_mark_bg_unused(struct btrfs_block_group *bg) @@ -1516,6 +1577,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } +/** + * btrfs_rmap_block - Map a physical disk address to a list of logical addresses + * @chunk_start: logical address of block group + * @physical: physical address to map to logical addresses + * @logical: return array of logical addresses which map to @physical + * @naddrs: length of @logical + * @stripe_len: size of IO stripe for the given block group + * + * Maps a particular @physical disk address to a list of @logical addresses. + * Used primarily to exclude those portions of a block group that contain super + * block copies. + */ +EXPORT_FOR_TESTS +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 data_stripe_length; + u64 io_stripe_size; + int i, nr = 0; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) + return -EIO; + + map = em->map_lookup; + data_stripe_length = em->len; + io_stripe_size = map->stripe_len; + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + data_stripe_length = div_u64(data_stripe_length, + map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + data_stripe_length = div_u64(data_stripe_length, map->num_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + data_stripe_length = div_u64(data_stripe_length, + nr_data_stripes(map)); + io_stripe_size = map->stripe_len * nr_data_stripes(map); + } + + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + bool already_inserted = false; + u64 stripe_nr; + int j; + + if (!in_range(physical, map->stripes[i].physical, + data_stripe_length)) + continue; + + stripe_nr = physical - map->stripes[i].physical; + stripe_nr = div64_u64(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + stripe_nr = div_u64(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + /* + * The remaining case would be for RAID56, multiply by + * nr_data_stripes(). 
Alternatively, just use rmap_len below + * instead of map->stripe_len + */ + + bytenr = chunk_start + stripe_nr * io_stripe_size; + + /* Ensure we don't add duplicate addresses */ + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) { + already_inserted = true; + break; + } + } + + if (!already_inserted) + buf[nr++] = bytenr; + } + + *logical = buf; + *naddrs = nr; + *stripe_len = io_stripe_size; +out: + free_extent_map(em); + return ret; +} + static int exclude_super_stripes(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -1610,6 +1767,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); set_free_space_tree_thresholds(cache); + cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; + atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); @@ -1617,6 +1776,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); @@ -1775,7 +1935,10 @@ static int read_one_block_group(struct btrfs_fs_info *info, inc_block_group_ro(cache, 1); } else if (cache->used == 0) { ASSERT(list_empty(&cache->bg_list)); - btrfs_mark_bg_unused(cache); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&info->discard_ctl, cache); + else + btrfs_mark_bg_unused(cache); } return 0; error: @@ -2077,7 +2240,7 @@ again: } } - ret = inc_block_group_ro(cache, !do_chunk_alloc); + ret = inc_block_group_ro(cache, 0); if (!do_chunk_alloc) goto unlock_out; if (!ret) @@ -2738,8 +2901,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * dirty list to avoid races between cleaner kthread and space * cache writeout. */ - if (!alloc && old_val == 0) - btrfs_mark_bg_unused(cache); + if (!alloc && old_val == 0) { + if (!btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(cache); + } btrfs_put_block_group(cache); total -= num_bytes; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9b409676c4b2..107bb557ca8d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -13,6 +13,19 @@ enum btrfs_disk_cache_state { }; /* + * This describes the state of the block_group for async discard. This is due + * to the two pass nature of it where extent discarding is prioritized over + * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting + * between lists to prevent contention for discard state variables + * (eg. discard_cursor). + */ +enum btrfs_discard_state { + BTRFS_DISCARD_EXTENTS, + BTRFS_DISCARD_BITMAPS, + BTRFS_DISCARD_RESET_CURSOR, +}; + +/* * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to * only allocate a chunk if we really need one. 
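A worked example for the btrfs_rmap_block() stripe arithmetic introduced above, with illustrative numbers: take a RAID0 chunk with map->num_stripes = 2 and map->stripe_len = 64K (so io_stripe_size = 64K). If the requested physical address falls in stripe i = 1 at an offset of 192K from map->stripes[1].physical, then stripe_nr = 192K / 64K = 3, the RAID0 adjustment gives stripe_nr = 3 * 2 + 1 = 7, and the reported logical address is bytenr = chunk_start + 7 * 64K = chunk_start + 448K.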
* @@ -116,7 +129,13 @@ struct btrfs_block_group { /* For read-only block groups */ struct list_head ro_list; + /* For discard operations */ atomic_t trimming; + struct list_head discard_list; + int discard_index; + u64 discard_eligible_time; + u64 discard_cursor; + enum btrfs_discard_state discard_state; /* For dirty block groups */ struct list_head dirty_list; @@ -158,6 +177,22 @@ struct btrfs_block_group { struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; +static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +{ + return (block_group->start + block_group->length); +} + +static inline bool btrfs_is_block_group_data_only( + struct btrfs_block_group *block_group) +{ + /* + * In mixed mode the fragmentation is expected to be high, lowering the + * efficiency, so only proper data block groups are considered. + */ + return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); +} + #ifdef CONFIG_BTRFS_DEBUG static inline int btrfs_should_fragment_free_space( struct btrfs_block_group *block_group) @@ -248,4 +283,9 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) cache->cached == BTRFS_CACHE_ERROR; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len); +#endif + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0b52ab4cb964..a0ce69f2d27c 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *selected_super; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; @@ -637,7 +636,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int ret = 0; int pass; - BUG_ON(NULL == state); selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { pr_info("btrfsic: error, kmalloc failed!\n"); @@ -700,7 +698,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, break; } - num_copies = btrfs_num_copies(fs_info, next_bytenr, + num_copies = btrfs_num_copies(state->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ee834ef7beb4..9ab610cc9114 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -447,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; - bio_associate_blkg_from_css(bio, blkcg_css); + kthread_associate_blkcg(blkcg_css); } refcount_set(&cb->pending_bios, 1); @@ -491,6 +491,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -517,6 +519,9 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } + if (blkcg_css) + kthread_associate_blkcg(NULL); + return 0; } @@ -758,7 +763,7 @@ 
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, - sums); + (u64)-1, sums); BUG_ON(ret); /* -ENOMEM */ } @@ -786,7 +791,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); BUG_ON(ret); /* -ENOMEM */ } @@ -1285,7 +1290,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { bytes = min_t(unsigned long, bvec.bv_len, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, working_bytes); kaddr = kmap_atomic(bvec.bv_page); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b6e86aaf2e1..f2ec1a9bae28 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -326,12 +326,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { write_lock(&fs_info->tree_mod_log_lock); - spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - spin_unlock(&fs_info->tree_mod_seq_lock); write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; @@ -351,7 +349,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); list_del(&elem->list); elem->seq = 0; @@ -362,24 +360,22 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * blocker with lower sequence number exists, we * cannot remove anything from the log */ - spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); return; } min_seq = cur_elem->seq; } } - spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) + if (tm->seq >= min_seq) continue; rb_erase(node, tm_root); kfree(tm); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2e8fd8a8e59..36df977b64d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -101,6 +101,14 @@ struct btrfs_ref; #define BTRFS_MAX_EXTENT_SIZE SZ_128M +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. + */ +#define BTRFS_STAT_NR_ENTRIES 2 +#define BTRFS_STAT_CURR 0 +#define BTRFS_STAT_PREV 1 /* * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size @@ -440,6 +448,36 @@ struct btrfs_full_stripe_locks_tree { struct mutex lock; }; +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. 
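On the BTRFS_STAT_CURR/BTRFS_STAT_PREV macros added to ctree.h above: the pattern they support is keeping a local current/previous pair and folding only the delta into a shared counter. A hypothetical standalone sketch of that bookkeeping (assumed names, not the btrfs helper itself):

    #define STAT_NR_ENTRIES 2
    #define STAT_CURR       0
    #define STAT_PREV       1

    struct local_stats {
        long discardable_extents[STAT_NR_ENTRIES];
    };

    /* Fold the change since the last sync into a shared counter, then
     * remember the new baseline in the PREV slot. */
    static void sync_stat(struct local_stats *s, long *global)
    {
        long delta = s->discardable_extents[STAT_CURR] -
                     s->discardable_extents[STAT_PREV];

        if (delta) {
            *global += delta;
            s->discardable_extents[STAT_PREV] =
                s->discardable_extents[STAT_CURR];
        }
    }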
+ */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + unsigned long delay; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + /* delayed seq elem */ struct seq_list { struct list_head list; @@ -526,6 +564,9 @@ enum { * so we don't need to offload checksums to workqueues. */ BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, }; struct btrfs_fs_info { @@ -673,14 +714,12 @@ struct btrfs_fs_info { atomic_t nr_delayed_iputs; wait_queue_head_t delayed_iputs_wait; - /* this protects tree_mod_seq_list */ - spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; - struct list_head tree_mod_seq_list; - /* this protects tree_mod_log */ + /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; @@ -816,6 +855,8 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_parity_workers; + struct btrfs_discard_ctl discard_ctl; + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif @@ -902,6 +943,11 @@ struct btrfs_fs_info { spinlock_t ref_verify_lock; struct rb_root block_tree; #endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct kobject *discard_debug_kobj; +#endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) @@ -1170,7 +1216,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) -#define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10) #define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) @@ -1189,6 +1235,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) +#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2449,8 +2496,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len); +int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, + u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2787,11 +2834,9 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); + struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t 
btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u8 *dst); -blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, - u64 logical_offset); + u64 offset, u8 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2877,7 +2922,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 end, int create); + u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); @@ -3110,17 +3155,21 @@ do { \ rcu_read_unlock(); \ } while (0) -__cold -static inline void assfail(const char *expr, const char *file, int line) +#ifdef CONFIG_BTRFS_ASSERT +__cold __noreturn +static inline void assertfail(const char *expr, const char *file, int line) { - if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); - } + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); } -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) + +#else +static inline void assertfail(const char *expr, const char* file, int line) { } +#define ASSERT(expr) (void)(expr) +#endif /* * Use that for functions that are conditionally exported for sanity tests but diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index df3bd880061d..dfdb7d4f8406 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct seq_list, list); seq = elem->seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); again: for (node = rb_first_cached(&head->ref_tree); node; @@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) struct seq_list *elem; int ret = 0; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); @@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f639dde2a679..2ca2a09d0e23 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -500,11 +500,8 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - if (ret == -EINPROGRESS) { + if (ret == -EINPROGRESS) ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else if (ret != -ECANCELED) { - WARN_ON(ret); - } return ret; @@ -707,6 +704,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* replace the sysfs entry */ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); + btrfs_sysfs_update_devid(tgt_device); btrfs_rm_dev_replace_free_srcdev(src_device); 
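Note on the ASSERT() rework above: with CONFIG_BTRFS_ASSERT disabled, the macro now expands to (void)(expr) rather than discarding its argument entirely, so the expression is still evaluated and variables referenced only inside ASSERT() do not become unused. As a hypothetical example, ASSERT(refcount_read(&ref) == 0) still performs the refcount_read() in a non-assert build; only the check and the BUG() are compiled out.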
/* write back the superblocks */ diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c new file mode 100644 index 000000000000..5615320fa659 --- /dev/null +++ b/fs/btrfs/discard.c @@ -0,0 +1,702 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/ktime.h> +#include <linux/list.h> +#include <linux/math64.h> +#include <linux/sizes.h> +#include <linux/workqueue.h> +#include "ctree.h" +#include "block-group.h" +#include "discard.h" +#include "free-space-cache.h" + +/* + * This contains the logic to handle async discard. + * + * Async discard manages trimming of free space outside of transaction commit. + * Discarding is done by managing the block_groups on a LRU list based on free + * space recency. Two passes are used to first prioritize discarding extents + * and then allow for trimming in the bitmap the best opportunity to coalesce. + * The block_groups are maintained on multiple lists to allow for multiple + * passes with different discard filter requirements. A delayed work item is + * used to manage discarding with timeout determined by a max of the delay + * incurred by the iops rate limit, the byte rate limit, and the max delay of + * BTRFS_DISCARD_MAX_DELAY. + * + * Note, this only keeps track of block_groups that are explicitly for data. + * Mixed block_groups are not supported. + * + * The first list is special to manage discarding of fully free block groups. + * This is necessary because we issue a final trim for a full free block group + * after forgetting it. When a block group becomes unused, instead of directly + * being added to the unused_bgs list, we add it to this first list. Then + * from there, if it becomes fully discarded, we place it onto the unused_bgs + * list. + * + * The in-memory free space cache serves as the backing state for discard. + * Consequently this means there is no persistence. We opt to load all the + * block groups in as not discarded, so the mount case degenerates to the + * crashing case. + * + * As the free space cache uses bitmaps, there exists a tradeoff between + * ease/efficiency for find_free_extent() and the accuracy of discard state. + * Here we opt to let untrimmed regions merge with everything while only letting + * trimmed regions merge with other trimmed regions. This can cause + * overtrimming, but the coalescing benefit seems to be worth it. Additionally, + * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, + * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, + * this resets the state and we will retry trimming the whole bitmap. This is a + * tradeoff between discard state accuracy and the cost of accounting. 
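As a quick summary of the per-block-group state machine the design comment above relies on, here is a paraphrased sketch of enum btrfs_discard_state from block-group.h (the actual transition points are in peek_discard_list() and the discard work function):

    /* Sketch only; names renamed to mark this as an illustration. */
    enum discard_state_sketch {
        SKETCH_DISCARD_EXTENTS,      /* pass 1: trim free-space extents */
        SKETCH_DISCARD_BITMAPS,      /* pass 2: trim free-space bitmaps */
        SKETCH_DISCARD_RESET_CURSOR, /* set when moving between lists; the next
                                      * pick resets discard_cursor to
                                      * block_group->start and restarts with
                                      * the extents pass */
    };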
+ */ + +/* This is an initial delay to give some chance for block reuse */ +#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) +#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) + +/* Target completion latency of discarding all discardable extents */ +#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC) +#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) +#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) +#define BTRFS_DISCARD_MAX_IOPS (10U) + +/* Montonically decreasing minimum length filters after index 0 */ +static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { + 0, + BTRFS_ASYNC_DISCARD_MAX_FILTER, + BTRFS_ASYNC_DISCARD_MIN_FILTER +}; + +static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + return &discard_ctl->discard_list[block_group->discard_index]; +} + +static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!btrfs_run_discard_work(discard_ctl)) + return; + + if (list_empty(&block_group->discard_list) || + block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) + block_group->discard_index = BTRFS_DISCARD_INDEX_START; + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + } + + list_move_tail(&block_group->discard_list, + get_discard_list(discard_ctl, block_group)); +} + +static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!btrfs_is_block_group_data_only(block_group)) + return; + + spin_lock(&discard_ctl->lock); + __add_to_discard_list(discard_ctl, block_group); + spin_unlock(&discard_ctl->lock); +} + +static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + spin_lock(&discard_ctl->lock); + + if (!btrfs_run_discard_work(discard_ctl)) { + spin_unlock(&discard_ctl->lock); + return; + } + + list_del_init(&block_group->discard_list); + + block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED; + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_UNUSED_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + list_add_tail(&block_group->discard_list, + &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); + + spin_unlock(&discard_ctl->lock); +} + +static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + bool running = false; + + spin_lock(&discard_ctl->lock); + + if (block_group == discard_ctl->block_group) { + running = true; + discard_ctl->block_group = NULL; + } + + block_group->discard_eligible_time = 0; + list_del_init(&block_group->discard_list); + + spin_unlock(&discard_ctl->lock); + + return running; +} + +/** + * find_next_block_group - find block_group that's up next for discarding + * @discard_ctl: discard control + * @now: current time + * + * Iterate over the discard lists to find the next block_group up for + * discarding checking the discard_eligible_time of block_group. 
+ */ +static struct btrfs_block_group *find_next_block_group( + struct btrfs_discard_ctl *discard_ctl, + u64 now) +{ + struct btrfs_block_group *ret_block_group = NULL, *block_group; + int i; + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { + struct list_head *discard_list = &discard_ctl->discard_list[i]; + + if (!list_empty(discard_list)) { + block_group = list_first_entry(discard_list, + struct btrfs_block_group, + discard_list); + + if (!ret_block_group) + ret_block_group = block_group; + + if (ret_block_group->discard_eligible_time < now) + break; + + if (ret_block_group->discard_eligible_time > + block_group->discard_eligible_time) + ret_block_group = block_group; + } + } + + return ret_block_group; +} + +/** + * peek_discard_list - wrap find_next_block_group() + * @discard_ctl: discard control + * @discard_state: the discard_state of the block_group after state management + * @discard_index: the discard_index of the block_group after state management + * + * This wraps find_next_block_group() and sets the block_group to be in use. + * discard_state's control flow is managed here. Variables related to + * discard_state are reset here as needed (eg discard_cursor). @discard_state + * and @discard_index are remembered as it may change while we're discarding, + * but we want the discard to execute in the context determined here. + */ +static struct btrfs_block_group *peek_discard_list( + struct btrfs_discard_ctl *discard_ctl, + enum btrfs_discard_state *discard_state, + int *discard_index) +{ + struct btrfs_block_group *block_group; + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); +again: + block_group = find_next_block_group(discard_ctl, now); + + if (block_group && now > block_group->discard_eligible_time) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && + block_group->used != 0) { + if (btrfs_is_block_group_data_only(block_group)) + __add_to_discard_list(discard_ctl, block_group); + else + list_del_init(&block_group->discard_list); + goto again; + } + if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { + block_group->discard_cursor = block_group->start; + block_group->discard_state = BTRFS_DISCARD_EXTENTS; + } + discard_ctl->block_group = block_group; + *discard_state = block_group->discard_state; + *discard_index = block_group->discard_index; + } else { + block_group = NULL; + } + + spin_unlock(&discard_ctl->lock); + + return block_group; +} + +/** + * btrfs_discard_check_filter - updates a block groups filters + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing + * + * Async discard maintains multiple lists with progressively smaller filters + * to prioritize discarding based on size. Should a free space that matches + * a larger filter be returned to the free_space_cache, prioritize that discard + * by moving @block_group to the proper filter. 
+ */ +void btrfs_discard_check_filter(struct btrfs_block_group *block_group, + u64 bytes) +{ + struct btrfs_discard_ctl *discard_ctl; + + if (!block_group || + !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + return; + + discard_ctl = &block_group->fs_info->discard_ctl; + + if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && + bytes >= discard_minlen[block_group->discard_index - 1]) { + int i; + + remove_from_discard_list(discard_ctl, block_group); + + for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; + i++) { + if (bytes >= discard_minlen[i]) { + block_group->discard_index = i; + add_to_discard_list(discard_ctl, block_group); + break; + } + } + } +} + +/** + * btrfs_update_discard_index - moves a block group along the discard lists + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * Increment @block_group's discard_index. If it falls of the list, let it be. + * Otherwise add it back to the appropriate list. + */ +static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + block_group->discard_index++; + if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { + block_group->discard_index = 1; + return; + } + + add_to_discard_list(discard_ctl, block_group); +} + +/** + * btrfs_discard_cancel_work - remove a block_group from the discard lists + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This removes @block_group from the discard lists. If necessary, it waits on + * the current work and then reschedules the delayed work. + */ +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (remove_from_discard_list(discard_ctl, block_group)) { + cancel_delayed_work_sync(&discard_ctl->work); + btrfs_discard_schedule_work(discard_ctl, true); + } +} + +/** + * btrfs_discard_queue_work - handles queuing the block_groups + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This maintains the LRU order of the discard lists. + */ +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + return; + + if (block_group->used == 0) + add_to_discard_unused_list(discard_ctl, block_group); + else + add_to_discard_list(discard_ctl, block_group); + + if (!delayed_work_pending(&discard_ctl->work)) + btrfs_discard_schedule_work(discard_ctl, false); +} + +/** + * btrfs_discard_schedule_work - responsible for scheduling the discard work + * @discard_ctl: discard control + * @override: override the current timer + * + * Discards are issued by a delayed workqueue item. @override is used to + * update the current delay as the baseline delay interval is reevaluated on + * transaction commit. This is also maxed with any other rate limit. 
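/*
 * Illustrative sketch, separate from the patch hunks above, of the size to
 * list mapping used by btrfs_discard_check_filter(): with monotonically
 * decreasing minimum length filters, a freed region is (re)filed on the first
 * list whose filter it satisfies. The filter values mirror the constants used
 * by this file; the helper name is hypothetical, and regions smaller than the
 * smallest filter are simply left on the last list here.
 */
#include <stdint.h>

#define SKETCH_NR_DISCARD_LISTS		3
#define SKETCH_DISCARD_INDEX_START	1	/* index 0 is reserved for unused block groups */

static const uint64_t sketch_minlen[SKETCH_NR_DISCARD_LISTS] = {
	0,			/* unused block groups: no size filter */
	1024 * 1024,		/* regions of at least 1MiB */
	32 * 1024,		/* regions of at least 32KiB */
};

static int sketch_filter_index(uint64_t freed_bytes)
{
	int i;

	for (i = SKETCH_DISCARD_INDEX_START; i < SKETCH_NR_DISCARD_LISTS; i++) {
		if (freed_bytes >= sketch_minlen[i])
			return i;
	}
	return SKETCH_NR_DISCARD_LISTS - 1;
}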
+ */ +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override) +{ + struct btrfs_block_group *block_group; + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); + + if (!btrfs_run_discard_work(discard_ctl)) + goto out; + + if (!override && delayed_work_pending(&discard_ctl->work)) + goto out; + + block_group = find_next_block_group(discard_ctl, now); + if (block_group) { + unsigned long delay = discard_ctl->delay; + u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); + + /* + * A single delayed workqueue item is responsible for + * discarding, so we can manage the bytes rate limit by keeping + * track of the previous discard. + */ + if (kbps_limit && discard_ctl->prev_discard) { + u64 bps_limit = ((u64)kbps_limit) * SZ_1K; + u64 bps_delay = div64_u64(discard_ctl->prev_discard * + MSEC_PER_SEC, bps_limit); + + delay = max(delay, msecs_to_jiffies(bps_delay)); + } + + /* + * This timeout is to hopefully prevent immediate discarding + * in a recently allocated block group. + */ + if (now < block_group->discard_eligible_time) { + u64 bg_timeout = block_group->discard_eligible_time - now; + + delay = max(delay, nsecs_to_jiffies(bg_timeout)); + } + + mod_delayed_work(discard_ctl->discard_workers, + &discard_ctl->work, delay); + } +out: + spin_unlock(&discard_ctl->lock); +} + +/** + * btrfs_finish_discard_pass - determine next step of a block_group + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This determines the next step for a block group after it's finished going + * through a pass on a discard list. If it is unused and fully trimmed, we can + * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto + * the appropriate filter list or let it fall off. + */ +static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + remove_from_discard_list(discard_ctl, block_group); + + if (block_group->used == 0) { + if (btrfs_is_free_space_trimmed(block_group)) + btrfs_mark_bg_unused(block_group); + else + add_to_discard_unused_list(discard_ctl, block_group); + } else { + btrfs_update_discard_index(discard_ctl, block_group); + } +} + +/** + * btrfs_discard_workfn - discard work function + * @work: work + * + * This finds the next block_group to start discarding and then discards a + * single region. It does this in a two-pass fashion: first extents and second + * bitmaps. Completely discarded block groups are sent to the unused_bgs path. + */ +static void btrfs_discard_workfn(struct work_struct *work) +{ + struct btrfs_discard_ctl *discard_ctl; + struct btrfs_block_group *block_group; + enum btrfs_discard_state discard_state; + int discard_index = 0; + u64 trimmed = 0; + u64 minlen = 0; + + discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); + + block_group = peek_discard_list(discard_ctl, &discard_state, + &discard_index); + if (!block_group || !btrfs_run_discard_work(discard_ctl)) + return; + + /* Perform discarding */ + minlen = discard_minlen[discard_index]; + + if (discard_state == BTRFS_DISCARD_BITMAPS) { + u64 maxlen = 0; + + /* + * Use the previous levels minimum discard length as the max + * length filter. In the case something is added to make a + * region go beyond the max filter, the entire bitmap is set + * back to BTRFS_TRIM_STATE_UNTRIMMED. 
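/*
 * Illustrative sketch, separate from the patch hunks above, of the delay
 * arithmetic in btrfs_discard_schedule_work(): the optional kbps limit is
 * turned into a per-iteration delay based on how much the previous iteration
 * discarded, and the result is never shorter than the base delay or the time
 * left until the block group becomes eligible. Millisecond arithmetic and a
 * hypothetical helper name.
 */
#include <stdint.h>

static uint64_t sketch_discard_delay_msec(uint64_t base_delay_msec,
					  uint64_t prev_discard_bytes,
					  uint64_t kbps_limit,
					  uint64_t msec_until_eligible)
{
	uint64_t delay = base_delay_msec;

	if (kbps_limit && prev_discard_bytes) {
		uint64_t bps_limit = kbps_limit * 1024;
		/* smallest delay keeping prev_discard_bytes within bps_limit */
		uint64_t bps_delay = prev_discard_bytes * 1000 / bps_limit;

		if (bps_delay > delay)
			delay = bps_delay;
	}

	/* never fire before the chosen block group is actually eligible */
	if (msec_until_eligible > delay)
		delay = msec_until_eligible;

	return delay;
}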
+ */ + if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) + maxlen = discard_minlen[discard_index - 1]; + + btrfs_trim_block_group_bitmaps(block_group, &trimmed, + block_group->discard_cursor, + btrfs_block_group_end(block_group), + minlen, maxlen, true); + discard_ctl->discard_bitmap_bytes += trimmed; + } else { + btrfs_trim_block_group_extents(block_group, &trimmed, + block_group->discard_cursor, + btrfs_block_group_end(block_group), + minlen, true); + discard_ctl->discard_extent_bytes += trimmed; + } + + discard_ctl->prev_discard = trimmed; + + /* Determine next steps for a block_group */ + if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { + if (discard_state == BTRFS_DISCARD_BITMAPS) { + btrfs_finish_discard_pass(discard_ctl, block_group); + } else { + block_group->discard_cursor = block_group->start; + spin_lock(&discard_ctl->lock); + if (block_group->discard_state != + BTRFS_DISCARD_RESET_CURSOR) + block_group->discard_state = + BTRFS_DISCARD_BITMAPS; + spin_unlock(&discard_ctl->lock); + } + } + + spin_lock(&discard_ctl->lock); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); + + btrfs_discard_schedule_work(discard_ctl, false); +} + +/** + * btrfs_run_discard_work - determines if async discard should be running + * @discard_ctl: discard control + * + * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + */ +bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_fs_info *fs_info = container_of(discard_ctl, + struct btrfs_fs_info, + discard_ctl); + + return (!(fs_info->sb->s_flags & SB_RDONLY) && + test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); +} + +/** + * btrfs_discard_calc_delay - recalculate the base delay + * @discard_ctl: discard control + * + * Recalculate the base delay which is based off the total number of + * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms) + * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC). + */ +void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) +{ + s32 discardable_extents; + s64 discardable_bytes; + u32 iops_limit; + unsigned long delay; + unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC; + + discardable_extents = atomic_read(&discard_ctl->discardable_extents); + if (!discardable_extents) + return; + + spin_lock(&discard_ctl->lock); + + /* + * The following is to fix a potential -1 discrepenancy that we're not + * sure how to reproduce. But given that this is the only place that + * utilizes these numbers and this is only called by from + * btrfs_finish_extent_commit() which is synchronized, we can correct + * here. 
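/*
 * Illustrative sketch, separate from the patch hunks above, of the base delay
 * recalculation that btrfs_discard_calc_delay() performs: spread the target
 * completion latency over the number of discardable extents, then clamp the
 * result between the iops-derived lower limit and the one second upper limit.
 * Hypothetical name, millisecond arithmetic, and zero extents treated as the
 * maximum delay for simplicity.
 */
#include <stdint.h>

#define SKETCH_TARGET_MSEC	(6 * 60 * 60UL * 1000UL)	/* 6 hours */
#define SKETCH_MIN_DELAY_MSEC	1UL
#define SKETCH_MAX_DELAY_MSEC	1000UL

static unsigned long sketch_discard_base_delay(unsigned int discardable_extents,
					       unsigned int iops_limit)
{
	unsigned long lower = SKETCH_MIN_DELAY_MSEC;
	unsigned long delay;

	if (!discardable_extents)
		return SKETCH_MAX_DELAY_MSEC;

	if (iops_limit && 1000UL / iops_limit > lower)
		lower = 1000UL / iops_limit;	/* e.g. 10 iops gives a 100ms floor */

	delay = SKETCH_TARGET_MSEC / discardable_extents;
	if (delay < lower)
		delay = lower;
	if (delay > SKETCH_MAX_DELAY_MSEC)
		delay = SKETCH_MAX_DELAY_MSEC;

	return delay;
}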
+ */ + if (discardable_extents < 0) + atomic_add(-discardable_extents, + &discard_ctl->discardable_extents); + + discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes); + if (discardable_bytes < 0) + atomic64_add(-discardable_bytes, + &discard_ctl->discardable_bytes); + + if (discardable_extents <= 0) { + spin_unlock(&discard_ctl->lock); + return; + } + + iops_limit = READ_ONCE(discard_ctl->iops_limit); + if (iops_limit) + lower_limit = max_t(unsigned long, lower_limit, + MSEC_PER_SEC / iops_limit); + + delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; + delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC); + discard_ctl->delay = msecs_to_jiffies(delay); + + spin_unlock(&discard_ctl->lock); +} + +/** + * btrfs_discard_update_discardable - propagate discard counters + * @block_group: block_group of interest + * @ctl: free_space_ctl of @block_group + * + * This propagates deltas of counters up to the discard_ctl. It maintains a + * current counter and a previous counter passing the delta up to the global + * stat. Then the current counter value becomes the previous counter value. + */ +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_discard_ctl *discard_ctl; + s32 extents_delta; + s64 bytes_delta; + + if (!block_group || + !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || + !btrfs_is_block_group_data_only(block_group)) + return; + + discard_ctl = &block_group->fs_info->discard_ctl; + + extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - + ctl->discardable_extents[BTRFS_STAT_PREV]; + if (extents_delta) { + atomic_add(extents_delta, &discard_ctl->discardable_extents); + ctl->discardable_extents[BTRFS_STAT_PREV] = + ctl->discardable_extents[BTRFS_STAT_CURR]; + } + + bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - + ctl->discardable_bytes[BTRFS_STAT_PREV]; + if (bytes_delta) { + atomic64_add(bytes_delta, &discard_ctl->discardable_bytes); + ctl->discardable_bytes[BTRFS_STAT_PREV] = + ctl->discardable_bytes[BTRFS_STAT_CURR]; + } +} + +/** + * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists + * @fs_info: fs_info of interest + * + * The unused_bgs list needs to be punted to the discard lists because the + * order of operations is changed. In the normal sychronous discard path, the + * block groups are trimmed via a single large trim in transaction commit. This + * is ultimately what we are trying to avoid with asynchronous discard. Thus, + * it must be done before going down the unused_bgs path. + */ +void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group, *next; + + spin_lock(&fs_info->unused_bgs_lock); + /* We enabled async discard, so punt all to the queue */ + list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, + bg_list) { + list_del_init(&block_group->bg_list); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +/** + * btrfs_discard_purge_list - purge discard lists + * @discard_ctl: discard control + * + * If we are disabling async discard, we may have intercepted block groups that + * are completely free and ready for the unused_bgs path. As discarding will + * now happen in transaction commit or not at all, we can safely mark the + * corresponding block groups as unused and they will be sent on their merry + * way to the unused_bgs list. 
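/*
 * Illustrative sketch, separate from the patch hunks above, of the delta
 * propagation used by btrfs_discard_update_discardable(): each free space ctl
 * keeps a current and a previous value, only the difference is pushed into
 * the global counter, and current then becomes the new previous. Plain
 * integers stand in for the kernel atomics; names are hypothetical.
 */
#include <stdint.h>

enum { SKETCH_STAT_CURR, SKETCH_STAT_PREV, SKETCH_NR_STATS };

struct sketch_space_ctl {
	int64_t discardable_bytes[SKETCH_NR_STATS];
};

static void sketch_propagate_delta(struct sketch_space_ctl *ctl, int64_t *global_bytes)
{
	int64_t delta = ctl->discardable_bytes[SKETCH_STAT_CURR] -
			ctl->discardable_bytes[SKETCH_STAT_PREV];

	if (delta) {
		*global_bytes += delta;
		ctl->discardable_bytes[SKETCH_STAT_PREV] =
			ctl->discardable_bytes[SKETCH_STAT_CURR];
	}
}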
+ */ +static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_block_group *block_group, *next; + int i; + + spin_lock(&discard_ctl->lock); + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { + list_for_each_entry_safe(block_group, next, + &discard_ctl->discard_list[i], + discard_list) { + list_del_init(&block_group->discard_list); + spin_unlock(&discard_ctl->lock); + if (block_group->used == 0) + btrfs_mark_bg_unused(block_group); + spin_lock(&discard_ctl->lock); + } + } + spin_unlock(&discard_ctl->lock); +} + +void btrfs_discard_resume(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_cleanup(fs_info); + return; + } + + btrfs_discard_punt_unused_bgs_list(fs_info); + + set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_stop(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_init(struct btrfs_fs_info *fs_info) +{ + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + int i; + + spin_lock_init(&discard_ctl->lock); + INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) + INIT_LIST_HEAD(&discard_ctl->discard_list[i]); + + discard_ctl->prev_discard = 0; + atomic_set(&discard_ctl->discardable_extents, 0); + atomic64_set(&discard_ctl->discardable_bytes, 0); + discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; + discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC; + discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; + discard_ctl->kbps_limit = 0; + discard_ctl->discard_extent_bytes = 0; + discard_ctl->discard_bitmap_bytes = 0; + atomic64_set(&discard_ctl->discard_bytes_saved, 0); +} + +void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) +{ + btrfs_discard_stop(fs_info); + cancel_delayed_work_sync(&fs_info->discard_ctl.work); + btrfs_discard_purge_list(&fs_info->discard_ctl); +} diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h new file mode 100644 index 000000000000..21a15776dac4 --- /dev/null +++ b/fs/btrfs/discard.h @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef BTRFS_DISCARD_H +#define BTRFS_DISCARD_H + +#include <linux/sizes.h> + +struct btrfs_fs_info; +struct btrfs_discard_ctl; +struct btrfs_block_group; + +/* Discard size limits */ +#define BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE (SZ_64M) +#define BTRFS_ASYNC_DISCARD_MAX_FILTER (SZ_1M) +#define BTRFS_ASYNC_DISCARD_MIN_FILTER (SZ_32K) + +/* List operations */ +void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes); + +/* Work operations */ +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override); +bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); + +/* Update operations */ +void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl); + +/* Setup/cleanup operations */ +void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info); +void btrfs_discard_resume(struct btrfs_fs_info *fs_info); +void btrfs_discard_stop(struct btrfs_fs_info *fs_info); +void btrfs_discard_init(struct btrfs_fs_info *fs_info); +void btrfs_discard_cleanup(struct 
btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0edfdc9c82b..7fa9bb79ad08 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -41,6 +41,7 @@ #include "tree-checker.h" #include "ref-verify.h" #include "block-group.h" +#include "discard.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -202,8 +203,8 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * that covers the entire device */ struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1953,6 +1954,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + if (fs_info->discard_ctl.discard_workers) + destroy_workqueue(fs_info->discard_ctl.discard_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work @@ -2148,6 +2151,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); + fs_info->discard_ctl.discard_workers = + alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2158,7 +2163,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->qgroup_rescan_workers)) { + fs_info->qgroup_rescan_workers && + fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } @@ -2691,7 +2697,6 @@ int __cold open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); @@ -2792,6 +2797,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_init_dev_replace_locks(fs_info); btrfs_init_qgroup(fs_info); + btrfs_discard_init(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -3082,20 +3088,13 @@ int __cold open_ctree(struct super_block *sb, btrfs_free_extra_devids(fs_devices, 1); - ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", ret); goto fail_block_groups; } - ret = btrfs_sysfs_add_device(fs_devices); - if (ret) { - btrfs_err(fs_info, "failed to init sysfs device interface: %d", - ret); - goto fail_fsdev_sysfs; - } - ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); @@ -3262,6 +3261,7 @@ int __cold open_ctree(struct super_block *sb, } btrfs_qgroup_rescan_resume(fs_info); + btrfs_discard_resume(fs_info); if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); @@ -3978,6 +3978,9 @@ void __cold close_ctree(struct 
btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); + /* Cancel or finish ongoing discard work */ + btrfs_discard_cleanup(fs_info); + if (!sb_rdonly(fs_info->sb)) { /* * The cleaner kthread is stopped, so do one final pass over @@ -4026,11 +4029,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); - btrfs_free_block_groups(fs_info); - clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); + /* + * We must free the block groups after dropping the fs_roots as we could + * have had an IO error and have left over tree log blocks that aren't + * cleaned up until the fs roots are freed. This makes the block group + * accounting appear to be wrong because there's pending reserved bytes, + * so make sure we do the block group cleanup afterwards. + */ + btrfs_free_block_groups(fs_info); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 76f123ebb292..8c2d6cf1ce59 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -134,8 +134,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 153f71a5bba9..0163fdd59f8f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,6 +32,7 @@ #include "block-rsv.h" #include "delalloc-space.h" #include "block-group.h" +#include "discard.h" #undef SCRAMBLE_DELAYED_REFS @@ -1869,8 +1870,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, head->bytenr, - head->num_bytes); + ret = btrfs_del_csums(trans, fs_info->csum_root, + head->bytenr, head->num_bytes); } } @@ -2923,7 +2924,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) break; } - if (btrfs_test_opt(fs_info, DISCARD)) + if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); @@ -2934,6 +2935,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) cond_resched(); } + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_calc_delay(&fs_info->discard_ctl); + btrfs_discard_schedule_work(&fs_info->discard_ctl, true); + } + /* * Transaction is finished. We don't need the lock anymore. 
We * do need to clean up the block groups in case of a transaction @@ -3175,7 +3181,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info, bytenr, num_bytes); + ret = btrfs_del_csums(trans, info->csum_root, bytenr, + num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3437,7 +3444,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, */ struct find_free_extent_ctl { /* Basic allocation info */ - u64 ram_bytes; u64 num_bytes; u64 empty_size; u64 flags; @@ -3799,6 +3805,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; + int cache_block_group_error = 0; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; @@ -3808,7 +3815,6 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, WARN_ON(num_bytes < fs_info->sectorsize); - ffe_ctl.ram_bytes = ram_bytes; ffe_ctl.num_bytes = num_bytes; ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; @@ -3958,7 +3964,20 @@ have_block_group: if (unlikely(!ffe_ctl.cached)) { ffe_ctl.have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); - BUG_ON(ret < 0); + + /* + * If we get ENOMEM here or something else we want to + * try other block groups, because it may not be fatal. + * However if we can't find anything else we need to + * save our return here so that we return the actual + * error that caused problems, not ENOSPC. + */ + if (ret < 0) { + if (!cache_block_group_error) + cache_block_group_error = ret; + ret = 0; + goto loop; + } ret = 0; } @@ -4045,7 +4064,7 @@ loop: if (ret > 0) goto search; - if (ret == -ENOSPC) { + if (ret == -ENOSPC && !cache_block_group_error) { /* * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. 
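/*
 * Illustrative sketch, separate from the patch hunks above, of the error
 * handling pattern added to find_free_extent(): remember the first real
 * failure while continuing to try other candidates, and only fall back to
 * -ENOSPC when nothing failed for a more specific reason. The candidate loop
 * and names are hypothetical.
 */
#include <errno.h>

static int sketch_try_candidates(int (*try_one)(int idx), int nr_candidates)
{
	int first_error = 0;
	int i;

	for (i = 0; i < nr_candidates; i++) {
		int ret = try_one(i);

		if (ret == 0)
			return 0;			/* success */
		if (ret != -ENOSPC && !first_error)
			first_error = ret;		/* keep the root cause, keep trying */
	}

	return first_error ? first_error : -ENOSPC;
}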
@@ -4056,6 +4075,8 @@ loop: space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); ins->offset = ffe_ctl.max_extent_size; + } else if (ret == -ENOSPC) { + ret = cache_block_group_error; } return ret; } @@ -4148,12 +4169,10 @@ again: return ret; } -static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, - int pin, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc) { struct btrfs_block_group *cache; - int ret = 0; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { @@ -4162,30 +4181,28 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - if (pin) - pin_down_extent(cache, start, len, 1); - else { - if (btrfs_test_opt(fs_info, DISCARD)) - ret = btrfs_discard_extent(fs_info, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); - trace_btrfs_reserved_extent_free(fs_info, start, len); - } + btrfs_add_free_space(cache, start, len); + btrfs_free_reserved_bytes(cache, len, delalloc); + trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); - return ret; + return 0; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { - return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); -} + struct btrfs_block_group *cache; + int ret = 0; -int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len) -{ - return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return -ENOSPC; + } + + ret = pin_down_extent(cache, start, len, 1); + btrfs_put_block_group(cache); + return ret; } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb8bd0258360..c0f202741e09 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1593,21 +1593,25 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { node = __etree_search(tree, start, &next, &prev, NULL, NULL); - if (!node) { + if (!node && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!node && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } else if (!node) { node = next; - if (!node) { - /* - * We are past the last allocated chunk, - * set start at the end of the last extent. The - * device alloc tree should never be empty so - * prev is always set. 
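/*
 * Illustrative sketch, separate from the patch hunks above, of the two
 * boundary cases that find_first_clear_extent_bit() now handles explicitly: a
 * completely empty tree and a start position past the last allocated range. A
 * sorted array of inclusive [start, end] pairs stands in for the extent state
 * tree; names are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

struct sketch_range {
	uint64_t start;
	uint64_t end;		/* inclusive */
};

/* Returns 1 when a boundary case applies and the clear range has been filled in. */
static int sketch_first_clear_boundary(const struct sketch_range *ranges, size_t nr,
				       uint64_t from, uint64_t *start_ret,
				       uint64_t *end_ret)
{
	if (nr == 0) {
		/* Nothing allocated at all: report the whole space as clear. */
		*start_ret = 0;
		*end_ret = (uint64_t)-1;
		return 1;
	}
	if (from > ranges[nr - 1].end) {
		/* Past the last allocated range: clear from its end onwards. */
		*start_ret = ranges[nr - 1].end + 1;
		*end_ret = (uint64_t)-1;
		return 1;
	}
	return 0;	/* fall back to the regular in-tree search */
}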
- */ - ASSERT(prev); - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } } /* * At this point 'node' either contains 'start' or start is @@ -3043,7 +3047,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); + em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3438,11 +3442,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, ret = btrfs_writepage_cow_fixup(page, start, page_end); if (ret) { /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); - + redirty_page_for_writepage(wbc, page); update_nr_written(wbc, nr_written); unlock_page(page); return 1; @@ -3455,11 +3455,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, update_nr_written(wbc, nr_written + 1); end = page_end; - if (i_size <= start) { - btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); - goto done; - } - blocksize = inode->i_sb->s_blocksize; while (cur <= end) { @@ -3471,8 +3466,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, 1); break; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, - end - cur + 1, 1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur, + end - cur + 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); @@ -3497,22 +3492,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - /* - * end_io notification does not happen here for - * compressed extents - */ - if (!compressed) - btrfs_writepage_endio_finish_ordered(page, cur, - cur + iosize - 1, - 1); - else if (compressed) { - /* we don't want to end_page_writeback on - * a compressed extent. 
this happens - * elsewhere - */ + if (compressed) nr++; - } - + else + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, 1); cur += iosize; pg_offset += iosize; continue; @@ -3540,7 +3524,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, pg_offset += iosize; nr++; } -done: *nr_ret = nr; return ret; } @@ -3562,7 +3545,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 page_end = start + PAGE_SIZE - 1; int ret; int nr = 0; - size_t pg_offset = 0; + size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; unsigned long nr_written = 0; @@ -3591,14 +3574,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, flush_dcache_page(page); } - pg_offset = 0; - set_page_extent_mapped(page); if (!epd->extent_locked) { ret = writepage_delalloc(inode, page, wbc, start, &nr_written); if (ret == 1) - goto done_unlocked; + return 0; if (ret) goto done; } @@ -3606,7 +3587,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ret = __extent_writepage_io(inode, page, wbc, epd, i_size, nr_written, &nr); if (ret == 1) - goto done_unlocked; + return 0; done: if (nr == 0) { @@ -3621,9 +3602,6 @@ done: unlock_page(page); ASSERT(ret <= 0); return ret; - -done_unlocked: - return 0; } void wait_on_extent_buffer_writeback(struct extent_buffer *eb) @@ -3941,6 +3919,11 @@ int btree_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -3958,7 +3941,6 @@ retry: tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -4087,6 +4069,11 @@ static int extent_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -4120,7 +4107,6 @@ retry: &index, end, tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -4180,7 +4166,16 @@ retry: */ scanned = 1; index = 0; - goto retry; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. 
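/*
 * Illustrative sketch, separate from the patch hunks above, of the wrap-around
 * policy in the write cache page walkers: a cyclic walk that starts at offset
 * zero has nothing behind it to wrap back to, and when a walk does wrap, the
 * pending bio is flushed first so that a page locked by another writer cannot
 * deadlock against writeback of a page already in that bio. The callbacks and
 * names are hypothetical.
 */
#include <stdbool.h>

struct sketch_walk {
	unsigned long index;		/* first page offset to look at */
	bool range_cyclic;
};

static int sketch_write_pages(struct sketch_walk *w,
			      int (*write_range)(unsigned long first),
			      int (*flush_bio)(void))
{
	bool scanned = !w->range_cyclic || w->index == 0;
	int ret;

retry:
	ret = write_range(w->index);
	if (ret)
		return ret;

	if (!scanned) {
		scanned = true;
		w->index = 0;
		ret = flush_bio();
		if (!ret)
			goto retry;	/* second pass over the skipped front */
	}
	return ret;
}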
+ */ + ret = flush_write_bio(epd); + if (!ret) + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -5074,12 +5069,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_SHIFT, eb); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a8551a1f56e2..5d205bbaafdc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -183,10 +183,8 @@ static inline int extent_compress_type(unsigned long bio_flags) struct extent_map_tree; typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, - u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3270a40b0777..c2f365662d55 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -148,8 +148,19 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 logical_offset, u8 *dst, int dio) +/** + * btrfs_lookup_bio_sums - Look up checksums for a bio. + * @inode: inode that the bio is for. + * @bio: bio embedded in btrfs_io_bio. + * @offset: Unless (u64)-1, look up checksums for this offset in the file. + * If (u64)-1, use the page offsets from the bio instead. + * @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If + * NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead. + * + * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. 
+ */ +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u64 offset, u8 *dst) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -158,8 +169,8 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; + const bool page_offsets = (offset == (u64)-1); u8 *csum; - u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -205,15 +216,13 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio } disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; - if (dio) - offset = logical_offset; bio_for_each_segment(bvec, bio, iter) { page_bytes_left = bvec.bv_len; if (count) goto next; - if (!dio) + if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, csum, nblocks); @@ -274,7 +283,8 @@ found: csum += count * csum_size; nblocks -= count; next: - while (count--) { + while (count > 0) { + count--; disk_bytenr += fs_info->sectorsize; offset += fs_info->sectorsize; page_bytes_left -= fs_info->sectorsize; @@ -285,18 +295,7 @@ next: WARN_ON_ONCE(count); btrfs_free_path(path); - return 0; -} - -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u8 *dst) -{ - return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); -} - -blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) -{ - return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); + return BLK_STS_OK; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -483,8 +482,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, - 1); for (i = 0; i < nr_sectors; i++) { - if (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset) { + if (offset >= ordered->file_offset + ordered->num_bytes || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; @@ -590,9 +589,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, * range of bytes. 
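/*
 * Illustrative sketch, separate from the patch hunks above, of why the
 * checksum walk was changed from "while (count--)" to "while (count > 0)
 * { count--; ... }": the post-decrement form always leaves the counter at -1
 * (or at the type's maximum value for unsigned counters) after the loop,
 * which defeats a later "must be zero now" check such as WARN_ON_ONCE(count).
 */
#include <assert.h>

static void sketch_counter_postcondition(void)
{
	int a = 3, b = 3;

	while (a--)
		;		/* body runs three times, a ends up at -1 */

	while (b > 0) {
		b--;		/* body runs three times, b ends up at 0 */
	}

	assert(a == -1);
	assert(b == 0);
}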
*/ int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) + struct btrfs_root *root, u64 bytenr, u64 len) { - struct btrfs_root *root = fs_info->csum_root; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; struct btrfs_key key; u64 end_byte = bytenr + len; @@ -602,6 +601,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int blocksize_bits = fs_info->sb->s_blocksize_bits; + ASSERT(root == fs_info->csum_root || + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0cb43b682789..a16da274c9aa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,8 +477,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); + em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -1501,7 +1500,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && - ordered->file_offset + ordered->len > start_pos && + ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { unlock_extent_cached(&inode->io_tree, start_pos, last_pos, cached_state); @@ -2390,7 +2389,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, round_down(*start, fs_info->sectorsize), - round_up(*len, fs_info->sectorsize), 0); + round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); @@ -2426,7 +2425,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, * we need to try again. 
*/ if ((!ordered || - (ordered->file_offset + ordered->len <= lockstart || + (ordered->file_offset + ordered->num_bytes <= lockstart || ordered->file_offset > lockend)) && !filemap_range_has_page(inode->i_mapping, lockstart, lockend)) { @@ -2599,8 +2598,8 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - u64 clone_len = drop_end - cur_offset; + if (clone_info && drop_end > clone_info->file_offset) { + u64 clone_len = drop_end - clone_info->file_offset; ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, clone_len); @@ -2957,7 +2956,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2990,8 +2989,8 @@ static int btrfs_zero_range(struct inode *inode, inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, alloc_end - alloc_start, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3034,8 +3033,8 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3248,7 +3247,7 @@ static long btrfs_fallocate(struct file *file, int mode, ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); if (ordered && - ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset + ordered->num_bytes > alloc_start && ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, @@ -3273,7 +3272,7 @@ static long btrfs_fallocate(struct file *file, int mode, INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - alloc_end - cur_offset, 0); + alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); break; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3283da419200..0598fd3c6e3f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -21,9 +21,11 @@ #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" +#include "discard.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) -#define MAX_CACHE_BYTES_PER_GIG SZ_32K +#define MAX_CACHE_BYTES_PER_GIG SZ_64K +#define FORCE_EXTENT_THRESHOLD SZ_1M struct btrfs_trim_range { u64 start; @@ -31,6 +33,8 @@ struct btrfs_trim_range { struct list_head list; }; +static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info); static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -752,6 +756,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } + /* + * Sync discard ensures that the free space cache is always + * trimmed. So when reading this in, the state should reflect + * that. We also do this for async as a stop gap for lack of + * persistence. 
+ */ + if (btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + e->trim_state = BTRFS_TRIM_STATE_TRIMMED; + if (!e->bytes) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -805,12 +819,19 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; + e->bitmap_extents = count_bitmap_extents(ctl, e); + if (!btrfs_free_space_trimmed(e)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + e->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes; + } } io_ctl_drop_pages(&io_ctl); merge_space_tree(ctl); ret = 1; out: + btrfs_discard_update_discardable(ctl->private, ctl); io_ctl_free(&io_ctl); return ret; free_cache: @@ -1624,6 +1645,11 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, { rb_erase(&info->offset_index, &ctl->free_space_offset); ctl->free_extents--; + + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; + } } static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -1644,6 +1670,11 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + ctl->free_space += info->bytes; ctl->free_extents++; return ret; @@ -1664,26 +1695,17 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) ASSERT(ctl->total_bitmaps <= max_bitmaps); /* - * The goal is to keep the total amount of memory used per 1gb of space - * at or below 32k, so we need to adjust how much memory we allow to be - * used by extent based free space tracking + * We are trying to keep the total amount of memory used per 1GiB of + * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation + * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of + * bitmaps, we may end up using more memory than this. */ if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); - /* - * we want to account for 1 more bitmap than what we have so we can make - * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as - * we add more bitmaps. 
- */ - bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; - - if (bitmap_bytes >= max_bytes) { - ctl->extents_thresh = 0; - return; - } + bitmap_bytes = ctl->total_bitmaps * ctl->unit; /* * we want the extent entry threshold to always be at most 1/2 the max @@ -1700,17 +1722,31 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) { - unsigned long start, count; + unsigned long start, count, end; + int extent_delta = -1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - ASSERT(start + count <= BITS_PER_BITMAP); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta++; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta++; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + } } static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -1725,16 +1761,30 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) { - unsigned long start, count; + unsigned long start, count, end; + int extent_delta = 1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - ASSERT(start + count <= BITS_PER_BITMAP); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); bitmap_set(info->bitmap, start, count); info->bytes += bytes; ctl->free_space += bytes; + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta--; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta--; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; + } } /* @@ -1870,11 +1920,35 @@ out: return NULL; } +static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + struct btrfs_block_group *block_group = ctl->private; + u64 bytes = bitmap_info->bytes; + unsigned int rs, re; + int count = 0; + + if (!block_group || !bytes) + return count; + + bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0, + BITS_PER_BITMAP) { + bytes -= (rs - re) * ctl->unit; + count++; + + if (!bytes) + break; + } + + return count; +} + static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + info->bitmap_extents = 0; INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; @@ -1885,6 +1959,18 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { + /* + * Normally when this is called, the bitmap is completely empty. However, + * if we are blowing up the free space cache for one reason or another + * via __btrfs_remove_free_space_cache(), then it may not be freed and + * we may leave stats on the table. 
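/*
 * Illustrative sketch, separate from the patch hunks above, of the neighbour
 * test used when bits are set in a free space bitmap: the change in the
 * number of contiguous free extents depends only on whether the bits just
 * outside the affected range are already set (clearing is the symmetric
 * case). A byte-per-bit array stands in for the kernel bitmap; the name is
 * hypothetical.
 */
#include <stdbool.h>
#include <stddef.h>

/* Extent count delta when bits [start, end) go from clear to set. */
static int sketch_set_extent_delta(const bool *bits, size_t nbits,
				   size_t start, size_t end)
{
	int delta = 1;				/* a new extent appears ... */

	if (start > 0 && bits[start - 1])
		delta--;			/* ... unless it joins the left neighbour */
	if (end < nbits && bits[end])
		delta--;			/* ... and/or the right neighbour */

	return delta;				/* +1, 0, or -1 when bridging two extents */
}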
+ */ + if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] -= + bitmap_info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; + + } unlink_free_space(ctl, bitmap_info); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); @@ -1971,11 +2057,24 @@ again: static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, - u64 bytes) + u64 bytes, enum btrfs_trim_state trim_state) { u64 bytes_to_set = 0; u64 end; + /* + * This is a tradeoff to make bitmap trim state minimal. We mark the + * whole bitmap untrimmed if at any point we add untrimmed regions. + */ + if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { + if (btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); bytes_to_set = min(end - offset, bytes); @@ -2004,6 +2103,10 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, forced = true; #endif + /* This is a way to reclaim large regions from the bitmaps. */ + if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) + return false; + /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap @@ -2016,8 +2119,8 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ - if (info->bytes <= fs_info->sectorsize * 4) { - if (ctl->free_extents * 2 <= ctl->extents_thresh) + if (info->bytes <= fs_info->sectorsize * 8) { + if (ctl->free_extents * 3 <= ctl->extents_thresh) return false; } else { return false; @@ -2050,10 +2153,12 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group = NULL; int added = 0; u64 bytes, offset, bytes_added; + enum btrfs_trim_state trim_state; int ret; bytes = info->bytes; offset = info->offset; + trim_state = info->trim_state; if (!ctl->op->use_bitmap(ctl, info)) return 0; @@ -2088,8 +2193,8 @@ again: } if (entry->offset == offset_to_bitmap(ctl, offset)) { - bytes_added = add_bytes_to_bitmap(ctl, entry, - offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, entry, offset, + bytes, trim_state); bytes -= bytes_added; offset += bytes_added; } @@ -2108,7 +2213,8 @@ no_cluster_bitmap: goto new_bitmap; } - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); bytes -= bytes_added; offset += bytes_added; added = 0; @@ -2142,6 +2248,7 @@ new_bitmap: /* allocate the bitmap */ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); + info->trim_state = BTRFS_TRIM_STATE_TRIMMED; spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -2161,6 +2268,22 @@ out: return ret; } +/* + * Free space merging rules: + * 1) Merge trimmed areas together + * 2) Let untrimmed areas coalesce with trimmed areas + * 3) Always pull neighboring regions from bitmaps + * + * The above rules are for when we merge free space based on btrfs_trim_state. 
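/*
 * Illustrative sketch, separate from the patch hunks above, of the merge
 * predicate implied by rules 1 and 2: an untrimmed entry may absorb any
 * neighbour, while a trimmed entry only merges with other trimmed space so it
 * is never silently demoted back to untrimmed. The name is hypothetical.
 */
#include <stdbool.h>

static bool sketch_can_merge(bool new_entry_trimmed, bool neighbour_trimmed)
{
	/* mirrors the "(!is_trimmed || btrfs_free_space_trimmed(...))" checks below */
	return !new_entry_trimmed || neighbour_trimmed;
}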
+ * Rules 2 and 3 are subtle because they are suboptimal, but are done for the + * same reason: to promote larger extent regions which makes life easier for + * find_free_extent(). Rule 2 enables coalescing based on the common path + * being returning free space from btrfs_finish_extent_commit(). So when free + * space is trimmed, it will prevent aggregating trimmed new region and + * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents + * and provide find_free_extent() with the largest extents possible hoping for + * the reuse path. + */ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { @@ -2169,6 +2292,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, bool merged = false; u64 offset = info->offset; u64 bytes = info->bytes; + const bool is_trimmed = btrfs_free_space_trimmed(info); /* * first we want to see if there is free space adjacent to the range we @@ -2182,7 +2306,9 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, else left_info = tree_search_offset(ctl, offset - 1, 0, 0); - if (right_info && !right_info->bitmap) { + /* See try_merge_free_space() comment. */ + if (right_info && !right_info->bitmap && + (!is_trimmed || btrfs_free_space_trimmed(right_info))) { if (update_stat) unlink_free_space(ctl, right_info); else @@ -2192,8 +2318,10 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, merged = true; } + /* See try_merge_free_space() comment. */ if (left_info && !left_info->bitmap && - left_info->offset + left_info->bytes == offset) { + left_info->offset + left_info->bytes == offset && + (!is_trimmed || btrfs_free_space_trimmed(left_info))) { if (update_stat) unlink_free_space(ctl, left_info); else @@ -2229,6 +2357,10 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, bytes = (j - i) * ctl->unit; info->bytes += bytes; + /* See try_merge_free_space() comment. */ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (update_stat) bitmap_clear_bits(ctl, bitmap, end, bytes); else @@ -2282,6 +2414,10 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, info->offset -= bytes; info->bytes += bytes; + /* See try_merge_free_space() comment. 
*/ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (update_stat) bitmap_clear_bits(ctl, bitmap, info->offset, bytes); else @@ -2331,10 +2467,13 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, - u64 offset, u64 bytes) + u64 offset, u64 bytes, + enum btrfs_trim_state trim_state) { + struct btrfs_block_group *block_group = ctl->private; struct btrfs_free_space *info; int ret = 0; + u64 filter_bytes = bytes; info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) @@ -2342,6 +2481,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->offset = offset; info->bytes = bytes; + info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); spin_lock(&ctl->tree_lock); @@ -2370,10 +2510,13 @@ link: */ steal_from_bitmap(ctl, info, true); + filter_bytes = max(filter_bytes, info->bytes); + ret = link_free_space(ctl, info); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); if (ret) { @@ -2381,15 +2524,44 @@ out: ASSERT(ret != -EEXIST); } + if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { + btrfs_discard_check_filter(block_group, filter_bytes); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + } + return ret; } int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size, trim_state); +} + +/* + * This is a subtle distinction because when adding free space back in general, + * we want it to be added as untrimmed for async. But in the case where we add + * it on loading of a block group, we want to consider it trimmed. 
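/*
 * Illustrative sketch, separate from the patch hunks above, of how the initial
 * trim state is chosen by the two helpers around this comment: an ordinary
 * free only counts as trimmed under synchronous discard, while space found
 * when a block group is first loaded also counts as trimmed under
 * asynchronous discard. The enum and helper are hypothetical.
 */
#include <stdbool.h>

enum sketch_trim_state { SKETCH_UNTRIMMED, SKETCH_TRIMMED };

static enum sketch_trim_state sketch_initial_trim_state(bool discard_sync,
							 bool discard_async,
							 bool loading_block_group)
{
	if (discard_sync)
		return SKETCH_TRIMMED;
	if (loading_block_group && discard_async)
		return SKETCH_TRIMMED;
	return SKETCH_UNTRIMMED;
}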
+ */ +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || + btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + return __btrfs_add_free_space(block_group->fs_info, block_group->free_space_ctl, - bytenr, size); + bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, @@ -2464,8 +2636,10 @@ again: } spin_unlock(&ctl->tree_lock); - ret = btrfs_add_free_space(block_group, offset + bytes, - old_end - (offset + bytes)); + ret = __btrfs_add_free_space(block_group->fs_info, ctl, + offset + bytes, + old_end - (offset + bytes), + info->trim_state); WARN_ON(ret); goto out; } @@ -2477,6 +2651,7 @@ again: goto again; } out_lock: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); out: return ret; @@ -2562,8 +2737,22 @@ __btrfs_return_cluster_to_free_space( bitmap = (entry->bitmap != NULL); if (!bitmap) { + /* Merging treats extents as if they were new */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= + entry->bytes; + } + try_merge_free_space(ctl, entry, false); steal_from_bitmap(ctl, entry, false); + + /* As we insert directly, update these statistics */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += + entry->bytes; + } } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); @@ -2599,6 +2788,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); + if (ctl->private) + btrfs_discard_update_discardable(ctl->private, ctl); spin_unlock(&ctl->tree_lock); } @@ -2620,20 +2811,55 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); } +/** + * btrfs_is_free_space_trimmed - see if everything is trimmed + * @block_group: block_group of interest + * + * Walk @block_group's free space rb_tree to determine if everything is trimmed. 
+ */ +bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *node; + bool ret = true; + + spin_lock(&ctl->tree_lock); + node = rb_first(&ctl->free_space_offset); + + while (node) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + + if (!btrfs_free_space_trimmed(info)) { + ret = false; + break; + } + + node = rb_next(node); + } + + spin_unlock(&ctl->tree_lock); + return ret; +} + u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; u64 align_gap = 0; u64 align_gap_len = 0; + enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, @@ -2644,12 +2870,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, ret = offset; if (entry->bitmap) { bitmap_clear_bits(ctl, entry, offset, bytes); + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + if (!entry->bytes) free_bitmap(ctl, entry); } else { unlink_free_space(ctl, entry); align_gap_len = offset - entry->offset; align_gap = entry->offset; + align_gap_trim_state = entry->trim_state; + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); entry->offset = offset + bytes; WARN_ON(entry->bytes < bytes + align_gap_len); @@ -2661,11 +2895,13 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, link_free_space(ctl, entry); } out: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); if (align_gap_len) __btrfs_add_free_space(block_group->fs_info, ctl, - align_gap, align_gap_len); + align_gap, align_gap_len, + align_gap_trim_state); return ret; } @@ -2707,6 +2943,8 @@ int btrfs_return_cluster_to_free_space( ret = __btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&ctl->tree_lock); + btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); + /* finally drop our ref */ btrfs_put_block_group(block_group); return ret; @@ -2750,6 +2988,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; struct rb_node *node; u64 ret = 0; @@ -2814,7 +3054,12 @@ out: spin_lock(&ctl->tree_lock); + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + ctl->free_space -= bytes; + if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; if (entry->bytes == 0) { ctl->free_extents--; if (entry->bitmap) { @@ -2822,6 +3067,8 @@ out: entry->bitmap); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); + } else if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; } kmem_cache_free(btrfs_free_space_cachep, entry); } @@ -3148,6 +3395,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) static int do_trimming(struct btrfs_block_group *block_group, u64 
*total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, + enum btrfs_trim_state reserved_trim_state, struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; @@ -3155,6 +3403,9 @@ static int do_trimming(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; + const u64 end = start + bytes; + const u64 reserved_end = reserved_start + reserved_bytes; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; u64 trimmed = 0; spin_lock(&space_info->lock); @@ -3168,11 +3419,20 @@ static int do_trimming(struct btrfs_block_group *block_group, spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); - if (!ret) + if (!ret) { *total_trimmed += trimmed; + trim_state = BTRFS_TRIM_STATE_TRIMMED; + } mutex_lock(&ctl->cache_writeout_mutex); - btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + if (reserved_start < start) + __btrfs_add_free_space(fs_info, ctl, reserved_start, + start - reserved_start, + reserved_trim_state); + if (start + bytes < reserved_start + reserved_bytes) + __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + reserved_trim_state); + __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3190,16 +3450,24 @@ static int do_trimming(struct btrfs_block_group *block_group, return ret; } +/* + * If @async is set, then we will trim 1 region and return. + */ static int trim_no_bitmap(struct btrfs_block_group *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + bool async) { + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; struct rb_node *node; int ret = 0; u64 extent_start; u64 extent_bytes; + enum btrfs_trim_state extent_trim_state; u64 bytes; + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (start < end) { struct btrfs_trim_range trim_entry; @@ -3207,49 +3475,66 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); - if (ctl->free_space < minlen) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (ctl->free_space < minlen) + goto out_unlock; entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (!entry) + goto out_unlock; - /* skip bitmaps */ - while (entry->bitmap) { + /* Skip bitmaps and if async, already trimmed entries */ + while (entry->bitmap || + (async && btrfs_free_space_trimmed(entry))) { node = rb_next(&entry->offset_index); - if (!node) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - goto out; - } + if (!node) + goto out_unlock; entry = rb_entry(node, struct btrfs_free_space, offset_index); } - if (entry->offset >= end) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (entry->offset >= end) + goto out_unlock; extent_start = entry->offset; extent_bytes = entry->bytes; - start = max(start, extent_start); - bytes = min(extent_start + extent_bytes, end) - start; - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - 
goto next; - } + extent_trim_state = entry->trim_state; + if (async) { + start = entry->offset; + bytes = entry->bytes; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + unlink_free_space(ctl, entry); + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim + * X when we come back around. So trim it now. + */ + if (max_discard_size && + bytes >= (max_discard_size + + BTRFS_ASYNC_DISCARD_MIN_FILTER)) { + bytes = max_discard_size; + extent_bytes = max_discard_size; + entry->offset += max_discard_size; + entry->bytes -= max_discard_size; + link_free_space(ctl, entry); + } else { + kmem_cache_free(btrfs_free_space_cachep, entry); + } + } else { + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + } spin_unlock(&ctl->tree_lock); trim_entry.start = extent_start; @@ -3258,11 +3543,17 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes, &trim_entry); - if (ret) + extent_start, extent_bytes, extent_trim_state, + &trim_entry); + if (ret) { + block_group->discard_cursor = start + bytes; break; + } next: start += bytes; + block_group->discard_cursor = start; + if (async && *total_trimmed) + break; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -3271,19 +3562,76 @@ next: cond_resched(); } -out: + + return ret; + +out_unlock: + block_group->discard_cursor = btrfs_block_group_end(block_group); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + return ret; } +/* + * If we break out of trimming a bitmap prematurely, we should reset the + * trimming bit. In a rather contrieved case, it's possible to race here so + * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. + * + * start = start of bitmap + * end = near end of bitmap + * + * Thread 1: Thread 2: + * trim_bitmaps(start) + * trim_bitmaps(end) + * end_trimming_bitmap() + * reset_trimming_bitmap() + */ +static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) +{ + struct btrfs_free_space *entry; + + spin_lock(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 1, 0); + if (entry) { + if (btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; + } + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } + + spin_unlock(&ctl->tree_lock); +} + +static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *entry) +{ + if (btrfs_free_space_trimming_bitmap(entry)) { + entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; + ctl->discardable_extents[BTRFS_STAT_CURR] -= + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; + } +} + +/* + * If @async is set, then we will trim 1 region and return. 
+ */ static int trim_bitmaps(struct btrfs_block_group *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) { + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; int ret = 0; int ret2; u64 bytes; u64 offset = offset_to_bitmap(ctl, start); + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (offset < end) { bool next_bitmap = false; @@ -3293,35 +3641,84 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { + block_group->discard_cursor = + btrfs_block_group_end(block_group); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); - if (!entry) { + /* + * Bitmaps are marked trimmed lossily now to prevent constant + * discarding of the same bitmap (the reason why we are bound + * by the filters). So, retrim the block group bitmaps when we + * are preparing to punt to the unused_bgs list. This uses + * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED + * which is the only discard index which sets minlen to 0. + */ + if (!entry || (async && minlen && start == offset && + btrfs_free_space_trimmed(entry))) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } + /* + * Async discard bitmap trimming begins at by setting the start + * to be key.objectid and the offset_to_bitmap() aligns to the + * start of the bitmap. This lets us know we are fully + * scanning the bitmap rather than only some portion of it. + */ + if (start == offset) + entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; + bytes = minlen; ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { + /* + * We lossily consider a bitmap trimmed if we only skip + * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. + */ + if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) + end_trimming_bitmap(ctl, entry); + else + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } + /* + * We already trimmed a region, but are using the locking above + * to reset the trim_state. + */ + if (async && *total_trimmed) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto out; + } + bytes = min(bytes, end - start); - if (bytes < minlen) { + if (bytes < minlen || (async && maxlen && bytes > maxlen)) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < @minlen, we won't trim X when we come back around. + * So trim it now. We differ here from trimming extents as we + * don't keep individual state per bit. 
+ */ + if (async && + max_discard_size && + bytes > (max_discard_size + minlen)) + bytes = max_discard_size; + bitmap_clear_bits(ctl, entry, start, bytes); if (entry->bytes == 0) free_bitmap(ctl, entry); @@ -3333,19 +3730,25 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes, &trim_entry); - if (ret) + start, bytes, 0, &trim_entry); + if (ret) { + reset_trimming_bitmap(ctl, offset); + block_group->discard_cursor = + btrfs_block_group_end(block_group); break; + } next: if (next_bitmap) { offset += BITS_PER_BITMAP * ctl->unit; + start = offset; } else { start += bytes; - if (start >= offset + BITS_PER_BITMAP * ctl->unit) - offset += BITS_PER_BITMAP * ctl->unit; } + block_group->discard_cursor = start; if (fatal_signal_pending(current)) { + if (start != offset) + reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; break; } @@ -3353,6 +3756,10 @@ next: cond_resched(); } + if (offset >= end) + block_group->discard_cursor = end; + +out: return ret; } @@ -3399,7 +3806,9 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group) int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; + u64 rem = 0; *trimmed = 0; @@ -3411,16 +3820,66 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, btrfs_get_block_group_trimming(block_group); spin_unlock(&block_group->lock); - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); if (ret) goto out; - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); + div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); + /* If we ended in the middle of a bitmap, reset the trimming flag */ + if (rem) + reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); out: btrfs_put_block_group_trimming(block_group); return ret; } +int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + bool async) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_get_block_group_trimming(block_group); + spin_unlock(&block_group->lock); + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); + btrfs_put_block_group_trimming(block_group); + + return ret; +} + +int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_get_block_group_trimming(block_group); + spin_unlock(&block_group->lock); + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, + async); + + btrfs_put_block_group_trimming(block_group); + + return ret; +} + /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. 
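
The async trimming paths above cap how much is discarded in one pass with discard_ctl->max_discard_size, but splitting an extent exactly at that cap could leave a tail too small for any later pass to pick up (it would fall under the minimum filter), so such extents are trimmed whole instead. Below is a minimal userspace sketch of that decision, assuming illustrative values for max_discard_size and the BTRFS_ASYNC_DISCARD_MIN_FILTER cutoff rather than the kernel's tunables; the bitmap path applies the same idea but uses @minlen as the filter since it keeps no per-bit trim state.

/*
 * Simplified userspace model of the clamping done in the async paths of
 * trim_no_bitmap()/trim_bitmaps().  The constants are illustrative
 * placeholders, not the values used by fs/btrfs/discard.c.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_DISCARD_SIZE  (64ULL * 1024 * 1024)  /* stands in for discard_ctl->max_discard_size */
#define MIN_FILTER        (32ULL * 1024)         /* smallest extent a later pass would pick up */

/*
 * Decide how much of a free extent of 'bytes' to discard in this pass.
 * If splitting at MAX_DISCARD_SIZE would leave a remainder smaller than
 * MIN_FILTER, no later pass would ever trim it, so trim it all now.
 */
static uint64_t clamp_discard(uint64_t bytes)
{
	if (!MAX_DISCARD_SIZE || bytes < MAX_DISCARD_SIZE + MIN_FILTER)
		return bytes;            /* small enough: trim the whole extent */
	return MAX_DISCARD_SIZE;         /* trim one chunk, leave the rest for later */
}

int main(void)
{
	uint64_t cases[] = {
		16ULL * 1024,                          /* below the filter */
		MAX_DISCARD_SIZE + 4096,               /* remainder would be < MIN_FILTER */
		MAX_DISCARD_SIZE + MIN_FILTER + 4096,  /* big enough to split */
	};

	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("extent %llu -> discard %llu now\n",
		       (unsigned long long)cases[i],
		       (unsigned long long)clamp_discard(cases[i]));
	return 0;
}

In the extent case the remainder stays linked in the free-space tree with its offset and length adjusted, so it is revisited on a later pass; a bitmap only tracks trim state for the whole entry, which is why the comment in trim_bitmaps() notes that it differs from the extent path.
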
@@ -3600,6 +4059,7 @@ int test_add_free_space_entry(struct btrfs_block_group *cache, struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info = NULL, *bitmap_info; void *map = NULL; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; u64 bytes_added; int ret; @@ -3641,7 +4101,8 @@ again: info = NULL; } - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); bytes -= bytes_added; offset += bytes_added; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ba9a23241101..2e0a8077aa74 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,6 +6,20 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +/* + * This is the trim state of an extent or bitmap. + * + * BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a + * bitmap as we may need several trims to fully trim a single bitmap entry. + * This is reset should any free space other than trimmed space be added to the + * bitmap. + */ +enum btrfs_trim_state { + BTRFS_TRIM_STATE_UNTRIMMED, + BTRFS_TRIM_STATE_TRIMMED, + BTRFS_TRIM_STATE_TRIMMING, +}; + struct btrfs_free_space { struct rb_node offset_index; u64 offset; @@ -13,8 +27,21 @@ struct btrfs_free_space { u64 max_extent_size; unsigned long *bitmap; struct list_head list; + enum btrfs_trim_state trim_state; + s32 bitmap_extents; }; +static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info) +{ + return (info->trim_state == BTRFS_TRIM_STATE_TRIMMED); +} + +static inline bool btrfs_free_space_trimming_bitmap( + struct btrfs_free_space *info) +{ + return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); +} + struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; @@ -24,6 +51,8 @@ struct btrfs_free_space_ctl { int total_bitmaps; int unit; u64 start; + s32 discardable_extents[BTRFS_STAT_NR_ENTRIES]; + s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES]; const struct btrfs_free_space_op *op; void *private; struct mutex cache_writeout_mutex; @@ -83,13 +112,17 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, - u64 bytenr, u64 size); + u64 bytenr, u64 size, + enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); +bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); @@ -108,6 +141,12 @@ int btrfs_return_cluster_to_free_space( struct btrfs_free_cluster *cluster); int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); +int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + bool async); +int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 
minlen, + u64 maxlen, bool async); /* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 37345fb6191d..d5c9c69d8263 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -107,7 +107,7 @@ again: if (last != (u64)-1 && last + 1 != key.objectid) { __btrfs_add_free_space(fs_info, ctl, last + 1, - key.objectid - last - 1); + key.objectid - last - 1, 0); wake_up(&root->ino_cache_wait); } @@ -118,7 +118,7 @@ next: if (last < root->highest_objectid - 1) { __btrfs_add_free_space(fs_info, ctl, last + 1, - root->highest_objectid - last - 1); + root->highest_objectid - last - 1, 0); } spin_lock(&root->ino_cache_lock); @@ -175,7 +175,8 @@ static void start_caching(struct btrfs_root *root) ret = btrfs_find_free_objectid(root, &objectid); if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { __btrfs_add_free_space(fs_info, ctl, objectid, - BTRFS_LAST_FREE_OBJECTID - objectid + 1); + BTRFS_LAST_FREE_OBJECTID - objectid + 1, + 0); wake_up(&root->ino_cache_wait); } @@ -221,7 +222,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) return; again: if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(fs_info, pinned, objectid, 1); + __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); } else { down_write(&fs_info->commit_root_sem); spin_lock(&root->ino_cache_lock); @@ -234,7 +235,7 @@ again: start_caching(root); - __btrfs_add_free_space(fs_info, pinned, objectid, 1); + __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); up_write(&fs_info->commit_root_sem); } @@ -281,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) spin_unlock(rbroot_lock); if (count) __btrfs_add_free_space(root->fs_info, ctl, - info->offset, count); + info->offset, count, 0); kmem_cache_free(btrfs_free_space_cachep, info); } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 56032c518b26..5b3ec93ff911 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -44,7 +44,6 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" -#include "backref.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -64,7 +63,6 @@ struct btrfs_dio_data { static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; -static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; @@ -1479,10 +1477,10 @@ next_slot: disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); /* - * If extent we got ends before our range starts, skip - * to next extent + * If the extent we got ends before our current offset, + * skip to the next extent. 
*/ - if (extent_end <= start) { + if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } @@ -2128,7 +2126,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, bio_flags); goto out; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); if (ret) goto out; } @@ -2191,6 +2189,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; + struct inode *inode; struct btrfs_work work; }; @@ -2204,27 +2203,71 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; - int ret; + int ret = 0; + bool free_delalloc_space = true; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; + inode = fixup->inode; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_SIZE - 1; + + /* + * This is similar to page_mkwrite, we need to reserve the space before + * we take the page lock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + PAGE_SIZE); again: lock_page(page); + + /* + * Before we queued this fixup, we took a reference on the page. + * page->mapping may go NULL, but it shouldn't be moved to a different + * address space. + */ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { - ClearPageChecked(page); + /* + * Unfortunately this is a little tricky, either + * + * 1) We got here and our page had already been dealt with and + * we reserved our space, thus ret == 0, so we need to just + * drop our space reservation and bail. This can happen the + * first time we come into the fixup worker, or could happen + * while waiting for the ordered extent. + * 2) Our page was already dealt with, but we happened to get an + * ENOSPC above from the btrfs_delalloc_reserve_space. In + * this case we obviously don't have anything to release, but + * because the page was already dealt with we don't want to + * mark the page with an error, so make sure we're resetting + * ret to 0. This is why we have this check _before_ the ret + * check, because we do not want to have a surprise ENOSPC + * when the page was already properly dealt with. + */ + if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE, + true); + } + ret = 0; goto out_page; } - inode = page->mapping->host; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; + /* + * We can't mess with the page state unless it is locked, so now that + * it is locked bail if we failed to make our space reservation. + */ + if (ret) + goto out_page; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? 
We're done */ if (PagePrivate2(page)) - goto out; + goto out_reserved; ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); @@ -2237,39 +2280,49 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); - goto out; - } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); + if (ret) goto out_reserved; - } - ClearPageChecked(page); - set_page_dirty(page); + /* + * Everything went as planned, we're now the owner of a dirty page with + * delayed allocation bits set and space reserved for our COW + * destination. + * + * The page was dirty when we started, nothing should have cleaned it. + */ + BUG_ON(!PageDirty(page)); + free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - if (ret) + if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); -out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); out_page: + if (ret) { + /* + * We hit ENOSPC or other errors. Update the mapping and page + * to reflect the errors and clean the page. + */ + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + clear_page_dirty_for_io(page); + SetPageError(page); + } + ClearPageChecked(page); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); + /* + * As a precaution, do a delayed iput in case it would be the last iput + * that could need flushing space. Recursing back to fixup worker would + * deadlock. + */ + btrfs_add_delayed_iput(inode); } /* @@ -2293,6 +2346,13 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (TestClearPagePrivate2(page)) return 0; + /* + * PageChecked is set below when we create a fixup worker for this page, + * don't try to create another one if we're already PageChecked() + * + * The extent_io writepage code will redirty the page if we send back + * EAGAIN. + */ if (PageChecked(page)) return -EAGAIN; @@ -2300,12 +2360,21 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (!fixup) return -EAGAIN; + /* + * We are already holding a reference to this inode from + * write_cache_pages. We need to hold it because the space reservation + * takes place outside of the page lock, and we can't trust + * page->mapping outside of the page lock. 
+ */ + ihold(inode); SetPageChecked(page); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; + fixup->inode = inode; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); - return -EBUSY; + + return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -2394,649 +2463,6 @@ out: return ret; } -/* snapshot-aware defrag */ -struct sa_defrag_extent_backref { - struct rb_node node; - struct old_sa_defrag_extent *old; - u64 root_id; - u64 inum; - u64 file_pos; - u64 extent_offset; - u64 num_bytes; - u64 generation; -}; - -struct old_sa_defrag_extent { - struct list_head list; - struct new_sa_defrag_extent *new; - - u64 extent_offset; - u64 bytenr; - u64 offset; - u64 len; - int count; -}; - -struct new_sa_defrag_extent { - struct rb_root root; - struct list_head head; - struct btrfs_path *path; - struct inode *inode; - u64 file_pos; - u64 len; - u64 bytenr; - u64 disk_len; - u8 compress_type; -}; - -static int backref_comp(struct sa_defrag_extent_backref *b1, - struct sa_defrag_extent_backref *b2) -{ - if (b1->root_id < b2->root_id) - return -1; - else if (b1->root_id > b2->root_id) - return 1; - - if (b1->inum < b2->inum) - return -1; - else if (b1->inum > b2->inum) - return 1; - - if (b1->file_pos < b2->file_pos) - return -1; - else if (b1->file_pos > b2->file_pos) - return 1; - - /* - * [------------------------------] ===> (a range of space) - * |<--->| |<---->| =============> (fs/file tree A) - * |<---------------------------->| ===> (fs/file tree B) - * - * A range of space can refer to two file extents in one tree while - * refer to only one file extent in another tree. - * - * So we may process a disk offset more than one time(two extents in A) - * and locate at the same extent(one extent in B), then insert two same - * backrefs(both refer to the extent in B). - */ - return 0; -} - -static void backref_insert(struct rb_root *root, - struct sa_defrag_extent_backref *backref) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct sa_defrag_extent_backref *entry; - int ret; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct sa_defrag_extent_backref, node); - - ret = backref_comp(backref, entry); - if (ret < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&backref->node, parent, p); - rb_insert_color(&backref->node, root); -} - -/* - * Note the backref might has changed, and in this case we just return 0. 
- */ -static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, - void *ctx) -{ - struct btrfs_file_extent_item *extent; - struct old_sa_defrag_extent *old = ctx; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_path *path = new->path; - struct btrfs_key key; - struct btrfs_root *root; - struct sa_defrag_extent_backref *backref; - struct extent_buffer *leaf; - struct inode *inode = new->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int slot; - int ret; - u64 extent_offset; - u64 num_bytes; - - if (BTRFS_I(inode)->root->root_key.objectid == root_id && - inum == btrfs_ino(BTRFS_I(inode))) - return 0; - - key.objectid = root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - if (PTR_ERR(root) == -ENOENT) - return 0; - WARN_ON(1); - btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu", - inum, offset, root_id); - return PTR_ERR(root); - } - - key.objectid = inum; - key.type = BTRFS_EXTENT_DATA_KEY; - if (offset > (u64)-1 << 32) - key.offset = 0; - else - key.offset = offset; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (WARN_ON(ret < 0)) - return ret; - ret = 0; - - while (1) { - cond_resched(); - - leaf = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } - continue; - } - - path->slots[0]++; - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.objectid > inum) - goto out; - - if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) - continue; - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) - continue; - - /* - * 'offset' refers to the exact key.offset, - * NOT the 'offset' field in btrfs_extent_data_ref, ie. - * (key.offset - extent_offset). 
- */ - if (key.offset != offset) - continue; - - extent_offset = btrfs_file_extent_offset(leaf, extent); - num_bytes = btrfs_file_extent_num_bytes(leaf, extent); - - if (extent_offset >= old->extent_offset + old->offset + - old->len || extent_offset + num_bytes <= - old->extent_offset + old->offset) - continue; - break; - } - - backref = kmalloc(sizeof(*backref), GFP_NOFS); - if (!backref) { - ret = -ENOENT; - goto out; - } - - backref->root_id = root_id; - backref->inum = inum; - backref->file_pos = offset; - backref->num_bytes = num_bytes; - backref->extent_offset = extent_offset; - backref->generation = btrfs_file_extent_generation(leaf, extent); - backref->old = old; - backref_insert(&new->root, backref); - old->count++; -out: - btrfs_release_path(path); - WARN_ON(ret); - return ret; -} - -static noinline bool record_extent_backrefs(struct btrfs_path *path, - struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct old_sa_defrag_extent *old, *tmp; - int ret; - - new->path = path; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr + - old->extent_offset, fs_info, - path, record_one_backref, - old, false); - if (ret < 0 && ret != -ENOENT) - return false; - - /* no backref to be processed for this extent */ - if (!old->count) { - list_del(&old->list); - kfree(old); - } - } - - if (list_empty(&new->head)) - return false; - - return true; -} - -static int relink_is_mergable(struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, - struct new_sa_defrag_extent *new) -{ - if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) - return 0; - - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; - - if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) - return 0; - - if (btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - return 0; - - return 1; -} - -/* - * Note the backref might has changed, and in this case we just return 0. 
- */ -static noinline int relink_extent_backref(struct btrfs_path *path, - struct sa_defrag_extent_backref *prev, - struct sa_defrag_extent_backref *backref) -{ - struct btrfs_file_extent_item *extent; - struct btrfs_file_extent_item *item; - struct btrfs_ordered_extent *ordered; - struct btrfs_trans_handle *trans; - struct btrfs_ref ref = { 0 }; - struct btrfs_root *root; - struct btrfs_key key; - struct extent_buffer *leaf; - struct old_sa_defrag_extent *old = backref->old; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct inode *inode; - struct extent_state *cached = NULL; - int ret = 0; - u64 start; - u64 len; - u64 lock_start; - u64 lock_end; - bool merge = false; - int index; - - if (prev && prev->root_id == backref->root_id && - prev->inum == backref->inum && - prev->file_pos + prev->num_bytes == backref->file_pos) - merge = true; - - /* step 1: get root */ - key.objectid = backref->root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - if (PTR_ERR(root) == -ENOENT) - return 0; - return PTR_ERR(root); - } - - if (btrfs_root_readonly(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - /* step 2: get inode */ - key.objectid = backref->inum; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, root); - if (IS_ERR(inode)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); - - /* step 3: relink backref */ - lock_start = backref->file_pos; - lock_end = backref->file_pos + backref->num_bytes - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - - ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } - - key.objectid = backref->inum; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = backref->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - goto out_free_path; - } else if (ret > 0) { - ret = 0; - goto out_free_path; - } - - extent = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(path->nodes[0], extent) != - backref->generation) - goto out_free_path; - - btrfs_release_path(path); - - start = backref->file_pos; - if (backref->extent_offset < old->extent_offset + old->offset) - start += old->extent_offset + old->offset - - backref->extent_offset; - - len = min(backref->extent_offset + backref->num_bytes, - old->extent_offset + old->offset + old->len); - len -= max(backref->extent_offset, old->extent_offset + old->offset); - - ret = btrfs_drop_extents(trans, root, inode, start, - start + len, 1); - if (ret) - goto out_free_path; -again: - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - - path->leave_spinning = 1; - if (merge) { - struct btrfs_file_extent_item *fi; - u64 extent_len; - struct btrfs_key found_key; - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto out_free_path; - - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - fi = 
btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_len = btrfs_file_extent_num_bytes(leaf, fi); - - if (extent_len + found_key.offset == start && - relink_is_mergable(leaf, fi, new)) { - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_len + len); - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - - ret = 1; - goto out_free_path; - } else { - merge = false; - btrfs_release_path(path); - goto again; - } - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*extent)); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); - btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); - btrfs_set_file_extent_num_bytes(leaf, item, len); - btrfs_set_file_extent_ram_bytes(leaf, item, new->len); - btrfs_set_file_extent_generation(leaf, item, trans->transid); - btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, new->compress_type); - btrfs_set_file_extent_encryption(leaf, item, 0); - btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - btrfs_release_path(path); - - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr, - new->disk_len, 0); - btrfs_init_data_ref(&ref, backref->root_id, backref->inum, - new->file_pos); /* start - extent_offset */ - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - ret = 1; -out_free_path: - btrfs_release_path(path); - path->leave_spinning = 0; - btrfs_end_transaction(trans); -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - iput(inode); - return ret; -} - -static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) -{ - struct old_sa_defrag_extent *old, *tmp; - - if (!new) - return; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - kfree(old); - } - kfree(new); -} - -static void relink_file_extents(struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct btrfs_path *path; - struct sa_defrag_extent_backref *backref; - struct sa_defrag_extent_backref *prev = NULL; - struct rb_node *node; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return; - - if (!record_extent_backrefs(path, new)) { - btrfs_free_path(path); - goto out; - } - btrfs_release_path(path); - - while (1) { - node = rb_first(&new->root); - if (!node) - break; - rb_erase(node, &new->root); - - backref = rb_entry(node, struct sa_defrag_extent_backref, node); - - ret = relink_extent_backref(path, prev, backref); - WARN_ON(ret < 0); - - kfree(prev); - - if (ret == 1) - prev = backref; - else - prev = NULL; - cond_resched(); - } - kfree(prev); - - btrfs_free_path(path); -out: - free_sa_defrag_extent(new); - - atomic_dec(&fs_info->defrag_running); - wake_up(&fs_info->transaction_wait); -} - -static struct new_sa_defrag_extent * -record_old_file_extents(struct inode *inode, - struct btrfs_ordered_extent *ordered) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; - struct btrfs_key key; - struct old_sa_defrag_extent *old; - struct new_sa_defrag_extent *new; - int ret; - - 
new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return NULL; - - new->inode = inode; - new->file_pos = ordered->file_offset; - new->len = ordered->len; - new->bytenr = ordered->start; - new->disk_len = ordered->disk_len; - new->compress_type = ordered->compress_type; - new->root = RB_ROOT; - INIT_LIST_HEAD(&new->head); - - path = btrfs_alloc_path(); - if (!path) - goto out_kfree; - - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = new->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_free_path; - if (ret > 0 && path->slots[0] > 0) - path->slots[0]--; - - /* find out all the old extents for the file range */ - while (1) { - struct btrfs_file_extent_item *extent; - struct extent_buffer *l; - int slot; - u64 num_bytes; - u64 offset; - u64 end; - u64 disk_bytenr; - u64 extent_offset; - - l = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out_free_path; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid != btrfs_ino(BTRFS_I(inode))) - break; - if (key.type != BTRFS_EXTENT_DATA_KEY) - break; - if (key.offset >= new->file_pos + new->len) - break; - - extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); - - num_bytes = btrfs_file_extent_num_bytes(l, extent); - if (key.offset + num_bytes < new->file_pos) - goto next; - - disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); - if (!disk_bytenr) - goto next; - - extent_offset = btrfs_file_extent_offset(l, extent); - - old = kmalloc(sizeof(*old), GFP_NOFS); - if (!old) - goto out_free_path; - - offset = max(new->file_pos, key.offset); - end = min(new->file_pos + new->len, key.offset + num_bytes); - - old->bytenr = disk_bytenr; - old->extent_offset = extent_offset; - old->offset = offset - key.offset; - old->len = end - offset; - old->new = new; - old->count = 0; - list_add_tail(&old->list, &new->head); -next: - path->slots[0]++; - cond_resched(); - } - - btrfs_free_path(path); - atomic_inc(&fs_info->defrag_running); - - return new; - -out_free_path: - btrfs_free_path(path); -out_kfree: - free_sa_defrag_extent(new); - return NULL; -} - static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -3064,15 +2490,19 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct new_sa_defrag_extent *new = NULL; + u64 start, end; int compress_type = 0; int ret = 0; - u64 logical_len = ordered_extent->len; + u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; bool clear_reserved_extent = true; + unsigned int clear_bits; + + start = ordered_extent->file_offset; + end = start + ordered_extent->num_bytes - 1; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && @@ -3086,10 +2516,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - btrfs_free_io_failure_record(BTRFS_I(inode), - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1); + btrfs_free_io_failure_record(BTRFS_I(inode), start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, 
&ordered_extent->flags)) { truncated = true; @@ -3107,8 +2534,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); + btrfs_qgroup_free_data(inode, NULL, start, + ordered_extent->num_bytes); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -3127,23 +2554,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } range_locked = true; - lock_extent_bits(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - &cached_state); - - ret = test_range_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, cached_state); - if (ret) { - u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (0 && last_snapshot >= BTRFS_I(inode)->generation) - /* the inode is shared */ - new = record_old_file_extents(inode, ordered_extent); - - clear_extent_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, 0, &cached_state); - } + lock_extent_bits(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -3161,31 +2572,30 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); + btrfs_qgroup_free_data(inode, NULL, start, + ordered_extent->num_bytes); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_reserved_file_extent(trans, inode, - ordered_extent->file_offset, - ordered_extent->start, - ordered_extent->disk_len, + ret = insert_reserved_file_extent(trans, inode, start, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, - ordered_extent->start, - ordered_extent->disk_len); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, ordered_extent->len, - trans->transid); + ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3205,37 +2615,27 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } ret = 0; out: - if (range_locked || clear_new_delalloc_bytes) { - unsigned int clear_bits = 0; - - if (range_locked) - clear_bits |= EXTENT_LOCKED; - if (clear_new_delalloc_bytes) - clear_bits |= EXTENT_DELALLOC_NEW; - clear_extent_bit(&BTRFS_I(inode)->io_tree, - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, - clear_bits, - (clear_bits & EXTENT_LOCKED) ? 
1 : 0, - 0, &cached_state); - } + clear_bits = EXTENT_DEFRAG; + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 start, end; + u64 unwritten_start = start; if (truncated) - start = ordered_extent->file_offset + logical_len; - else - start = ordered_extent->file_offset; - end = ordered_extent->file_offset + ordered_extent->len - 1; - clear_extent_uptodate(io_tree, start, end, NULL); + unwritten_start += logical_len; + clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -3250,29 +2650,28 @@ out: if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + /* + * Discard the range before returning it back to the + * free space pool + */ + if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) + btrfs_discard_extent(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, + NULL); btrfs_free_reserved_extent(fs_info, - ordered_extent->start, - ordered_extent->disk_len, 1); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, 1); + } } - /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); - /* for snapshot-aware defrag */ - if (new) { - if (ret) { - free_sa_defrag_extent(new); - atomic_dec(&fs_info->defrag_running); - } else { - relink_file_extents(new); - } - } - /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -4238,18 +3637,30 @@ out: } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, u64 objectid, - const char *name, int name_len) + struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; + const char *name = dentry->d_name.name; + int name_len = dentry->d_name.len; u64 index; int ret; + u64 objectid; u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { + objectid = inode->root->root_key.objectid; + } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + objectid = inode->location.objectid; + } else { + WARN_ON(1); + return -EINVAL; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4271,13 +3682,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, - dir_ino, &index, name, name_len); - if (ret < 0) { - if (ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto out; - } + /* + * This is a placeholder inode for a subvolume we didn't have a + * reference to at the time of the snapshot creation. 
In the meantime + * we could have renamed the real subvol link into our snapshot, so + * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * Instead simply lookup the dir_index_item for this entry so we can + * remove it. Otherwise we know we have a ref to the root and we can + * call btrfs_del_root_ref, and it _shouldn't_ fail. + */ + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); if (IS_ERR_OR_NULL(di)) { @@ -4292,8 +3706,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); index = key.offset; + btrfs_release_path(path); + } else { + ret = btrfs_del_root_ref(trans, objectid, + root->root_key.objectid, dir_ino, + &index, name, name_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } } - btrfs_release_path(path); ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); if (ret) { @@ -4487,8 +3909,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); - ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid, - dentry->d_name.name, dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { err = ret; btrfs_abort_transaction(trans, ret); @@ -4583,10 +4004,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(trans); if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, - BTRFS_I(inode)->location.objectid, - dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_subvol(trans, dir, dentry); goto out; } @@ -5157,7 +4575,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - block_end - cur_offset, 0); + block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -5728,7 +5146,6 @@ static void inode_tree_add(struct inode *inode) static void inode_tree_del(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int empty = 0; @@ -5741,7 +5158,6 @@ static void inode_tree_del(struct inode *inode) spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { - synchronize_srcu(&fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); @@ -5843,7 +5259,11 @@ static struct inode *new_simple_dir(struct super_block *s, set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &btrfs_dir_ro_inode_operations; + /* + * We only need lookup, the rest is read-only and there's no inode + * associated with the dentry + */ + inode->i_op = &simple_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; @@ -6934,18 +6354,27 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -/* - * a bit scary, this does extent mapping from logical file offset to the disk. - * the ugly parts come from merging extents from the disk with the in-ram - * representation. This gets more complex because of the data=ordered code, - * where the in-ram extents might be locked pending data=ordered completion. 
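
The rewritten btrfs_get_extent() documentation that follows makes the lookup contract explicit: only the first extent overlapping the requested range is returned, and a caller interested in the whole range must advance past each returned mapping and look again. A small userspace model of that contract is sketched below, assuming a hypothetical sorted extent array in place of the real extent-map tree and B-tree lookup.

/*
 * Userspace model of the "first overlapping extent" contract documented
 * for btrfs_get_extent().  The extent list is a hypothetical stand-in
 * for the in-memory extent map tree.
 */
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start, len; };

/* Sorted by start, non-overlapping, with holes allowed between entries. */
static const struct extent extents[] = {
	{ 0,     4096 },
	{ 8192,  4096 },
	{ 16384, 8192 },
};

/* Return the first extent overlapping [start, start + len), or NULL. */
static const struct extent *first_overlapping(uint64_t start, uint64_t len)
{
	for (unsigned int i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		const struct extent *em = &extents[i];

		if (em->start + em->len <= start)
			continue;                /* ends before the range */
		if (em->start >= start + len)
			break;                   /* starts after the range */
		return em;
	}
	return NULL;
}

int main(void)
{
	uint64_t pos = 0, end = 32768;

	/* A caller that needs the whole range iterates, as the comment warns. */
	while (pos < end) {
		const struct extent *em = first_overlapping(pos, end - pos);

		if (!em)
			break;
		printf("extent at %llu len %llu\n",
		       (unsigned long long)em->start,
		       (unsigned long long)em->len);
		pos = em->start + em->len;       /* advance past this mapping */
	}
	return 0;
}
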
+/** + * btrfs_get_extent - Lookup the first extent overlapping a range in a file. + * @inode: file to search in + * @page: page to read extent data into if the extent is inline + * @pg_offset: offset into @page to copy to + * @start: file offset + * @len: length of range starting at @start + * + * This returns the first &struct extent_map which overlaps with the given + * range, reading it from the B-tree and caching it if necessary. Note that + * there may be more extents which overlap the given range after the returned + * extent_map. * - * This also copies inline extents directly into the page. + * If @page is not NULL and the extent is inline, this also reads the extent + * data directly into the page and marks the extent up to date in the io_tree. + * + * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; @@ -6962,7 +6391,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; - const bool new_inline = !page || create; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -7085,8 +6513,7 @@ next: goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, - new_inline, em); + btrfs_extent_item_to_extent_map(inode, path, item, !page, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -7098,7 +6525,7 @@ next: size_t extent_offset; size_t copy_size; - if (new_inline) + if (!page) goto out; size = btrfs_file_extent_ram_bytes(leaf, item); @@ -7181,7 +6608,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 delalloc_end; int err = 0; - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(inode, NULL, 0, start, len); if (IS_ERR(em)) return em; /* @@ -7806,7 +7233,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto err; } - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -8358,8 +7785,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, * contention. 
*/ if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio, - file_offset); + ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset, + NULL); if (ret) return ret; } @@ -8872,7 +8299,8 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, page_end - start + 1); if (ordered) { - end = min(page_end, ordered->file_offset + ordered->len - 1); + end = min(page_end, + ordered->file_offset + ordered->num_bytes - 1); /* * IO on this page will never be started, so we need * to account for any ordered extents now @@ -9073,7 +8501,6 @@ again: ret = VM_FAULT_SIGBUS; goto out_unlock; } - ret2 = 0; /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) @@ -9097,12 +8524,10 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); - if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - } + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); @@ -9400,7 +8825,7 @@ void btrfs_destroy_inode(struct inode *inode) else { btrfs_err(fs_info, "found ordered extent %llu %llu on inode cleanup", - ordered->file_offset, ordered->len); + ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -9538,7 +8963,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; - u64 root_objectid; int ret; bool root_log_pinned = false; bool dest_log_pinned = false; @@ -9556,9 +8980,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_init_log_ctx(&ctx_dest, new_inode); /* close the race window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&fs_info->subvol_sem); - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* @@ -9645,10 +9068,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), @@ -9664,10 +9084,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, - new_dentry->d_name.name, - new_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), @@ -9792,9 +9209,8 @@ out_fail: ret = ret ? 
ret : ret2; } out_notrans: - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&fs_info->subvol_sem); - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); ASSERT(list_empty(&ctx_root.list)); @@ -9866,7 +9282,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); u64 index = 0; - u64 root_objectid; int ret; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; @@ -9974,10 +9389,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), @@ -9996,10 +9408,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - root_objectid = BTRFS_I(new_inode)->location.objectid; - ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, - new_dentry->d_name.name, - new_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), @@ -10835,7 +10244,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -11003,11 +10412,6 @@ static const struct inode_operations btrfs_dir_inode_operations = { .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, }; -static const struct inode_operations btrfs_dir_ro_inode_operations = { - .lookup = btrfs_lookup, - .permission = btrfs_permission, - .update_time = btrfs_update_time, -}; static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1ee0b775e65..4f4b13830b25 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -704,11 +704,17 @@ static noinline int create_subvol(struct inode *dir, btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); @@ -1122,7 +1128,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) @@ -3237,6 +3243,7 @@ static void 
btrfs_double_extent_lock(struct inode *inode1, u64 loff1, static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { + const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; /* @@ -3244,7 +3251,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); - ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1); + ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); return ret; @@ -3720,24 +3727,18 @@ process_slot: ret = 0; if (last_dest_end < destoff + len) { - struct btrfs_clone_extent_info clone_info = { 0 }; /* - * We have an implicit hole (NO_HOLES feature is enabled) that - * fully or partially overlaps our cloning range at its end. + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); path->leave_spinning = 0; - /* - * We are dealing with a hole and our clone_info already has a - * disk_offset of 0, we only need to fill the data length and - * file offset. - */ - clone_info.data_len = destoff + len - last_dest_end; - clone_info.file_offset = last_dest_end; ret = btrfs_punch_hole_range(inode, path, last_dest_end, destoff + len - 1, - &clone_info, &trans); + NULL, &trans); if (ret) goto out; @@ -4252,7 +4253,19 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); - if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) + /* + * Copy scrub args to user space even if btrfs_scrub_dev() returned an + * error. This is important as it allows user space to know how much + * progress scrub has done. For example, if scrub is canceled we get + * -ECANCELED from btrfs_scrub_dev() and return that error back to user + * space. Later user space can inspect the progress from the structure + * btrfs_ioctl_scrub_args and resume scrub from where it left off + * previously (btrfs-progs does this). + * If we fail to copy the btrfs_ioctl_scrub_args structure to user space + * then return -EFAULT to signal the structure was not copied or it may + * be corrupt and unreliable due to a partial copy. 
+ */ + if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; if (!(sa->flags & BTRFS_SCRUB_READONLY)) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fb09bc2f8e4d..ecb9fb6a6fe0 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -20,9 +20,9 @@ static struct kmem_cache *btrfs_ordered_extent_cache; static u64 entry_end(struct btrfs_ordered_extent *entry) { - if (entry->file_offset + entry->len < entry->file_offset) + if (entry->file_offset + entry->num_bytes < entry->file_offset) return (u64)-1; - return entry->file_offset + entry->len; + return entry->file_offset + entry->num_bytes; } /* returns NULL if the insertion worked, or it returns the node it did find @@ -52,14 +52,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } -static void ordered_data_tree_panic(struct inode *inode, int errno, - u64 offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - btrfs_panic(fs_info, errno, - "Inconsistency in ordered tree at offset %llu", offset); -} - /* * look for a given offset in the tree, and if it can't be found return the * first lesser offset @@ -120,7 +112,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) { if (file_offset < entry->file_offset || - entry->file_offset + entry->len <= file_offset) + entry->file_offset + entry->num_bytes <= file_offset) return 0; return 1; } @@ -129,7 +121,7 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, u64 len) { if (file_offset + len <= entry->file_offset || - entry->file_offset + entry->len <= file_offset) + entry->file_offset + entry->num_bytes <= file_offset) return 0; return 1; } @@ -161,19 +153,14 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, } /* allocate and add a new ordered_extent into the per-inode tree. - * file_offset is the logical offset in the file - * - * start is the disk block number of an extent already reserved in the - * extent allocation tree - * - * len is the length of the extent * * The tree is given a single reference on the ordered extent that was * inserted. 
*/ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int dio, int compress_type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, int dio, + int compress_type) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -187,10 +174,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, return -ENOMEM; entry->file_offset = file_offset; - entry->start = start; - entry->len = len; - entry->disk_len = disk_len; - entry->bytes_left = len; + entry->disk_bytenr = disk_bytenr; + entry->num_bytes = num_bytes; + entry->disk_num_bytes = disk_num_bytes; + entry->bytes_left = num_bytes; entry->inode = igrab(inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; @@ -198,7 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(type, &entry->flags); if (dio) { - percpu_counter_add_batch(&fs_info->dio_bytes, len, + percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, fs_info->delalloc_batch); set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); } @@ -219,7 +206,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, node = tree_insert(&tree->tree, file_offset, &entry->rb_node); if (node) - ordered_data_tree_panic(inode, -EEXIST, file_offset); + btrfs_panic(fs_info, -EEXIST, + "inconsistency in ordered tree at offset %llu", + file_offset); spin_unlock_irq(&tree->lock); spin_lock(&root->ordered_extent_lock); @@ -247,27 +236,30 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, } int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) + u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, + int type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, + int compress_type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 0, compress_type); } @@ -328,8 +320,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, } dec_start = max(*file_offset, entry->file_offset); - dec_end = min(*file_offset + io_size, entry->file_offset + - entry->len); + dec_end = min(*file_offset + io_size, + entry->file_offset + entry->num_bytes); *file_offset = dec_end; if (dec_start > dec_end) { btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", @@ -471,10 +463,11 @@ void btrfs_remove_ordered_extent(struct inode *inode, btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, 
entry->len, false); + btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, + false); if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len, + percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes, fs_info->delalloc_batch); tree = &btrfs_inode->ordered_tree; @@ -534,8 +527,8 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); - if (range_end <= ordered->start || - ordered->start + ordered->disk_len <= range_start) { + if (range_end <= ordered->disk_bytenr || + ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) { list_move_tail(&ordered->root_extent_list, &skipped); cond_resched_lock(&root->ordered_extent_lock); continue; @@ -619,7 +612,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int wait) { u64 start = entry->file_offset; - u64 end = start + entry->len - 1; + u64 end = start + entry->num_bytes - 1; trace_btrfs_ordered_extent_start(inode, entry); @@ -680,7 +673,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - if (ordered->file_offset + ordered->len <= start) { + if (ordered->file_offset + ordered->num_bytes <= start) { btrfs_put_ordered_extent(ordered); break; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4eb0319a86d7..3beb4da4ab41 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -67,14 +67,13 @@ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; - /* disk byte number */ - u64 start; - - /* ram length of the extent in bytes */ - u64 len; - - /* extent length on disk */ - u64 disk_len; + /* + * These fields directly correspond to the same fields in + * btrfs_file_extent_item. 
+ */ + u64 disk_bytenr; + u64 num_bytes; + u64 disk_num_bytes; /* number of bytes that still need writing */ u64 bytes_left; @@ -161,12 +160,15 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, u64 *file_offset, u64 io_size, int uptodate); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); + u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, + int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type); + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, + int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 873b6b694107..61f44e78e3c9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -317,7 +317,7 @@ void btrfs_print_leaf(struct extent_buffer *l) print_uuid_item(l, btrfs_item_ptr_offset(l, i), btrfs_item_size_nr(l, i)); break; - }; + } } } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93aeb2e539a4..98d9a50352d6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1243,7 +1243,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1259,9 +1258,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, return -ENOMEM; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); @@ -1307,7 +1305,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1320,9 +1317,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, if (!tmp) return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -1387,11 +1383,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } + quota_root = fs_info->quota_root; qgroup = find_qgroup_rb(fs_info, qgroupid); if (qgroup) { ret = -EEXIST; @@ -1416,15 +1412,13 @@ out: int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -1465,7 +1459,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct 
btrfs_qgroup_limit *limit) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. @@ -1475,9 +1468,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -2423,8 +2415,12 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 nr_old_roots = 0; int ret = 0; + /* + * If quotas get disabled meanwhile, the resources need to be freed and + * we can't just exit here. + */ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) - return 0; + goto out_free; if (new_roots) { if (!maybe_fs_roots(new_roots)) @@ -2578,10 +2574,9 @@ cleanup: int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; - if (!quota_root) + if (!fs_info->quota_root) return ret; spin_lock(&fs_info->qgroup_lock); @@ -2875,7 +2870,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; @@ -2894,8 +2888,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enforce = false; spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -2962,7 +2955,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2980,8 +2972,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, } spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -3232,12 +3223,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup is not enabled"); + "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup rescan is not queued"); + "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; } @@ -3681,7 +3672,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { - struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -3689,7 +3679,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, if (num_bytes == 0) return; - if (!quota_root) + if (!fs_info->quota_root) return; spin_lock(&fs_info->qgroup_lock); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d897a8e5e430..995d4b8b1cfd 100644 ---
a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -517,6 +517,34 @@ static int update_backref_cache(struct btrfs_trans_handle *trans, return 1; } +static bool reloc_root_is_dead(struct btrfs_root *root) +{ + /* + * Pair with set_bit/clear_bit in clean_dirty_subvols and + * btrfs_update_reloc_root. We need to see the updated bit before + * trying to access reloc_root + */ + smp_rmb(); + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return true; + return false; +} + +/* + * Check if this subvolume tree has valid reloc tree. + * + * Reloc tree after swap is considered dead, thus not considered as valid. + * This is enough for most callers, as they don't distinguish dead reloc root + * from no reloc root. But should_ignore_root() below is a special case. + */ +static bool have_reloc_root(struct btrfs_root *root) +{ + if (reloc_root_is_dead(root)) + return false; + if (!root->reloc_root) + return false; + return true; +} static int should_ignore_root(struct btrfs_root *root) { @@ -525,6 +553,10 @@ static int should_ignore_root(struct btrfs_root *root) if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; + /* This root has been merged with its reloc tree, we can ignore it */ + if (reloc_root_is_dead(root)) + return 1; + reloc_root = root->reloc_root; if (!reloc_root) return 0; @@ -1439,7 +1471,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, * The subvolume has reloc tree but the swap is finished, no need to * create/update the dead reloc tree */ - if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + if (reloc_root_is_dead(root)) return 0; if (root->reloc_root) { @@ -1478,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root_item *root_item; int ret; - if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || - !root->reloc_root) + if (!have_reloc_root(root)) goto out; reloc_root = root->reloc_root; @@ -1489,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + /* + * Mark the tree as dead before we change reloc_root so + * have_reloc_root will not touch it from now on. + */ + smp_wmb(); __del_reloc_root(reloc_root); } @@ -2201,6 +2237,11 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + /* + * Need barrier to ensure clear_bit() only happens after + * root->reloc_root = NULL. Pairs with have_reloc_root. + */ + smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { @@ -4291,6 +4332,15 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, block_group->start, buf); } +static const char *stage_to_string(int stage) +{ + if (stage == MOVE_DATA_EXTENTS) + return "move data extents"; + if (stage == UPDATE_DATA_PTRS) + return "update data pointers"; + return "unknown"; +} + /* * function to relocate all extents in a block group. */ @@ -4365,12 +4415,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group->length); while (1) { + int finishes_stage; + mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) err = ret; + finishes_stage = rc->stage; /* * We may have gotten ENOSPC after we already dirtied some * extents. 
If writeout happens while we're relocating a @@ -4396,8 +4449,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (rc->extents_found == 0) break; - btrfs_info(fs_info, "found %llu extents", rc->extents_found); - + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, stage_to_string(finishes_stage)); } WARN_ON(rc->block_group->pinned > 0); @@ -4552,6 +4605,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } @@ -4614,7 +4668,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) LIST_HEAD(list); ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, @@ -4638,7 +4692,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) * disk_len vs real len like with real inodes since it's all * disk length. */ - new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); + new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; sums->bytenr = new_bytenr; btrfs_add_ordered_sum(ordered, sums); @@ -4717,7 +4771,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, struct btrfs_root *root = pending->root; struct reloc_control *rc = root->fs_info->reloc_ctl; - if (!root->reloc_root || !rc) + if (!rc || !have_reloc_root(root)) return; if (!rc->merge_reloc_tree) @@ -4751,7 +4805,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct reloc_control *rc = root->fs_info->reloc_ctl; int ret; - if (!root->reloc_root || !rc) + if (!rc || !have_reloc_root(root)) return 0; rc = root->fs_info->reloc_ctl; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3b17b647d002..612411c74550 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -376,11 +376,13 @@ again: leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - - WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); - WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); - WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || + (btrfs_root_ref_name_len(leaf, ref) != name_len) || + memcmp_extent_buffer(leaf, name, ptr, name_len)) { + err = -ENOENT; + goto out; + } *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 21de630b0730..61b37c56a7fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -8,6 +8,7 @@ #include <linux/sched/mm.h> #include <crypto/hash.h> #include "ctree.h" +#include "discard.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" @@ -3577,17 +3578,27 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * This can easily boost the amount of SYSTEM chunks if cleaner * thread can't be triggered fast enough, and use up all space * of btrfs_super_block::sys_chunk_array + * + * While for dev replace, we need to try our best to mark block + * group RO, to prevent race between: + * - Write duplication + * Contains latest data + * - Scrub copy + * Contains data from commit tree + * + * If target block group is not 
marked RO, nocow writes can + * be overwritten by scrub copy, causing data corruption. + * So for dev-replace, it's not allowed to continue if a block + * group is not RO. */ - ret = btrfs_inc_block_group_ro(cache, false); - scrub_pause_off(fs_info); - + ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); if (ret == 0) { ro_set = 1; - } else if (ret == -ENOSPC) { + } else if (ret == -ENOSPC && !sctx->is_dev_replace) { /* * btrfs_inc_block_group_ro return -ENOSPC when it * failed in creating new chunk for metadata. - * It is not a problem for scrub/replace, because + * It is not a problem for scrub, because * metadata are always cowed, and our scrub paused * commit_transactions. */ @@ -3596,9 +3607,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_warn(fs_info, "failed setting block group ro: %d", ret); btrfs_put_block_group(cache); + scrub_pause_off(fs_info); break; } + /* + * Now the target block is marked RO, wait for nocow writes to + * finish before dev-replace. + * COW is fine, as COW never overwrites extents in commit tree. + */ + if (sctx->is_dev_replace) { + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, + cache->length); + } + + scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; @@ -3659,7 +3683,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache->removed && !cache->ro && cache->reserved == 0 && cache->used == 0) { spin_unlock(&cache->lock); - btrfs_mark_bg_unused(cache); + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&fs_info->discard_ctl, + cache); + else + btrfs_mark_bg_unused(cache); } else { spin_unlock(&cache->lock); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ae2db5eb1549..a055b657cb85 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1269,7 +1269,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * destination of the stream. */ if (ino == bctx->cur_objectid && - offset >= bctx->sctx->cur_inode_next_write_offset) + offset + bctx->extent_len > + bctx->sctx->cur_inode_next_write_offset) return 0; } @@ -7084,12 +7085,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) spin_unlock(&send_root->root_item_lock); /* - * This is done when we lookup the root, it should already be complete - * by the time we get here. - */ - WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); - - /* * Userspace tools do the checks and warn the user if it's * not RO. 
*/ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f09aa6ee9113..01297c5b2666 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -159,10 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; @@ -173,7 +172,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - if (system_chunk) + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) profile = btrfs_system_alloc_profile(fs_info); else profile = btrfs_metadata_alloc_profile(fs_info); @@ -227,8 +226,8 @@ again: /* Check and see if our ticket can be satisified now. */ if ((used + ticket->bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, ticket->bytes, flush, - false)) { + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, + flush)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, ticket->bytes); @@ -626,8 +625,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool system_chunk) + struct btrfs_space_info *space_info) { struct reserve_ticket *ticket; u64 used; @@ -642,14 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) return 0; used = btrfs_space_info_used(space_info, true); - if (can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -665,7 +663,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - u64 used, bool system_chunk) + u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); @@ -673,8 +671,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, - system_chunk)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -765,8 +762,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -785,8 +781,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) return; } to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info, 
- false); + space_info); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { @@ -858,8 +853,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -990,8 +984,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) + enum btrfs_reserve_flush_enum flush) { struct reserve_ticket ticket; u64 used; @@ -1013,8 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk))) { + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); ret = 0; @@ -1054,8 +1046,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, - used, system_chunk) && + need_do_async_reclaim(fs_info, space_info, used) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -1092,10 +1083,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - bool system_chunk = (root == fs_info->chunk_root); ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush, system_chunk); + orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 1a349e3f9cc1..24514cd2c6c1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f452a94abdc3..0616a5434793 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -46,6 +46,7 @@ #include "sysfs.h" #include "tests/btrfs-tests.h" #include "block-group.h" +#include "discard.h" #include "qgroup.h" #define CREATE_TRACE_POINTS @@ -146,6 +147,8 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function if (sb_rdonly(sb)) return; + btrfs_discard_stop(fs_info); + /* btrfs handle error by forcing the filesystem readonly */ sb->s_flags |= SB_RDONLY; btrfs_info(fs_info, "forced readonly"); @@ -313,6 +316,7 @@ enum { Opt_datasum, Opt_nodatasum, Opt_defrag, Opt_nodefrag, Opt_discard, Opt_nodiscard, + Opt_discard_mode, Opt_nologreplay, Opt_norecovery, Opt_ratio, @@ -375,6 +379,7 @@ static const match_table_t tokens = { 
{Opt_defrag, "autodefrag"}, {Opt_nodefrag, "noautodefrag"}, {Opt_discard, "discard"}, + {Opt_discard_mode, "discard=%s"}, {Opt_nodiscard, "nodiscard"}, {Opt_nologreplay, "nologreplay"}, {Opt_norecovery, "norecovery"}, @@ -695,12 +700,26 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, info->metadata_ratio); break; case Opt_discard: - btrfs_set_and_info(info, DISCARD, - "turning on discard"); + case Opt_discard_mode: + if (token == Opt_discard || + strcmp(args[0].from, "sync") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); + btrfs_set_and_info(info, DISCARD_SYNC, + "turning on sync discard"); + } else if (strcmp(args[0].from, "async") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); + btrfs_set_and_info(info, DISCARD_ASYNC, + "turning on async discard"); + } else { + ret = -EINVAL; + goto out; + } break; case Opt_nodiscard: - btrfs_clear_and_info(info, DISCARD, + btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); + btrfs_clear_and_info(info, DISCARD_ASYNC, + "turning off async discard"); break; case Opt_space_cache: case Opt_space_cache_version: @@ -1322,8 +1341,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nologreplay"); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(info, DISCARD)) + if (btrfs_test_opt(info, DISCARD_SYNC)) seq_puts(seq, ",discard"); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + seq_puts(seq, ",discard=async"); if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) @@ -1713,6 +1734,14 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, btrfs_cleanup_defrag_inodes(fs_info); } + /* If we toggled discard async */ + if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_resume(fs_info); + else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_cleanup(fs_info); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); } @@ -1760,6 +1789,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) */ cancel_work_sync(&fs_info->async_reclaim_work); + btrfs_discard_cleanup(fs_info); + /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); /* avoid complains from lockdep et al. */ @@ -2104,7 +2135,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = SZ_4M; - if (!mixed && total_free_meta - thresh < block_rsv->size) + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. 
+ */ + if (!mixed && block_rsv->space_info->full && + total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5ebbe8a5ee76..7436422194da 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -12,6 +12,7 @@ #include <crypto/hash.h> #include "ctree.h" +#include "discard.h" #include "disk-io.h" #include "transaction.h" #include "sysfs.h" @@ -339,11 +340,177 @@ static const struct attribute_group btrfs_static_feature_attr_group = { #ifdef CONFIG_BTRFS_DEBUG /* + * Discard statistics and tunables + */ +#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent) + +static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + atomic64_read(&fs_info->discard_ctl.discardable_bytes)); +} +BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); + +static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", + atomic_read(&fs_info->discard_ctl.discardable_extents)); +} +BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); + +static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + fs_info->discard_ctl.discard_bitmap_bytes); +} +BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); + +static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); +} +BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); + +static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + fs_info->discard_ctl.discard_extent_bytes); +} +BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); + +static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", + READ_ONCE(fs_info->discard_ctl.iops_limit)); +} + +static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u32 iops_limit; + int ret; + + ret = kstrtou32(buf, 10, &iops_limit); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->iops_limit, iops_limit); + + return len; +} +BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, + btrfs_discard_iops_limit_store); + +static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", + READ_ONCE(fs_info->discard_ctl.kbps_limit)); +} + +static ssize_t 
btrfs_discard_kbps_limit_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u32 kbps_limit; + int ret; + + ret = kstrtou32(buf, 10, &kbps_limit); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); + + return len; +} +BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, + btrfs_discard_kbps_limit_store); + +static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + READ_ONCE(fs_info->discard_ctl.max_discard_size)); +} + +static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u64 max_discard_size; + int ret; + + ret = kstrtou64(buf, 10, &max_discard_size); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size); + + return len; +} +BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, + btrfs_discard_max_discard_size_store); + +static const struct attribute *discard_debug_attrs[] = { + BTRFS_ATTR_PTR(discard, discardable_bytes), + BTRFS_ATTR_PTR(discard, discardable_extents), + BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), + BTRFS_ATTR_PTR(discard, discard_bytes_saved), + BTRFS_ATTR_PTR(discard, discard_extent_bytes), + BTRFS_ATTR_PTR(discard, iops_limit), + BTRFS_ATTR_PTR(discard, kbps_limit), + BTRFS_ATTR_PTR(discard, max_discard_size), + NULL, +}; + +/* * Runtime debugging exported via sysfs * * /sys/fs/btrfs/debug - applies to module or all filesystems * /sys/fs/btrfs/UUID - applies only to the given filesystem */ +static const struct attribute *btrfs_debug_mount_attrs[] = { + NULL, +}; + static struct attribute *btrfs_debug_feature_attrs[] = { NULL }; @@ -734,10 +901,10 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - if (fs_devs->device_dir_kobj) { - kobject_del(fs_devs->device_dir_kobj); - kobject_put(fs_devs->device_dir_kobj); - fs_devs->device_dir_kobj = NULL; + if (fs_devs->devices_kobj) { + kobject_del(fs_devs->devices_kobj); + kobject_put(fs_devs->devices_kobj); + fs_devs->devices_kobj = NULL; } if (fs_devs->fsid_kobj.state_initialized) { @@ -771,6 +938,19 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } +#ifdef CONFIG_BTRFS_DEBUG + if (fs_info->discard_debug_kobj) { + sysfs_remove_files(fs_info->discard_debug_kobj, + discard_debug_attrs); + kobject_del(fs_info->discard_debug_kobj); + kobject_put(fs_info->discard_debug_kobj); + } + if (fs_info->debug_kobj) { + sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + kobject_del(fs_info->debug_kobj); + kobject_put(fs_info->debug_kobj); + } +#endif addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); @@ -969,46 +1149,120 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct hd_struct *disk; struct kobject *disk_kobj; - if 
(!fs_devices->device_dir_kobj) + if (!fs_devices->devices_kobj) return -EINVAL; - if (one_device && one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + if (one_device) { + if (one_device->bdev) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(fs_devices->devices_kobj, + disk_kobj->name); + } - sysfs_remove_link(fs_devices->device_dir_kobj, - disk_kobj->name); - } + kobject_del(&one_device->devid_kobj); + kobject_put(&one_device->devid_kobj); + + wait_for_completion(&one_device->kobj_unregister); - if (one_device) return 0; + } - list_for_each_entry(one_device, - &fs_devices->devices, dev_list) { - if (!one_device->bdev) - continue; - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + list_for_each_entry(one_device, &fs_devices->devices, dev_list) { + + if (one_device->bdev) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(fs_devices->devices_kobj, + disk_kobj->name); + } + kobject_del(&one_device->devid_kobj); + kobject_put(&one_device->devid_kobj); - sysfs_remove_link(fs_devices->device_dir_kobj, - disk_kobj->name); + wait_for_completion(&one_device->kobj_unregister); } return 0; } -int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) +static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) { - if (!fs_devs->device_dir_kobj) - fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->fsid_kobj); + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); - if (!fs_devs->device_dir_kobj) - return -ENOMEM; + val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return 0; + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); + +static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show); + +static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); + +static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); + +static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, in_fs_metadata), + BTRFS_ATTR_PTR(devid, missing), + BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, writeable), + NULL +}; +ATTRIBUTE_GROUPS(devid); + +static void btrfs_release_devid_kobj(struct kobject *kobj) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + memset(&device->devid_kobj, 0, sizeof(struct kobject)); + 
complete(&device->kobj_unregister); } +static struct kobj_type devid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = devid_groups, + .release = btrfs_release_devid_kobj, +}; + int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { @@ -1016,22 +1270,31 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *dev; list_for_each_entry(dev, &fs_devices->devices, dev_list) { - struct hd_struct *disk; - struct kobject *disk_kobj; - - if (!dev->bdev) - continue; if (one_device && one_device != dev) continue; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + if (dev->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; + + disk = dev->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_devices->device_dir_kobj, - disk_kobj, disk_kobj->name); - if (error) + error = sysfs_create_link(fs_devices->devices_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } + + init_completion(&dev->kobj_unregister); + error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, + fs_devices->devices_kobj, "%llu", + dev->devid); + if (error) { + kobject_put(&dev->devid_kobj); break; + } } return error; @@ -1063,27 +1326,49 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, "sysfs: failed to create fsid for sprout"); } +void btrfs_sysfs_update_devid(struct btrfs_device *device) +{ + char tmp[24]; + + snprintf(tmp, sizeof(tmp), "%llu", device->devid); + + if (kobject_rename(&device->devid_kobj, tmp)) + btrfs_warn(device->fs_devices->fs_info, + "sysfs: failed to update devid for %llu", + device->devid); +} + /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; /* + * Creates: + * /sys/fs/btrfs/UUID + * * Can be called by the device discovery thread. 
- * And parent can be specified for seed device */ -int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, - struct kobject *parent) +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { int error; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->fsid_kobj, - &btrfs_ktype, parent, "%pU", fs_devs->fsid); + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, + "%pU", fs_devs->fsid); if (error) { kobject_put(&fs_devs->fsid_kobj); return error; } + fs_devs->devices_kobj = kobject_create_and_add("devices", + &fs_devs->fsid_kobj); + if (!fs_devs->devices_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs device interface"); + kobject_put(&fs_devs->fsid_kobj); + return -ENOMEM; + } + return 0; } @@ -1111,8 +1396,26 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) goto failure; #ifdef CONFIG_BTRFS_DEBUG - error = sysfs_create_group(fsid_kobj, - &btrfs_debug_feature_attr_group); + fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); + if (!fs_info->debug_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + if (error) + goto failure; + + /* Discard directory */ + fs_info->discard_debug_kobj = kobject_create_and_add("discard", + fs_info->debug_kobj); + if (!fs_info->discard_debug_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->discard_debug_kobj, + discard_debug_attrs); if (error) goto failure; #endif @@ -1209,6 +1512,9 @@ void __cold btrfs_exit_sysfs(void) sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); +#ifdef CONFIG_BTRFS_DEBUG + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); +#endif kset_unregister(btrfs_kset); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e10c3adfc30f..c68582add92e 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -18,9 +18,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, - struct kobject *parent); -int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, const u8 *fsid); @@ -36,5 +34,6 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); +void btrfs_sysfs_update_devid(struct btrfs_device *device); #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index a7aca4141788..84fb3fa940a6 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -86,6 +86,27 @@ static void btrfs_destroy_test_fs(void) unregister_filesystem(&test_type); } +struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + INIT_LIST_HEAD(&dev->dev_list); + list_add(&dev->dev_list, 
&fs_info->fs_devices->devices); + + return dev; +} + +static void btrfs_free_dummy_device(struct btrfs_device *dev) +{ + extent_io_tree_release(&dev->alloc_state); + kfree(dev); +} + struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) { struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), @@ -121,7 +142,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) spin_lock_init(&fs_info->qgroup_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); mutex_init(&fs_info->qgroup_ioctl_lock); mutex_init(&fs_info->qgroup_rescan_lock); rwlock_init(&fs_info->tree_mod_log_lock); @@ -132,12 +152,14 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) INIT_LIST_HEAD(&fs_info->dirty_qgroups); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_LIST_HEAD(&fs_info->fs_devices->devices); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); extent_io_tree_init(fs_info, &fs_info->freed_extents[0], IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); extent_io_tree_init(fs_info, &fs_info->freed_extents[1], IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); + extent_map_tree_init(&fs_info->mapping_tree); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); @@ -150,6 +172,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { struct radix_tree_iter iter; void **slot; + struct btrfs_device *dev, *tmp; if (!fs_info) return; @@ -180,6 +203,11 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->buffer_lock); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, + dev_list) { + btrfs_free_dummy_device(dev); + } btrfs_free_qgroup_config(fs_info); btrfs_free_fs_roots(fs_info); cleanup_srcu_struct(&fs_info->subvol_srcu); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 9e52527357d8..7a2d7ffbe30e 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -46,6 +46,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) { diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 123d9a614357..df7ce874a74b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -441,8 +441,17 @@ static int test_find_first_clear_extent_bit(void) int ret = -EINVAL; test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + /* Test correct handling of empty tree */ + find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + if (start != 0 || end != -1) { + test_err( + "error getting a range from completely empty tree: start %llu end %llu", + start, end); + goto out; + } /* * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 4a7f796c9900..57379e96ccc9 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ 
b/fs/btrfs/tests/extent-map-tests.c @@ -6,6 +6,9 @@ #include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../block-group.h" static void free_extent_map_tree(struct extent_map_tree *em_tree) { @@ -437,11 +440,153 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } +struct rmap_test_vector { + u64 raid_type; + u64 physical_start; + u64 data_stripe_size; + u64 num_data_stripes; + u64 num_stripes; + /* Assume we won't have more than 5 physical stripes */ + u64 data_stripe_phys_start[5]; + bool expected_mapped_addr; + /* Physical to logical addresses */ + u64 mapped_logical[5]; +}; + +static int test_rmap_block(struct btrfs_fs_info *fs_info, + struct rmap_test_vector *test) +{ + struct extent_map *em; + struct map_lookup *map = NULL; + u64 *logical = NULL; + int i, out_ndaddrs, out_stripe_len; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL); + if (!map) { + kfree(em); + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + /* Start at 4GiB logical address */ + em->start = SZ_4G; + em->len = test->data_stripe_size * test->num_data_stripes; + em->block_len = em->len; + em->orig_block_len = test->data_stripe_size; + em->map_lookup = map; + + map->num_stripes = test->num_stripes; + map->stripe_len = BTRFS_STRIPE_LEN; + map->type = test->raid_type; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = btrfs_alloc_dummy_device(fs_info); + + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + map->stripes[i].dev = dev; + map->stripes[i].physical = test->data_stripe_phys_start[i]; + } + + write_lock(&fs_info->mapping_tree.lock); + ret = add_extent_mapping(&fs_info->mapping_tree, em, 0); + write_unlock(&fs_info->mapping_tree.lock); + if (ret) { + test_err("error adding block group mapping to mapping tree"); + goto out_free; + } + + ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + &logical, &out_ndaddrs, &out_stripe_len); + if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { + test_err("didn't rmap anything but expected %d", + test->expected_mapped_addr); + goto out; + } + + if (out_stripe_len != BTRFS_STRIPE_LEN) { + test_err("calculated stripe length doesn't match"); + goto out; + } + + if (out_ndaddrs != test->expected_mapped_addr) { + for (i = 0; i < out_ndaddrs; i++) + test_msg("mapped %llu", logical[i]); + test_err("unexpected number of mapped addresses: %d", out_ndaddrs); + goto out; + } + + for (i = 0; i < out_ndaddrs; i++) { + if (logical[i] != test->mapped_logical[i]) { + test_err("unexpected logical address mapped"); + goto out; + } + } + + ret = 0; +out: + write_lock(&fs_info->mapping_tree.lock); + remove_extent_mapping(&fs_info->mapping_tree, em); + write_unlock(&fs_info->mapping_tree.lock); + /* For us */ + free_extent_map(em); +out_free: + /* For the tree */ + free_extent_map(em); + kfree(logical); + return ret; +} + int btrfs_test_extent_map(void) { struct btrfs_fs_info *fs_info = NULL; struct extent_map_tree *em_tree; - int ret = 0; + int ret = 0, i; + struct rmap_test_vector rmap_tests[] = { + { + /* + * Test a chunk with 2 data stripes one of which + * interesects the physical address of the super block + * is correctly recognised. 
+ */ + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .physical_start = SZ_64M - SZ_4M, + .data_stripe_size = SZ_256M, + .num_data_stripes = 2, + .num_stripes = 2, + .data_stripe_phys_start = + {SZ_64M - SZ_4M, SZ_64M - SZ_4M + SZ_256M}, + .expected_mapped_addr = true, + .mapped_logical= {SZ_4G + SZ_4M} + }, + { + /* + * Test that out-of-range physical addresses are + * ignored + */ + + /* SINGLE chunk type */ + .raid_type = 0, + .physical_start = SZ_4G, + .data_stripe_size = SZ_256M, + .num_data_stripes = 1, + .num_stripes = 1, + .data_stripe_phys_start = {SZ_256M}, + .expected_mapped_addr = false, + .mapped_logical = {0} + } + }; test_msg("running extent_map tests"); @@ -474,6 +619,13 @@ int btrfs_test_extent_map(void) goto out; ret = test_case_4(fs_info, em_tree); + test_msg("running rmap tests"); + for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) { + ret = test_rmap_block(fs_info, &rmap_tests[i]); + if (ret) + goto out; + } + out: kfree(em_tree); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1a846bf6e197..914eea5ba6a7 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -452,9 +452,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 09ecf7dc7b08..24a8c714f56c 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -263,7 +263,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -283,7 +283,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -333,7 +333,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -356,7 +356,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ 
-384,7 +384,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -413,7 +413,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -435,7 +435,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -469,7 +469,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -498,7 +498,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -528,7 +528,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -561,7 +561,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -596,7 +596,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -630,7 +630,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -665,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 
offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -727,8 +727,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, - sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -755,7 +754,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -788,7 +787,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +871,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -894,8 +893,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, - 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 09aaca1efd62..ac035a6fa003 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) * *cough*backref walking code*cough* */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_err("couldn't allocate dummy buffer"); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cfc08ef9b876..33dcc88b428a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -147,13 +147,14 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } -static noinline void switch_commit_roots(struct btrfs_transaction *trans) +static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) { + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; down_write(&fs_info->commit_root_sem); - list_for_each_entry_safe(root, tmp, &trans->switch_commits, + 
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); @@ -165,16 +166,17 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) } /* We can free old roots now. */ - spin_lock(&trans->dropped_roots_lock); - while (!list_empty(&trans->dropped_roots)) { - root = list_first_entry(&trans->dropped_roots, + spin_lock(&cur_trans->dropped_roots_lock); + while (!list_empty(&cur_trans->dropped_roots)) { + root = list_first_entry(&cur_trans->dropped_roots, struct btrfs_root, root_list); list_del_init(&root->root_list); - spin_unlock(&trans->dropped_roots_lock); + spin_unlock(&cur_trans->dropped_roots_lock); + btrfs_free_log(trans, root); btrfs_drop_and_free_fs_root(fs_info, root); - spin_lock(&trans->dropped_roots_lock); + spin_lock(&cur_trans->dropped_roots_lock); } - spin_unlock(&trans->dropped_roots_lock); + spin_unlock(&cur_trans->dropped_roots_lock); up_write(&fs_info->commit_root_sem); } @@ -1421,7 +1423,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_cowonly_roots(trans); if (ret) goto out; - switch_commit_roots(trans->transaction); + switch_commit_roots(trans); ret = btrfs_write_and_wait_transaction(trans); if (ret) btrfs_handle_fs_error(fs_info, ret, @@ -2013,6 +2015,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); + /* + * Some places just start a transaction to commit it. We need to make + * sure that if this commit fails that the abort code actually marks the + * transaction as failed, so set trans->dirty to make the abort code do + * the right thing. + */ + trans->dirty = true; + /* Stop the commit early if ->aborted is set */ if (unlikely(READ_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; @@ -2301,7 +2311,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); - switch_commit_roots(cur_trans); + switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 493d4d9e0f79..a92f8a6dd192 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -227,7 +227,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, */ if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { file_extent_err(leaf, slot, - "invalid item size, have %u expect [%lu, %u)", + "invalid item size, have %u expect [%zu, %u)", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, SZ_4K); return -EUCLEAN; @@ -332,7 +332,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, } static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, - int slot) + int slot, struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; @@ -356,6 +356,118 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + + prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (prev_csum_end > key->offset) { + generic_err(leaf, slot - 1, +"csum end range (%llu) goes beyond the start range (%llu) of the next csum item", + prev_csum_end, 
key->offset); + return -EUCLEAN; + } + } + return 0; +} + +/* Inode item error output has the same format as dir_item_err() */ +#define inode_item_err(eb, slot, fmt, ...) \ + dir_item_err(eb, slot, fmt, __VA_ARGS__) + +static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_inode_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY); + + /* For XATTR_ITEM, location key should be all 0 */ + if (item_key.type == BTRFS_XATTR_ITEM_KEY) { + if (key->type != 0 || key->objectid != 0 || key->offset != 0) + return -EUCLEAN; + return 0; + } + + if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID) { + if (is_inode_item) { + generic_err(leaf, slot, + "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } else { + dir_item_err(leaf, slot, +"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } + return -EUCLEAN; + } + if (key->offset != 0) { + if (is_inode_item) + inode_item_err(leaf, slot, + "invalid key offset: has %llu expect 0", + key->offset); + else + dir_item_err(leaf, slot, + "invalid location key offset:has %llu expect 0", + key->offset); + return -EUCLEAN; + } + return 0; +} + +static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_root_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); + + /* No such tree id */ + if (key->objectid == 0) { + if (is_root_item) + generic_err(leaf, slot, "invalid root id 0"); + else + dir_item_err(leaf, slot, + "invalid location key root id 0"); + return -EUCLEAN; + } + + /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ + if (!is_fstree(key->objectid) && !is_root_item) { + dir_item_err(leaf, slot, + "invalid location key objectid, have %llu expect [%llu, %llu]", + key->objectid, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + + /* + * ROOT_ITEM with non-zero offset means this is a snapshot, created at + * @offset transid. + * Furthermore, for location key in DIR_ITEM, its offset is always -1. + * + * So here we only check offset for reloc tree whose key->offset must + * be a valid tree. 
+ */ + if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + generic_err(leaf, slot, "invalid root id 0 for reloc tree"); + return -EUCLEAN; + } return 0; } @@ -372,12 +484,14 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { + struct btrfs_key location_key; u32 name_len; u32 data_len; u32 max_name_len; u32 total_size; u32 name_hash; u8 dir_type; + int ret; /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { @@ -387,6 +501,25 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; } + /* Location key check */ + btrfs_dir_item_key_to_cpu(leaf, di, &location_key); + if (location_key.type == BTRFS_ROOT_ITEM_KEY) { + ret = check_root_key(leaf, &location_key, slot); + if (ret < 0) + return ret; + } else if (location_key.type == BTRFS_INODE_ITEM_KEY || + location_key.type == 0) { + ret = check_inode_key(leaf, &location_key, slot); + if (ret < 0) + return ret; + } else { + dir_item_err(leaf, slot, + "invalid location key type, have %u, expect %u or %u", + location_key.type, BTRFS_ROOT_ITEM_KEY, + BTRFS_INODE_ITEM_KEY); + return -EUCLEAN; + } + /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { @@ -724,6 +857,44 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, return 0; } +/* + * Enhanced version of chunk item checker. + * + * The common btrfs_check_chunk_valid() doesn't check item size since it needs + * to work on super block sys_chunk_array which doesn't have full item ptr. + */ +static int check_leaf_chunk_item(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_key *key, int slot) +{ + int num_stripes; + + if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect [%zu, %u)", + btrfs_item_size_nr(leaf, slot), + sizeof(struct btrfs_chunk), + BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Let btrfs_check_chunk_valid() handle this error type */ + if (num_stripes == 0) + goto out; + + if (btrfs_chunk_item_size(num_stripes) != + btrfs_item_size_nr(leaf, slot)) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect %lu", + btrfs_item_size_nr(leaf, slot), + btrfs_chunk_item_size(num_stripes)); + return -EUCLEAN; + } +out: + return btrfs_check_chunk_valid(leaf, chunk, key->offset); +} + __printf(3, 4) __cold static void dev_item_err(const struct extent_buffer *eb, int slot, @@ -787,7 +958,7 @@ static int check_dev_item(struct extent_buffer *leaf, } /* Inode item error output has the same format as dir_item_err() */ -#define inode_item_err(fs_info, eb, slot, fmt, ...) \ +#define inode_item_err(eb, slot, fmt, ...) 
\ dir_item_err(eb, slot, fmt, __VA_ARGS__) static int check_inode_item(struct extent_buffer *leaf, @@ -798,30 +969,17 @@ static int check_inode_item(struct extent_buffer *leaf, u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); u32 mode; + int ret; + + ret = check_inode_key(leaf, key, slot); + if (ret < 0) + return ret; - if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || - key->objectid > BTRFS_LAST_FREE_OBJECTID) && - key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && - key->objectid != BTRFS_FREE_INO_OBJECTID) { - generic_err(leaf, slot, - "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", - key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, - BTRFS_FIRST_FREE_OBJECTID, - BTRFS_LAST_FREE_OBJECTID, - BTRFS_FREE_INO_OBJECTID); - return -EUCLEAN; - } - if (key->offset != 0) { - inode_item_err(fs_info, leaf, slot, - "invalid key offset: has %llu expect 0", - key->offset); - return -EUCLEAN; - } iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid inode generation: has %llu expect (0, %llu]", btrfs_inode_generation(leaf, iitem), super_gen + 1); @@ -829,7 +987,7 @@ static int check_inode_item(struct extent_buffer *leaf, } /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid inode generation: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; @@ -842,7 +1000,7 @@ static int check_inode_item(struct extent_buffer *leaf, */ mode = btrfs_inode_mode(leaf, iitem); if (mode & ~valid_mask) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "unknown mode bit detected: 0x%x", mode & ~valid_mask); return -EUCLEAN; @@ -855,20 +1013,20 @@ static int check_inode_item(struct extent_buffer *leaf, */ if (!has_single_bit_set(mode & S_IFMT)) { if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", mode & S_IFMT); return -EUCLEAN; } } if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid nlink: has %u expect no more than 1 for dir", btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "unknown flags detected: 0x%llx", btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK); @@ -884,22 +1042,11 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_root_item ri; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; + int ret; - /* No such tree id */ - if (key->objectid == 0) { - generic_err(leaf, slot, "invalid root id 0"); - return -EUCLEAN; - } - - /* - * Some older kernel may create ROOT_ITEM with non-zero offset, so here - * we only check offset for reloc tree whose key->offset must be a - * valid tree. 
- */ - if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { - generic_err(leaf, slot, "invalid root id 0 for reloc tree"); - return -EUCLEAN; - } + ret = check_root_key(leaf, key, slot); + if (ret < 0) + return ret; if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { generic_err(leaf, slot, @@ -1288,8 +1435,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return 0; } -#define inode_ref_err(fs_info, eb, slot, fmt, args...) \ - inode_item_err(fs_info, eb, slot, fmt, ##args) +#define inode_ref_err(eb, slot, fmt, args...) \ + inode_item_err(eb, slot, fmt, ##args) static int check_inode_ref(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) @@ -1302,7 +1449,7 @@ static int check_inode_ref(struct extent_buffer *leaf, return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", btrfs_item_size_nr(leaf, slot), sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); @@ -1315,7 +1462,7 @@ static int check_inode_ref(struct extent_buffer *leaf, u16 namelen; if (ptr + sizeof(iref) > end) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", ptr, end, sizeof(iref)); return -EUCLEAN; @@ -1324,7 +1471,7 @@ static int check_inode_ref(struct extent_buffer *leaf, iref = (struct btrfs_inode_ref *)ptr; namelen = btrfs_inode_ref_name_len(leaf, iref); if (ptr + sizeof(*iref) + namelen > end) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu namelen %u", ptr, end, namelen); return -EUCLEAN; @@ -1355,7 +1502,7 @@ static int check_leaf_item(struct extent_buffer *leaf, ret = check_extent_data_item(leaf, key, slot, prev_key); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(leaf, key, slot); + ret = check_csum_item(leaf, key, slot, prev_key); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: @@ -1370,7 +1517,7 @@ static int check_leaf_item(struct extent_buffer *leaf, break; case BTRFS_CHUNK_ITEM_KEY: chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - ret = btrfs_check_chunk_valid(leaf, chunk, key->offset); + ret = check_leaf_chunk_item(leaf, chunk, key, slot); break; case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6f757361db53..7dd7552f53a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -808,7 +808,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_del_csums(trans, fs_info, + ret = btrfs_del_csums(trans, + fs_info->csum_root, sums->bytenr, sums->len); if (!ret) @@ -2673,14 +2674,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u32 blocksize; int ret = 0; - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - while (*level > 0) { struct btrfs_key first_key; - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; WARN_ON(btrfs_header_level(cur) != *level); @@ -2731,9 +2727,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, bytenr, - blocksize); + ret = btrfs_pin_reserved_extent(fs_info, + bytenr, blocksize); if (ret) { free_extent_buffer(next); return 
ret; @@ -2748,7 +2743,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - WARN_ON(*level <= 0); if (path->nodes[*level-1]) free_extent_buffer(path->nodes[*level-1]); path->nodes[*level-1] = next; @@ -2756,9 +2750,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level] = 0; cond_resched(); } - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); @@ -2814,8 +2805,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, + ret = btrfs_pin_reserved_extent(fs_info, path->nodes[*level]->start, path->nodes[*level]->len); if (ret) @@ -2895,10 +2885,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, clear_extent_buffer_dirty(next); } - WARN_ON(log->root_key.objectid != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(fs_info, - next->start, next->len); + ret = btrfs_pin_reserved_extent(fs_info, next->start, + next->len); if (ret) goto out; } @@ -3909,10 +3897,32 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return 0; } +static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_ordered_sum *sums) +{ + int ret; + + /* + * Due to extent cloning, we might have logged a csum item that covers a + * subrange of a cloned extent, and later we can end up logging a csum + * item for a larger subrange of the same extent or the entire range. + * This would leave csum items in the log tree that cover the same range + * and break the searches for checksums in the log tree, resulting in + * some checksums missing in the fs/subvolume tree. So just delete (or + * trim and adjust) any existing csum items in the log for this range. 
+ */ + ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + if (ret) + return ret; + + return btrfs_csum_file_blocks(trans, log_root, sums); +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, - struct btrfs_path *src_path, u64 *last_extent, + struct btrfs_path *src_path, int start_slot, int nr, int inode_only, u64 logged_isize) { @@ -3923,7 +3933,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; struct extent_buffer *src = src_path->nodes[0]; - struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3931,9 +3940,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; - bool has_extents = false; - bool need_find_last_extent = true; - bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3942,8 +3948,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; - first_key.objectid = (u64)-1; - ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3964,9 +3968,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); - if (i == nr - 1) - last_key = ins_keys[i]; - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -3980,20 +3981,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } - /* - * We set need_find_last_extent here in case we know we were - * processing other items and then walk into the first extent in - * the inode. If we don't hit an extent then nothing changes, - * we'll do the last search the next time around. - */ - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { - has_extents = true; - if (first_key.objectid == (u64)-1) - first_key = ins_keys[i]; - } else { - need_find_last_extent = false; - } - /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -4054,172 +4041,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = log_csums(trans, log, sums); list_del(&sums->list); kfree(sums); } - if (!has_extents) - return ret; - - if (need_find_last_extent && *last_extent == first_key.offset) { - /* - * We don't have any leafs between our current one and the one - * we processed before that can have file extent items for our - * inode (and have a generation number smaller than our current - * transaction id). - */ - need_find_last_extent = false; - } - - /* - * Because we use btrfs_search_forward we could skip leaves that were - * not modified and then assume *last_extent is valid when it really - * isn't. So back up to the previous leaf and read the end of the last - * extent before we go and fill in holes. 
- */ - if (need_find_last_extent) { - u64 len; - - ret = btrfs_prev_leaf(inode->root, src_path); - if (ret < 0) - return ret; - if (ret) - goto fill_holes; - if (src_path->slots[0]) - src_path->slots[0]--; - src = src_path->nodes[0]; - btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) - goto fill_holes; - extent = btrfs_item_ptr(src, src_path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(src, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(src, extent); - *last_extent = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(src, extent); - *last_extent = key.offset + len; - } - } -fill_holes: - /* So we did prev_leaf, now we need to move to the next leaf, but a few - * things could have happened - * - * 1) A merge could have happened, so we could currently be on a leaf - * that holds what we were copying in the first place. - * 2) A split could have happened, and now not all of the items we want - * are on the same leaf. - * - * So we need to adjust how we search for holes, we need to drop the - * path and re-search for the first extent key we found, and then walk - * forward until we hit the last one we copied. - */ - if (need_find_last_extent) { - /* btrfs_prev_leaf could return 1 without releasing the path */ - btrfs_release_path(src_path); - ret = btrfs_search_slot(NULL, inode->root, &first_key, - src_path, 0, 0); - if (ret < 0) - return ret; - ASSERT(ret == 0); - src = src_path->nodes[0]; - i = src_path->slots[0]; - } else { - i = start_slot; - } - - /* - * Ok so here we need to go through and fill in any holes we may have - * to make sure that holes are punched for those areas in case they had - * extents previously. - */ - while (!done) { - u64 offset, len; - u64 extent_end; - - if (i >= btrfs_header_nritems(src_path->nodes[0])) { - ret = btrfs_next_leaf(inode->root, src_path); - if (ret < 0) - return ret; - ASSERT(ret == 0); - src = src_path->nodes[0]; - i = 0; - need_find_last_extent = true; - } - - btrfs_item_key_to_cpu(src, &key, i); - if (!btrfs_comp_cpu_keys(&key, &last_key)) - done = true; - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - i++; - continue; - } - extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(src, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(src, extent); - extent_end = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(src, extent); - extent_end = key.offset + len; - } - i++; - - if (*last_extent == key.offset) { - *last_extent = extent_end; - continue; - } - offset = *last_extent; - len = key.offset - *last_extent; - ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), - offset, 0, 0, len, 0, len, 0, 0, 0); - if (ret) - break; - *last_extent = extent_end; - } - - /* - * Check if there is a hole between the last extent found in our leaf - * and the first extent in the next leaf. If there is one, we need to - * log an explicit hole so that at replay time we can punch the hole. 
- */ - if (ret == 0 && - key.objectid == btrfs_ino(inode) && - key.type == BTRFS_EXTENT_DATA_KEY && - i == btrfs_header_nritems(src_path->nodes[0])) { - ret = btrfs_next_leaf(inode->root, src_path); - need_find_last_extent = true; - if (ret > 0) { - ret = 0; - } else if (ret == 0) { - btrfs_item_key_to_cpu(src_path->nodes[0], &key, - src_path->slots[0]); - if (key.objectid == btrfs_ino(inode) && - key.type == BTRFS_EXTENT_DATA_KEY && - *last_extent < key.offset) { - const u64 len = key.offset - *last_extent; - - ret = btrfs_insert_file_extent(trans, log, - btrfs_ino(inode), - *last_extent, 0, - 0, len, 0, len, - 0, 0, 0); - *last_extent += len; - } - } - } - /* - * Need to let the callers know we dropped the path so they should - * re-search. - */ - if (!ret && need_find_last_extent) - ret = 1; return ret; } @@ -4274,7 +4100,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log_root, sums); + ret = log_csums(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -4384,7 +4210,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, const u64 i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_path *dst_path = NULL; - u64 last_extent = (u64)-1; + bool dropped_extents = false; int ins_nr = 0; int start_slot; int ret; @@ -4406,8 +4232,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); + start_slot, ins_nr, 1, 0); if (ret < 0) goto out; ins_nr = 0; @@ -4431,8 +4256,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (last_extent == (u64)-1) { - last_extent = key.offset; + if (!dropped_extents) { /* * Avoid logging extent items logged in past fsync calls * and leading to duplicate keys in the log tree. @@ -4446,6 +4270,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } while (ret == -EAGAIN); if (ret) goto out; + dropped_extents = true; } if (ins_nr == 0) start_slot = slot; @@ -4460,7 +4285,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } } if (ins_nr > 0) { - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, start_slot, ins_nr, 1, 0); if (ret > 0) ret = 0; @@ -4647,13 +4472,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { - u64 last_extent = 0; - ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); - /* can't be 1, extent items aren't processed */ - ASSERT(ret <= 0); + start_slot, ins_nr, 1, 0); if (ret < 0) return ret; ins_nr = 0; @@ -4677,13 +4497,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, cond_resched(); } if (ins_nr > 0) { - u64 last_extent = 0; - ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); - /* can't be 1, extent items aren't processed */ - ASSERT(ret <= 0); + start_slot, ins_nr, 1, 0); if (ret < 0) return ret; } @@ -4692,100 +4507,119 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } /* - * If the no holes feature is enabled we need to make sure any hole between the - * last extent and the i_size of our inode is explicitly marked in the log. 
This - * is to make sure that doing something like: - * - * 1) create file with 128Kb of data - * 2) truncate file to 64Kb - * 3) truncate file to 256Kb - * 4) fsync file - * 5) <crash/power failure> - * 6) mount fs and trigger log replay - * - * Will give us a file with a size of 256Kb, the first 64Kb of data match what - * the file had in its first 64Kb of data at step 1 and the last 192Kb of the - * file correspond to a hole. The presence of explicit holes in a log tree is - * what guarantees that log replay will remove/adjust file extent items in the - * fs/subvol tree. - * - * Here we do not need to care about holes between extents, that is already done - * by copy_items(). We also only need to do this in the full sync path, where we - * lookup for extents from the fs/subvol tree only. In the fast path case, we - * lookup the list of modified extent maps and if any represents a hole, we - * insert a corresponding extent representing a hole in the log tree. + * When using the NO_HOLES feature if we punched a hole that causes the + * deletion of entire leafs or all the extent items of the first leaf (the one + * that contains the inode item and references) we may end up not processing + * any extents, because there are no leafs with a generation matching the + * current transaction that have extent items for our inode. So we need to find + * if any holes exist and then log them. We also need to log holes after any + * truncate operation that changes the inode's size. */ -static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, - struct btrfs_path *path) +static int btrfs_log_holes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_inode *inode, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; struct btrfs_key key; - u64 hole_start; - u64 hole_size; - struct extent_buffer *leaf; - struct btrfs_root *log = root->log_root; const u64 ino = btrfs_ino(inode); const u64 i_size = i_size_read(&inode->vfs_inode); + u64 prev_extent_end = 0; + int ret; - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) return 0; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = (u64)-1; + key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - ASSERT(ret != 0); if (ret < 0) return ret; - ASSERT(path->slots[0] > 0); - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { - /* inode does not have any extents */ - hole_start = 0; - hole_size = i_size; - } else { + while (true) { struct btrfs_file_extent_item *extent; + struct extent_buffer *leaf = path->nodes[0]; u64 len; - /* - * If there's an extent beyond i_size, an explicit hole was - * already inserted by copy_items(). - */ - if (key.offset >= i_size) - return 0; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + /* We have a hole, log it. 
*/ + if (prev_extent_end < key.offset) { + const u64 hole_len = key.offset - prev_extent_end; + + /* + * Release the path to avoid deadlocks with other code + * paths that search the root while holding locks on + * leafs from the log root. + */ + btrfs_release_path(path); + ret = btrfs_insert_file_extent(trans, root->log_root, + ino, prev_extent_end, 0, + 0, hole_len, 0, hole_len, + 0, 0, 0); + if (ret < 0) + return ret; + + /* + * Search for the same key again in the root. Since it's + * an extent item and we are holding the inode lock, the + * key must still exist. If it doesn't just emit warning + * and return an error to fall back to a transaction + * commit. + */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (WARN_ON(ret > 0)) + return -ENOENT; + leaf = path->nodes[0]; + } extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) - return 0; + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_ram_bytes(leaf, extent); + prev_extent_end = ALIGN(key.offset + len, + fs_info->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(leaf, extent); + prev_extent_end = key.offset + len; + } - len = btrfs_file_extent_num_bytes(leaf, extent); - /* Last extent goes beyond i_size, no need to log a hole. */ - if (key.offset + len > i_size) - return 0; - hole_start = key.offset + len; - hole_size = i_size - hole_start; + path->slots[0]++; + cond_resched(); } - btrfs_release_path(path); - /* Last extent ends at i_size. */ - if (hole_size == 0) - return 0; + if (prev_extent_end < i_size) { + u64 hole_len; - hole_size = ALIGN(hole_size, fs_info->sectorsize); - ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, - hole_size, 0, hole_size, 0, 0, 0); - return ret; + btrfs_release_path(path); + hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); + ret = btrfs_insert_file_extent(trans, root->log_root, + ino, prev_extent_end, 0, 0, + hole_len, 0, hole_len, + 0, 0, 0); + if (ret < 0) + return ret; + } + + return 0; } /* @@ -4989,6 +4823,50 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, continue; } /* + * If the inode was already logged skip it - otherwise we can + * hit an infinite loop. Example: + * + * From the commit root (previous transaction) we have the + * following inodes: + * + * inode 257 a directory + * inode 258 with references "zz" and "zz_link" on inode 257 + * inode 259 with reference "a" on inode 257 + * + * And in the current (uncommitted) transaction we have: + * + * inode 257 a directory, unchanged + * inode 258 with references "a" and "a2" on inode 257 + * inode 259 with reference "zz_link" on inode 257 + * inode 261 with reference "zz" on inode 257 + * + * When logging inode 261 the following infinite loop could + * happen if we don't skip already logged inodes: + * + * - we detect inode 258 as a conflicting inode, with inode 261 + * on reference "zz", and log it; + * + * - we detect inode 259 as a conflicting inode, with inode 258 + * on reference "a", and log it; + * + * - we detect inode 258 as a conflicting inode, with inode 259 + * on reference "zz_link", and log it - again! After this we + * repeat the above steps forever. + */ + spin_lock(&BTRFS_I(inode)->lock); + /* + * Check the inode's logged_trans only instead of + * btrfs_inode_in_log(). 
This is because the last_log_commit of + * the inode is not updated when we only log that it exists and + * and it has the full sync bit set (see btrfs_log_inode()). + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) { + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_add_delayed_iput(inode); + continue; + } + spin_unlock(&BTRFS_I(inode)->lock); + /* * We are safe logging the other inode without acquiring its * lock as long as we log with the LOG_INODE_EXISTS mode. We * are safe against concurrent renames of the other inode as @@ -5087,7 +4965,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -5265,7 +5142,7 @@ again: ins_start_slot = path->slots[0]; } ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { @@ -5288,17 +5165,13 @@ again: if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } ins_nr = 0; - if (ret) { - btrfs_release_path(path); - continue; - } goto next_slot; } @@ -5311,18 +5184,13 @@ again: goto next_slot; } - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - if (ret) { - ins_nr = 0; - btrfs_release_path(path); - continue; - } ins_nr = 1; ins_start_slot = path->slots[0]; next_slot: @@ -5336,13 +5204,12 @@ next_slot: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - ret = 0; ins_nr = 0; } btrfs_release_path(path); @@ -5357,14 +5224,13 @@ next_key: } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - ret = 0; ins_nr = 0; } @@ -5377,7 +5243,7 @@ next_key: if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_trailing_hole(trans, root, inode, path); + err = btrfs_log_holes(trans, root, inode, path); if (err) goto out_unlock; } @@ -6294,9 +6160,28 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + + /* + * We didn't find the subvol, likely because it was + * deleted. This is ok, simply skip this log and go to + * the next one. + * + * We need to exclude the root because we can't have + * other log replays overwriting this log as we'll read + * it back in a few more times. This will keep our + * block from being modified, and we'll just bail for + * each subsequent pass. 
+ */ + if (ret == -ENOENT) + ret = btrfs_pin_extent_for_log_replay(fs_info, + log->node->start, + log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); + + if (!ret) + goto next; btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root for tree log recovery."); goto error; @@ -6328,7 +6213,6 @@ again: &root->highest_objectid); } - key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); free_extent_buffer(log->commit_root); @@ -6336,9 +6220,10 @@ again: if (ret) goto error; - +next: if (found_key.offset == 0) break; + key.offset = found_key.offset - 1; } btrfs_release_path(path); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 91caab63bdf5..76b84f2397b1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -324,6 +324,8 @@ again_search_slot: } if (ret < 0 && ret != -ENOENT) goto out; + key.offset++; + goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8e5560db285..9cfc668f91f4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -30,6 +30,7 @@ #include "tree-checker.h" #include "space-info.h" #include "block-group.h" +#include "discard.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -61,11 +62,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C3] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 3, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 3, .ncopies = 3, + .nparity = 0, .raid_name = "raid1c3", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, @@ -73,11 +75,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C4] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 4, .devs_min = 4, .tolerated_failures = 3, .devs_increment = 4, .ncopies = 4, + .nparity = 0, .raid_name = "raid1c4", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, @@ -438,39 +441,6 @@ static noinline struct btrfs_fs_devices *find_fsid( ASSERT(fsid); - if (metadata_fsid) { - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by first scanning - * a device which didn't have its fsid/metadata_uuid changed - * at all and the CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(metadata_fsid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { - return fs_devices; - } - } - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by a device that - * has an outdated pair of fsid/metadata_uuid and - * CHANGING_FSID_V2 flag set. 
- */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(fs_devices->metadata_uuid, - fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && - memcmp(metadata_fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { - return fs_devices; - } - } - } - /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (metadata_fsid) { @@ -486,6 +456,47 @@ static noinline struct btrfs_fs_devices *find_fsid( return NULL; } +static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( + struct btrfs_super_block *disk_super) +{ + + struct btrfs_fs_devices *fs_devices; + + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(disk_super->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(fs_devices->metadata_uuid, + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && + memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + + return find_fsid(disk_super->fsid, disk_super->metadata_uuid); +} + + static int btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, int flush, struct block_device **bdev, @@ -669,7 +680,9 @@ error_brelse: /* * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices - * being created with a disk that has already completed its fsid change. + * being created with a disk that has already completed its fsid change. Such + * disk can belong to an fs which has its FSID changed or to one which doesn't. + * Handle both cases here. */ static struct btrfs_fs_devices *find_fsid_inprogress( struct btrfs_super_block *disk_super) @@ -685,7 +698,7 @@ static struct btrfs_fs_devices *find_fsid_inprogress( } } - return NULL; + return find_fsid(disk_super->fsid, NULL); } @@ -697,17 +710,54 @@ static struct btrfs_fs_devices *find_fsid_changed( /* * Handles the case where scanned device is part of an fs that had * multiple successful changes of FSID but curently device didn't - * observe it. Meaning our fsid will be different than theirs. + * observe it. Meaning our fsid will be different than theirs. We need + * to handle two subcases : + * 1 - The fs still continues to have different METADATA/FSID uuids. + * 2 - The fs is switched back to its original FSID (METADATA/FSID + * are equal). 
*/ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + /* Changed UUIDs */ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE) == 0 && memcmp(fs_devices->fsid, disk_super->fsid, - BTRFS_FSID_SIZE) != 0) { + BTRFS_FSID_SIZE) != 0) + return fs_devices; + + /* Unchanged UUIDs */ + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + + return NULL; +} + +static struct btrfs_fs_devices *find_fsid_reverted_metadata( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handle the case where the scanned device is part of an fs whose last + * metadata UUID change reverted it to the original FSID. At the same + * time fs_devices was first created by another constituent device + * which didn't fully observe the operation. This results in a + * btrfs_fs_devices created with metadata/fsid different AND + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the + * fs_devices equal to the FSID of the disk. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && + fs_devices->fsid_change) return fs_devices; - } } return NULL; } @@ -734,24 +784,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, BTRFS_SUPER_FLAG_CHANGING_FSID_V2); if (fsid_change_in_progress) { - if (!has_metadata_uuid) { - /* - * When we have an image which has CHANGING_FSID_V2 set - * it might belong to either a filesystem which has - * disks with completed fsid change or it might belong - * to fs with no UUID changes in effect, handle both. - */ + if (!has_metadata_uuid) fs_devices = find_fsid_inprogress(disk_super); - if (!fs_devices) - fs_devices = find_fsid(disk_super->fsid, NULL); - } else { + else fs_devices = find_fsid_changed(disk_super); - } } else if (has_metadata_uuid) { - fs_devices = find_fsid(disk_super->fsid, - disk_super->metadata_uuid); + fs_devices = find_fsid_with_metadata_uuid(disk_super); } else { - fs_devices = find_fsid(disk_super->fsid, NULL); + fs_devices = find_fsid_reverted_metadata(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); } @@ -781,12 +823,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, * a device which had the CHANGING_FSID_V2 flag then replace the * metadata_uuid/fsid values of the fs_devices.
*/ - if (has_metadata_uuid && fs_devices->fsid_change && + if (fs_devices->fsid_change && found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); - memcpy(fs_devices->metadata_uuid, - disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + if (has_metadata_uuid) + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, + BTRFS_FSID_SIZE); + else + memcpy(fs_devices->metadata_uuid, + disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->fsid_change = false; } @@ -1064,11 +1112,6 @@ static void btrfs_close_bdev(struct btrfs_device *device) static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1080,23 +1123,22 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->missing_devices--; btrfs_close_bdev(device); - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); + if (device->bdev) { + fs_devices->open_devices--; + device->bdev = NULL; } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; + device->fs_info = NULL; + atomic_set(&device->dev_stats_ccnt, 0); + extent_io_tree_release(&device->alloc_state); - synchronize_rcu(); - btrfs_free_device(device); + /* Verify the device is back in a pristine state */ + ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + ASSERT(list_empty(&device->dev_alloc_list)); + ASSERT(list_empty(&device->post_commit_list)); + ASSERT(atomic_read(&device->reada_in_flight) == 0); } static int close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -2130,7 +2172,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; - WARN_ON(!tgtdev); mutex_lock(&fs_devices->device_list_mutex); btrfs_sysfs_rm_device_link(fs_devices, tgtdev); @@ -2875,6 +2916,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; + struct btrfs_block_group *block_group; int ret; /* @@ -2898,6 +2940,12 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; + block_group = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!block_group) + return -ENOENT; + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + btrfs_put_block_group(block_group); + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3881,7 +3929,11 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } } - num_devices = btrfs_num_devices(fs_info); + /* + * rw_devices will not change at the moment, device add/delete/replace + * are excluded by EXCL_OP + */ + num_devices = fs_info->fs_devices->rw_devices; /* * SINGLE profile on-disk has no profile bit, but in-memory we have a @@ -6107,75 +6159,6 @@ int 
btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) -{ - struct extent_map *em; - struct map_lookup *map; - u64 *buf; - u64 bytenr; - u64 length; - u64 stripe_nr; - u64 rmap_len; - int i, j, nr = 0; - - em = btrfs_get_chunk_map(fs_info, chunk_start, 1); - if (IS_ERR(em)) - return -EIO; - - map = em->map_lookup; - length = em->len; - rmap_len = map->stripe_len; - - if (map->type & BTRFS_BLOCK_GROUP_RAID10) - length = div_u64(length, map->num_stripes / map->sub_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - length = div_u64(length, map->num_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - length = div_u64(length, nr_data_stripes(map)); - rmap_len = map->stripe_len * nr_data_stripes(map); - } - - buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); - BUG_ON(!buf); /* -ENOMEM */ - - for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].physical > physical || - map->stripes[i].physical + length <= physical) - continue; - - stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); - - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripe_nr = stripe_nr * map->num_stripes + i; - stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; - } /* else if RAID[56], multiply by nr_data_stripes(). - * Alternatively, just use rmap_len below instead of - * map->stripe_len */ - - bytenr = chunk_start + stripe_nr * rmap_len; - WARN_ON(nr >= map->num_stripes); - for (j = 0; j < nr; j++) { - if (buf[j] == bytenr) - break; - } - if (j == nr) { - WARN_ON(nr >= map->num_stripes); - buf[nr++] = bytenr; - } - } - - *logical = buf; - *naddrs = nr; - *stripe_len = rmap_len; - - free_extent_map(em); - return 0; -} - static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) { bio->bi_private = bbio->private; @@ -6476,19 +6459,14 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) { int index = btrfs_bg_flags_to_raid_index(type); int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; int data_stripes; - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - case BTRFS_BLOCK_GROUP_RAID5: - data_stripes = num_stripes - 1; - break; - case BTRFS_BLOCK_GROUP_RAID6: - data_stripes = num_stripes - 2; - break; - default: + if (nparity) + data_stripes = num_stripes - nparity; + else data_stripes = num_stripes / ncopies; - break; - } + return div_u64(chunk_len, data_stripes); } @@ -7327,6 +7305,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, else btrfs_dev_stat_set(dev, i, 0); } + btrfs_info(fs_info, "device stats zeroed by %s (%d)", + current->comm, task_pid_nr(current)); } else { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) if (stats->nr_items > i) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index fc1b564b9cfe..409f4816fb89 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -120,8 +120,6 @@ struct btrfs_device { /* per-device scrub information */ struct scrub_ctx *scrub_ctx; - struct btrfs_work work; - /* readahead state */ atomic_t reada_in_flight; u64 reada_next; @@ -138,6 +136,10 @@ struct btrfs_device { atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; struct extent_io_tree alloc_state; + + struct completion 
kobj_unregister; + /* For sysfs/FSID/devinfo/devid/ */ + struct kobject devid_kobj; }; /* @@ -168,7 +170,7 @@ btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ write_seqcount_end(&dev->data_seqcount); \ preempt_enable(); \ } -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) #define BTRFS_DEVICE_GETSET_FUNCS(name) \ static inline u64 \ btrfs_device_get_##name(const struct btrfs_device *dev) \ @@ -255,7 +257,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ struct kobject fsid_kobj; - struct kobject *device_dir_kobj; + struct kobject *devices_kobj; struct completion kobj_unregister; }; @@ -417,8 +419,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_bio **bbio_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 len, struct btrfs_io_geometry *io_geom); -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index a6c90a003c12..05615a1099db 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -20,9 +20,13 @@ #include <linux/refcount.h> #include "compression.h" +/* workspace buffer size for s390 zlib hardware support */ +#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) + struct workspace { z_stream strm; char *buf; + unsigned int buf_size; struct list_head list; int level; }; @@ -61,7 +65,21 @@ struct list_head *zlib_alloc_workspace(unsigned int level) zlib_inflate_workspacesize()); workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); workspace->level = level; - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = NULL; + /* + * In case of s390 zlib hardware support, allocate larger workspace + * buffer. If allocation fails, fall back to a single page buffer.
+ */ + if (zlib_deflate_dfltcc_enabled()) { + workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN | GFP_NOIO); + workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; + } + if (!workspace->buf) { + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf_size = PAGE_SIZE; + } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -85,6 +103,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; + unsigned int in_buf_pages; unsigned long len = *total_out; unsigned long nr_dest_pages = *out_pages; const unsigned long max_out = nr_dest_pages * PAGE_SIZE; @@ -102,9 +121,6 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { ret = -ENOMEM; @@ -114,12 +130,51 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, pages[0] = out_page; nr_pages = 1; - workspace->strm.next_in = data_in; + workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = 0; workspace->strm.next_out = cpage_out; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.avail_in = min(len, PAGE_SIZE); while (workspace->strm.total_in < len) { + /* + * Get next input pages and copy the contents to + * the workspace buffer if required. + */ + if (workspace->strm.avail_in == 0) { + bytes_left = len - workspace->strm.total_in; + in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_pages > 1) { + int i; + + for (i = 0; i < in_buf_pages; i++) { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + memcpy(workspace->buf + i * PAGE_SIZE, + data_in, PAGE_SIZE); + start += PAGE_SIZE; + } + workspace->strm.next_in = workspace->buf; + } else { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + start += PAGE_SIZE; + workspace->strm.next_in = data_in; + } + workspace->strm.avail_in = min(bytes_left, + (unsigned long) workspace->buf_size); + } + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (ret != Z_OK) { pr_debug("BTRFS: deflate in loop returned %d\n", @@ -161,33 +216,43 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, /* we're all done */ if (workspace->strm.total_in >= len) break; - - /* we've read in a full page, get a new one */ - if (workspace->strm.avail_in == 0) { - if (workspace->strm.total_out > max_out) - break; - - bytes_left = len - workspace->strm.total_in; - kunmap(in_page); - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap(in_page); - workspace->strm.avail_in = min(bytes_left, - PAGE_SIZE); - workspace->strm.next_in = data_in; - } + if (workspace->strm.total_out > max_out) + break; } workspace->strm.avail_in = 0; - ret = zlib_deflate(&workspace->strm, Z_FINISH); - zlib_deflateEnd(&workspace->strm); - - if (ret != Z_STREAM_END) { - ret = -EIO; - goto out; + /* + * Call deflate with Z_FINISH flush parameter providing more output + * space but no more input data, until it returns with Z_STREAM_END. 
+ */ + while (ret != Z_STREAM_END) { + ret = zlib_deflate(&workspace->strm, Z_FINISH); + if (ret == Z_STREAM_END) + break; + if (ret != Z_OK && ret != Z_BUF_ERROR) { + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } else if (workspace->strm.avail_out == 0) { + /* get another page for the stream end */ + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.next_out = cpage_out; + } } + zlib_deflateEnd(&workspace->strm); if (workspace->strm.total_out >= workspace->strm.total_in) { ret = -E2BIG; @@ -231,7 +296,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -270,7 +335,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; if (workspace->strm.avail_in == 0) { unsigned long tmp; @@ -320,7 +385,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, workspace->strm.total_in = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -364,7 +429,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, buf_offset = 0; bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, bytes_left); kaddr = kmap_atomic(dest_page); @@ -375,7 +440,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, bytes_left -= bytes; next: workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; } if (ret != Z_STREAM_END && bytes_left != 0) diff --git a/fs/buffer.c b/fs/buffer.c index d8c7242426bb..b8d28370cfd7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1433,7 +1433,7 @@ static bool has_bh_in_lru(int cpu, void *dummy) void invalidate_bh_lrus(void) { - on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL); + on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); @@ -3031,11 +3031,9 @@ static void end_bio_bh_io_sync(struct bio *bio) * errors, this only handles the "we need to be able to * do IO at the final sector" case. */ -void guard_bio_eod(int op, struct bio *bio) +void guard_bio_eod(struct bio *bio) { sector_t maxsector; - struct bio_vec *bvec = bio_last_bvec_all(bio); - unsigned truncated_bytes; struct hd_struct *part; rcu_read_lock(); @@ -3061,28 +3059,7 @@ void guard_bio_eod(int op, struct bio *bio) if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; - /* Uhhuh. We've got a bio that straddles the device size! 
*/ - truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); - - /* - * The bio contains more than one segment which spans EOD, just return - * and let IO layer turn it into an EIO - */ - if (truncated_bytes > bvec->bv_len) - return; - - /* Truncate the bio.. */ - bio->bi_iter.bi_size -= truncated_bytes; - bvec->bv_len -= truncated_bytes; - - /* ..and clear the end of the buffer for reads */ - if (op == REQ_OP_READ) { - struct bio_vec bv; - - mp_bvec_last_segment(bvec, &bv); - zero_user(bv.bv_page, bv.bv_offset + bv.bv_len, - truncated_bytes); - } + bio_truncate(bio, maxsector << 9); } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, @@ -3118,15 +3095,15 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - /* Take care of bh's that straddle the end of the device */ - guard_bio_eod(op, bio); - if (buffer_meta(bh)) op_flags |= REQ_META; if (buffer_prio(bh)) op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + /* Take care of bh's that straddle the end of the device */ + guard_bio_eod(bio); + if (wbc) { wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f5a38910a82b..9d09bb53c1ab 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1011,18 +1011,13 @@ static int __ceph_is_single_caps(struct ceph_inode_info *ci) return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); } -static int __ceph_is_any_caps(struct ceph_inode_info *ci) -{ - return !RB_EMPTY_ROOT(&ci->i_caps); -} - int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); - ret = __ceph_is_any_caps(ci); + ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; @@ -1099,15 +1094,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - /* when reconnect denied, we remove session caps forcibly, - * i_wr_ref can be non-zero. If there are ongoing write, - * keep i_snap_realm. - */ - if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) - drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) { + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. 
+ */ + if (ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); - if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); + } } struct cap_msg_args { @@ -2764,7 +2760,19 @@ int ceph_get_caps(struct file *filp, int need, int want, if (ret == -EAGAIN) continue; if (!ret) { + struct ceph_mds_client *mdsc = fsc->mdsc; + struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); + + cw.ino = inode->i_ino; + cw.tgid = current->tgid; + cw.need = need; + cw.want = want; + + spin_lock(&mdsc->caps_list_lock); + list_add(&cw.list, &mdsc->cap_wait_list); + spin_unlock(&mdsc->caps_list_lock); + add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; @@ -2778,6 +2786,11 @@ int ceph_get_caps(struct file *filp, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); + + spin_lock(&mdsc->caps_list_lock); + list_del(&cw.list); + spin_unlock(&mdsc->caps_list_lock); + if (ret == -EAGAIN) continue; } @@ -2928,7 +2941,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index facb387c2735..c281f32b54f7 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; int total, avail, used, reserved, min, i; + struct cap_wait *cw; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" @@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p) } mutex_unlock(&mdsc->mutex); + seq_printf(s, "\n\nWaiters:\n--------\n"); + seq_printf(s, "tgid ino need want\n"); + seq_printf(s, "-----------------------------------------------------\n"); + + spin_lock(&mdsc->caps_list_lock); + list_for_each_entry(cw, &mdsc->cap_wait_list, list) { + seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino, + ceph_cap_string(cw->need), + ceph_cap_string(cw->want)); + } + spin_unlock(&mdsc->caps_list_lock); + return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 88687ed65cff..21ada2c4a88c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -708,8 +708,10 @@ void ceph_mdsc_release_request(struct kref *kref) /* avoid calling iput_final() in mds dispatch threads */ ceph_async_iput(req->r_inode); } - if (req->r_parent) + if (req->r_parent) { ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + ceph_async_iput(req->r_parent); + } ceph_async_iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); @@ -2015,7 +2017,7 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) if (!nr) return; val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); - if (!(val % CEPH_CAPS_PER_RELEASE)) { + if ((val % CEPH_CAPS_PER_RELEASE) < nr) { atomic_set(&mdsc->cap_reclaim_pending, 0); ceph_queue_cap_reclaim_work(mdsc); } @@ -2032,12 +2034,13 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); - int order, num_entries; + unsigned int num_entries; + int order; spin_lock(&ci->i_ceph_lock); num_entries = ci->i_files + ci->i_subdirs; spin_unlock(&ci->i_ceph_lock); - 
num_entries = max(num_entries, 1); + num_entries = max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); order = get_order(size * num_entries); @@ -2673,8 +2676,10 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_parent) + if (req->r_parent) { ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + ihold(req->r_parent); + } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); @@ -4166,6 +4171,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); + INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 5cd131b41d84..14c7e8c49970 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -340,6 +340,14 @@ struct ceph_quotarealm_inode { struct inode *inode; }; +struct cap_wait { + struct list_head list; + unsigned long ino; + pid_t tgid; + int need; + int want; +}; + /* * mds client state */ @@ -416,6 +424,7 @@ struct ceph_mds_client { spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ + struct list_head cap_wait_list; int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index aeec1d6e3769..471bac335fae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -158,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; + bool laggy; ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); @@ -190,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (err) goto corrupt; ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -207,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p = info_end; } - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr), - ceph_mds_state_name(state)); + ceph_mds_state_name(state), + laggy ? 
"(laggy)" : ""); if (mds < 0 || state <= 0) continue; @@ -230,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->global_id = global_id; info->state = state; info->addr = addr; - info->laggy = (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); + info->laggy = laggy; info->num_export_targets = num_export_targets; if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, @@ -355,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_damaged = false; } bad_ext: + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9c9a7c68eea3..29a795f975df 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -172,10 +172,10 @@ static const struct fs_parameter_enum ceph_mount_param_enums[] = { static const struct fs_parameter_spec ceph_mount_param_specs[] = { fsparam_flag_no ("acl", Opt_acl), fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), - fsparam_u32 ("caps_max", Opt_caps_max), + fsparam_s32 ("caps_max", Opt_caps_max), fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), - fsparam_s32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_u32 ("write_congestion_kb", Opt_congestion_kb), fsparam_flag_no ("copyfrom", Opt_copyfrom), fsparam_flag_no ("dcache", Opt_dcache), fsparam_flag_no ("dirstat", Opt_dirstat), @@ -187,8 +187,8 @@ static const struct fs_parameter_spec ceph_mount_param_specs[] = { fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), fsparam_flag_no ("rbytes", Opt_rbytes), - fsparam_s32 ("readdir_max_bytes", Opt_readdir_max_bytes), - fsparam_s32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries), fsparam_enum ("recover_session", Opt_recover_session), fsparam_flag_no ("require_active_mds", Opt_require_active_mds), fsparam_u32 ("rsize", Opt_rsize), @@ -328,7 +328,9 @@ static int ceph_parse_mount_param(struct fs_context *fc, fsopt->caps_wanted_delay_max = result.uint_32; break; case Opt_caps_max: - fsopt->caps_max = result.uint_32; + if (result.int_32 < 0) + goto out_of_range; + fsopt->caps_max = result.int_32; break; case Opt_readdir_max_entries: if (result.uint_32 < 1) @@ -547,25 +549,25 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_show_option(m, "recover_session", "clean"); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) - seq_printf(m, ",wsize=%d", fsopt->wsize); + seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) - seq_printf(m, ",rsize=%d", fsopt->rsize); + seq_printf(m, ",rsize=%u", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) - seq_printf(m, ",rasize=%d", fsopt->rasize); + seq_printf(m, ",rasize=%u", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb); if (fsopt->caps_max) seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", + seq_printf(m, ",caps_wanted_delay_min=%u", fsopt->caps_wanted_delay_min); if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - 
seq_printf(m, ",caps_wanted_delay_max=%d", + seq_printf(m, ",caps_wanted_delay_max=%u", fsopt->caps_wanted_delay_max); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_show_option(m, "snapdirname", fsopt->snapdir_name); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f0f9cb7447ac..3bf1a01cd536 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -73,16 +73,16 @@ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ struct ceph_mount_options { - int flags; + unsigned int flags; - int wsize; /* max write size */ - int rsize; /* max read size */ - int rasize; /* max readahead */ - int congestion_kb; /* max writeback in flight */ - int caps_wanted_delay_min, caps_wanted_delay_max; + unsigned int wsize; /* max write size */ + unsigned int rsize; /* max read size */ + unsigned int rasize; /* max readahead */ + unsigned int congestion_kb; /* max writeback in flight */ + unsigned int caps_wanted_delay_min, caps_wanted_delay_max; int caps_max; - int max_readdir; /* max readdir result (entires) */ - int max_readdir_bytes; /* max readdir result (bytes) */ + unsigned int max_readdir; /* max readdir result (entries) */ + unsigned int max_readdir_bytes; /* max readdir result (bytes) */ /* * everything above this point can be memcmp'd; everything below diff --git a/fs/char_dev.c b/fs/char_dev.c index 00dfe17871ac..c5e6eff5a381 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -352,7 +352,7 @@ static struct kobject *cdev_get(struct cdev *p) if (owner && !try_module_get(owner)) return NULL; - kobj = kobject_get(&p->kobj); + kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 19f6e592b941..276e4b5ea8e0 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -611,12 +611,12 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file) return single_open(file, cifs_stats_proc_show, NULL); } -static const struct file_operations cifs_stats_proc_fops = { - .open = cifs_stats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_stats_proc_write, +static const struct proc_ops cifs_stats_proc_ops = { + .proc_open = cifs_stats_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_stats_proc_write, }; #ifdef CONFIG_CIFS_SMB_DIRECT @@ -640,12 +640,12 @@ static int name##_open(struct inode *inode, struct file *file) \ return single_open(file, name##_proc_show, NULL); \ } \ \ -static const struct file_operations cifs_##name##_proc_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - .write = name##_write, \ +static const struct proc_ops cifs_##name##_proc_fops = { \ + .proc_open = name##_open, \ + .proc_read = seq_read, \ + .proc_lseek = seq_lseek, \ + .proc_release = single_release, \ + .proc_write = name##_write, \ } PROC_FILE_DEFINE(rdma_readwrite_threshold); @@ -659,11 +659,11 @@ PROC_FILE_DEFINE(smbd_receive_credit_max); #endif static struct proc_dir_entry *proc_fs_cifs; -static const struct file_operations 
cifsFYI_proc_fops; -static const struct file_operations cifs_lookup_cache_proc_fops; -static const struct file_operations traceSMB_proc_fops; -static const struct file_operations cifs_security_flags_proc_fops; -static const struct file_operations cifs_linux_ext_proc_fops; +static const struct proc_ops cifsFYI_proc_ops; +static const struct proc_ops cifs_lookup_cache_proc_ops; +static const struct proc_ops traceSMB_proc_ops; +static const struct proc_ops cifs_security_flags_proc_ops; +static const struct proc_ops cifs_linux_ext_proc_ops; void cifs_proc_init(void) @@ -678,18 +678,18 @@ cifs_proc_init(void) proc_create_single("open_files", 0400, proc_fs_cifs, cifs_debug_files_proc_show); - proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops); - proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops); - proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops); + proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops); + proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops); proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs, - &cifs_linux_ext_proc_fops); + &cifs_linux_ext_proc_ops); proc_create("SecurityFlags", 0644, proc_fs_cifs, - &cifs_security_flags_proc_fops); + &cifs_security_flags_proc_ops); proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, - &cifs_lookup_cache_proc_fops); + &cifs_lookup_cache_proc_ops); #ifdef CONFIG_CIFS_DFS_UPCALL - proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops); + proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif #ifdef CONFIG_CIFS_SMB_DIRECT @@ -774,12 +774,12 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, return count; } -static const struct file_operations cifsFYI_proc_fops = { - .open = cifsFYI_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifsFYI_proc_write, +static const struct proc_ops cifsFYI_proc_ops = { + .proc_open = cifsFYI_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifsFYI_proc_write, }; static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) @@ -805,12 +805,12 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file, return count; } -static const struct file_operations cifs_linux_ext_proc_fops = { - .open = cifs_linux_ext_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_linux_ext_proc_write, +static const struct proc_ops cifs_linux_ext_proc_ops = { + .proc_open = cifs_linux_ext_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_linux_ext_proc_write, }; static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) @@ -836,12 +836,12 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file, return count; } -static const struct file_operations cifs_lookup_cache_proc_fops = { - .open = cifs_lookup_cache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_lookup_cache_proc_write, +static const struct proc_ops cifs_lookup_cache_proc_ops = { + .proc_open = cifs_lookup_cache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_lookup_cache_proc_write, }; static int traceSMB_proc_show(struct seq_file *m, void *v) @@ -867,12 +867,12 @@ static ssize_t traceSMB_proc_write(struct file *file, const char 
__user *buffer, return count; } -static const struct file_operations traceSMB_proc_fops = { - .open = traceSMB_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = traceSMB_proc_write, +static const struct proc_ops traceSMB_proc_ops = { + .proc_open = traceSMB_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = traceSMB_proc_write, }; static int cifs_security_flags_proc_show(struct seq_file *m, void *v) @@ -978,12 +978,12 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, return count; } -static const struct file_operations cifs_security_flags_proc_fops = { - .open = cifs_security_flags_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_security_flags_proc_write, +static const struct proc_ops cifs_security_flags_proc_ops = { + .proc_open = cifs_security_flags_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_security_flags_proc_write, }; #else inline void cifs_proc_init(void) diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 41957b82d796..606f26d862dc 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -120,17 +120,17 @@ cifs_build_devname(char *nodename, const char *prepath) /** - * cifs_compose_mount_options - creates mount options for refferral + * cifs_compose_mount_options - creates mount options for referral * @sb_mountdata: parent/root DFS mount options (template) * @fullpath: full path in UNC format - * @ref: server's referral + * @ref: optional server's referral * @devname: optional pointer for saving device name * * creates mount options for submount based on template options sb_mountdata * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. * * Returns: pointer to new mount options or ERR_PTR. - * Caller is responcible for freeing retunrned value if it is not error. + * Caller is responsible for freeing returned value if it is not error. 
*/ char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, @@ -150,18 +150,27 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (sb_mountdata == NULL) return ERR_PTR(-EINVAL); - if (strlen(fullpath) - ref->path_consumed) { - prepath = fullpath + ref->path_consumed; - /* skip initial delimiter */ - if (*prepath == '/' || *prepath == '\\') - prepath++; - } + if (ref) { + if (strlen(fullpath) - ref->path_consumed) { + prepath = fullpath + ref->path_consumed; + /* skip initial delimiter */ + if (*prepath == '/' || *prepath == '\\') + prepath++; + } - name = cifs_build_devname(ref->node_name, prepath); - if (IS_ERR(name)) { - rc = PTR_ERR(name); - name = NULL; - goto compose_mount_options_err; + name = cifs_build_devname(ref->node_name, prepath); + if (IS_ERR(name)) { + rc = PTR_ERR(name); + name = NULL; + goto compose_mount_options_err; + } + } else { + name = cifs_build_devname((char *)fullpath, NULL); + if (IS_ERR(name)) { + rc = PTR_ERR(name); + name = NULL; + goto compose_mount_options_err; + } } rc = dns_resolve_server_name_to_ip(name, &srvIP); @@ -225,6 +234,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (devname) *devname = name; + else + kfree(name); /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ @@ -241,23 +252,23 @@ compose_mount_options_err: } /** - * cifs_dfs_do_refmount - mounts specified path using provided refferal + * cifs_dfs_do_mount - mounts specified path using DFS full path + * + * Always pass down @fullpath to smb3_do_mount() so we can use the root server + * to perform failover in case we failed to connect to the first target in the + * referral. + * * @cifs_sb: parent/root superblock * @fullpath: full path in UNC format - * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, - struct cifs_sb_info *cifs_sb, - const char *fullpath, const struct dfs_info3_param *ref) +static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, + const char *fullpath) { struct vfsmount *mnt; char *mountdata; char *devname; - /* - * Always pass down the DFS full path to smb3_do_mount() so we - * can use it later for failover. 
- */ devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); if (!devname) return ERR_PTR(-ENOMEM); @@ -266,7 +277,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, /* strip first '\' from fullpath */ mountdata = cifs_compose_mount_options(cifs_sb->mountdata, - fullpath + 1, ref, NULL); + fullpath + 1, NULL, NULL); if (IS_ERR(mountdata)) { kfree(devname); return (struct vfsmount *)mountdata; @@ -278,28 +289,16 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, return mnt; } -static void dump_referral(const struct dfs_info3_param *ref) -{ - cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); - cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); - cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n", - ref->flags, ref->server_type); - cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n", - ref->ref_flag, ref->path_consumed); -} - /* * Create a vfsmount that we can automount */ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) { - struct dfs_info3_param referral = {0}; struct cifs_sb_info *cifs_sb; struct cifs_ses *ses; struct cifs_tcon *tcon; char *full_path, *root_path; unsigned int xid; - int len; int rc; struct vfsmount *mnt; @@ -357,7 +356,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (!rc) { rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), full_path + 1, - &referral, NULL); + NULL, NULL); } free_xid(xid); @@ -366,26 +365,16 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(rc); goto free_root_path; } - - dump_referral(&referral); - - len = strlen(referral.node_name); - if (len < 2) { - cifs_dbg(VFS, "%s: Net Address path too short: %s\n", - __func__, referral.node_name); - mnt = ERR_PTR(-EINVAL); - goto free_dfs_ref; - } /* - * cifs_mount() will retry every available node server in case - * of failures. + * OK - we were able to get and cache a referral for @full_path. + * + * Now, pass it down to cifs_mount() and it will retry every available + * node server in case of failures - no need to do it here. */ - mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral); - cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, - referral.node_name, mnt); + mnt = cifs_dfs_do_mount(mntpt, cifs_sb, full_path); + cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__, + full_path + 1, mnt); -free_dfs_ref: - free_dfs_info_param(&referral); free_root_path: kfree(root_path); free_full_path: diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 96ae72b556ac..fb41e51dd574 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -802,6 +802,26 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, return; } +unsigned int setup_authusers_ACE(struct cifs_ace *pntace) +{ + int i; + unsigned int ace_size = 20; + + pntace->type = ACCESS_ALLOWED_ACE_TYPE; + pntace->flags = 0x0; + pntace->access_req = cpu_to_le32(GENERIC_ALL); + pntace->sid.num_subauth = 1; + pntace->sid.revision = 1; + for (i = 0; i < NUM_AUTHS; i++) + pntace->sid.authority[i] = sid_authusers.authority[i]; + + pntace->sid.sub_auth[0] = sid_authusers.sub_auth[0]; + + /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ + pntace->size = cpu_to_le16(ace_size); + return ace_size; +} + /* * Fill in the special SID based on the mode. 
See * http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b59dc7478130..b87456bae1a1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,9 +149,12 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid, size_t len, unsigned int flags); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); +extern void cifs_setsize(struct inode *inode, loff_t offset); +extern int cifs_truncate_page(struct address_space *mapping, loff_t from); + #ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.24" +#define CIFS_VERSION "2.25" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index fd0262ce5ad5..239338d57086 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1061,7 +1061,7 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; - + bool has_lease:1; struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; @@ -1588,6 +1588,7 @@ struct mid_q_entry { mid_callback_t *callback; /* call completion callback */ mid_handle_t *handle; /* call handle mid callback */ void *callback_data; /* general purpose pointer for callback */ + struct task_struct *creator; void *resp_buf; /* pointer to received SMB header */ unsigned int resp_buf_size; int mid_state; /* wish this were enum but can not pass to wait_event */ @@ -1693,6 +1694,7 @@ struct cifs_fattr { struct timespec64 cf_atime; struct timespec64 cf_mtime; struct timespec64 cf_ctime; + u32 cf_cifstag; }; static inline void free_dfs_info_param(struct dfs_info3_param *param) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 9c229408a251..948bf3474db1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -213,6 +213,7 @@ extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); +extern unsigned int setup_authusers_ACE(struct cifs_ace *pace); extern unsigned int setup_special_mode_ACE(struct cifs_ace *pace, __u64 nmode); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); @@ -596,6 +597,9 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); +int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov, + int resp_buftype, + struct cifs_search_info *srch_inf); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4f554f019a98..a481296f417f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -42,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "smb2proto.h" #include "fscache.h" #include "smbdirect.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -112,6 +113,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; + /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ + close_shroot_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); @@ -4616,7 +4619,7 @@ findFirstRetry: psrch_inf->unicode = false; psrch_inf->ntwrk_buf_start = 
(char *)pSMBr; - psrch_inf->smallBuf = 0; + psrch_inf->smallBuf = false; psrch_inf->srch_entries_start = (char *) &pSMBr->hdr.Protocol + le16_to_cpu(pSMBr->t2.DataOffset); @@ -4750,7 +4753,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, cifs_buf_release(psrch_inf->ntwrk_buf_start); psrch_inf->srch_entries_start = response_data; psrch_inf->ntwrk_buf_start = (char *)pSMB; - psrch_inf->smallBuf = 0; + psrch_inf->smallBuf = false; if (parms->EndofSearch) psrch_inf->endOfSearch = true; else diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 05ea0e2b7e0e..0aa3623ae0e1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3709,8 +3709,10 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; - bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; - bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && + old->prepath; + bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && + new->prepath; if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 2faa05860a48..43c1b43a07ec 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -5,11 +5,10 @@ * Copyright (c) 2018-2019 Paulo Alcantara <palcantara@suse.de> */ -#include <linux/rcupdate.h> -#include <linux/rculist.h> #include <linux/jhash.h> #include <linux/ktime.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #include <linux/nls.h> #include <linux/workqueue.h> #include "cifsglob.h" @@ -22,67 +21,68 @@ #include "dfs_cache.h" -#define DFS_CACHE_HTABLE_SIZE 32 -#define DFS_CACHE_MAX_ENTRIES 64 +#define CACHE_HTABLE_SIZE 32 +#define CACHE_MAX_ENTRIES 64 #define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \ DFSREF_STORAGE_SERVER)) -struct dfs_cache_tgt { - char *t_name; - struct list_head t_list; +struct cache_dfs_tgt { + char *name; + struct list_head list; }; -struct dfs_cache_entry { - struct hlist_node ce_hlist; - const char *ce_path; - int ce_ttl; - int ce_srvtype; - int ce_flags; - struct timespec64 ce_etime; - int ce_path_consumed; - int ce_numtgts; - struct list_head ce_tlist; - struct dfs_cache_tgt *ce_tgthint; - struct rcu_head ce_rcu; +struct cache_entry { + struct hlist_node hlist; + const char *path; + int ttl; + int srvtype; + int flags; + struct timespec64 etime; + int path_consumed; + int numtgts; + struct list_head tlist; + struct cache_dfs_tgt *tgthint; }; -static struct kmem_cache *dfs_cache_slab __read_mostly; - -struct dfs_cache_vol_info { - char *vi_fullpath; - struct smb_vol vi_vol; - char *vi_mntdata; - struct list_head vi_list; +struct vol_info { + char *fullpath; + spinlock_t smb_vol_lock; + struct smb_vol smb_vol; + char *mntdata; + struct list_head list; + struct list_head rlist; + struct kref refcnt; }; -struct dfs_cache { - struct mutex dc_lock; - struct nls_table *dc_nlsc; - struct list_head dc_vol_list; - int dc_ttl; - struct delayed_work dc_refresh; -}; +static struct kmem_cache *cache_slab __read_mostly; +static struct workqueue_struct *dfscache_wq __read_mostly; -static struct dfs_cache dfs_cache; +static int cache_ttl; +static DEFINE_SPINLOCK(cache_ttl_lock); + +static struct nls_table *cache_nlsc; /* * Number of entries in the cache */ -static size_t dfs_cache_count; +static atomic_t cache_count; + +static struct hlist_head cache_htable[CACHE_HTABLE_SIZE]; +static 
DECLARE_RWSEM(htable_rw_lock); -static DEFINE_MUTEX(dfs_cache_list_lock); -static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE]; +static LIST_HEAD(vol_list); +static DEFINE_SPINLOCK(vol_list_lock); static void refresh_cache_worker(struct work_struct *work); -static inline bool is_path_valid(const char *path) -{ - return path && (strchr(path + 1, '\\') || strchr(path + 1, '/')); -} +static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); -static inline int get_normalized_path(const char *path, char **npath) +static int get_normalized_path(const char *path, char **npath) { + if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) + return -EINVAL; + if (*path == '\\') { *npath = (char *)path; } else { @@ -100,57 +100,48 @@ static inline void free_normalized_path(const char *path, char *npath) kfree(npath); } -static inline bool cache_entry_expired(const struct dfs_cache_entry *ce) +static inline bool cache_entry_expired(const struct cache_entry *ce) { struct timespec64 ts; ktime_get_coarse_real_ts64(&ts); - return timespec64_compare(&ts, &ce->ce_etime) >= 0; + return timespec64_compare(&ts, &ce->etime) >= 0; } -static inline void free_tgts(struct dfs_cache_entry *ce) +static inline void free_tgts(struct cache_entry *ce) { - struct dfs_cache_tgt *t, *n; + struct cache_dfs_tgt *t, *n; - list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) { - list_del(&t->t_list); - kfree(t->t_name); + list_for_each_entry_safe(t, n, &ce->tlist, list) { + list_del(&t->list); + kfree(t->name); kfree(t); } } -static void free_cache_entry(struct rcu_head *rcu) +static inline void flush_cache_ent(struct cache_entry *ce) { - struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry, - ce_rcu); - kmem_cache_free(dfs_cache_slab, ce); -} - -static inline void flush_cache_ent(struct dfs_cache_entry *ce) -{ - if (hlist_unhashed(&ce->ce_hlist)) - return; - - hlist_del_init_rcu(&ce->ce_hlist); - kfree_const(ce->ce_path); + hlist_del_init(&ce->hlist); + kfree(ce->path); free_tgts(ce); - dfs_cache_count--; - call_rcu(&ce->ce_rcu, free_cache_entry); + atomic_dec(&cache_count); + kmem_cache_free(cache_slab, ce); } static void flush_cache_ents(void) { int i; - rcu_read_lock(); - for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) { - struct hlist_head *l = &dfs_cache_htable[i]; - struct dfs_cache_entry *ce; + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; + struct hlist_node *n; + struct cache_entry *ce; - hlist_for_each_entry_rcu(ce, l, ce_hlist) - flush_cache_ent(ce); + hlist_for_each_entry_safe(ce, n, l, hlist) { + if (!hlist_unhashed(&ce->hlist)) + flush_cache_ent(ce); + } } - rcu_read_unlock(); } /* @@ -158,36 +149,39 @@ static void flush_cache_ents(void) */ static int dfscache_proc_show(struct seq_file *m, void *v) { - int bucket; - struct dfs_cache_entry *ce; - struct dfs_cache_tgt *t; + int i; + struct cache_entry *ce; + struct cache_dfs_tgt *t; seq_puts(m, "DFS cache\n---------\n"); - mutex_lock(&dfs_cache_list_lock); - - rcu_read_lock(); - hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) { - seq_printf(m, - "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", - ce->ce_path, - ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", - ce->ce_ttl, ce->ce_etime.tv_nsec, - IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no", - ce->ce_path_consumed, - cache_entry_expired(ce) ? "yes" : "no"); - - list_for_each_entry(t, &ce->ce_tlist, t_list) { - seq_printf(m, " %s%s\n", - t->t_name, - ce->ce_tgthint == t ? 
" (target hint)" : ""); + down_read(&htable_rw_lock); + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; + + hlist_for_each_entry(ce, l, hlist) { + if (hlist_unhashed(&ce->hlist)) + continue; + + seq_printf(m, + "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," + "interlink=%s,path_consumed=%d,expired=%s\n", + ce->path, + ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", + ce->ttl, ce->etime.tv_nsec, + IS_INTERLINK_SET(ce->flags) ? "yes" : "no", + ce->path_consumed, + cache_entry_expired(ce) ? "yes" : "no"); + + list_for_each_entry(t, &ce->tlist, list) { + seq_printf(m, " %s%s\n", + t->name, + ce->tgthint == t ? " (target hint)" : ""); + } } - } - rcu_read_unlock(); + up_read(&htable_rw_lock); - mutex_unlock(&dfs_cache_list_lock); return 0; } @@ -205,9 +199,10 @@ static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer, return -EINVAL; cifs_dbg(FYI, "clearing dfs cache"); - mutex_lock(&dfs_cache_list_lock); + + down_write(&htable_rw_lock); flush_cache_ents(); - mutex_unlock(&dfs_cache_list_lock); + up_write(&htable_rw_lock); return count; } @@ -217,34 +212,34 @@ static int dfscache_proc_open(struct inode *inode, struct file *file) return single_open(file, dfscache_proc_show, NULL); } -const struct file_operations dfscache_proc_fops = { - .open = dfscache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = dfscache_proc_write, +const struct proc_ops dfscache_proc_ops = { + .proc_open = dfscache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = dfscache_proc_write, }; #ifdef CONFIG_CIFS_DEBUG2 -static inline void dump_tgts(const struct dfs_cache_entry *ce) +static inline void dump_tgts(const struct cache_entry *ce) { - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; cifs_dbg(FYI, "target list:\n"); - list_for_each_entry(t, &ce->ce_tlist, t_list) { - cifs_dbg(FYI, " %s%s\n", t->t_name, - ce->ce_tgthint == t ? " (target hint)" : ""); + list_for_each_entry(t, &ce->tlist, list) { + cifs_dbg(FYI, " %s%s\n", t->name, + ce->tgthint == t ? " (target hint)" : ""); } } -static inline void dump_ce(const struct dfs_cache_entry *ce) +static inline void dump_ce(const struct cache_entry *ce) { cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path, - ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl, - ce->ce_etime.tv_nsec, - IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no", - ce->ce_path_consumed, + "interlink=%s,path_consumed=%d,expired=%s\n", ce->path, + ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, + ce->etime.tv_nsec, + IS_INTERLINK_SET(ce->flags) ? "yes" : "no", + ce->path_consumed, cache_entry_expired(ce) ? 
"yes" : "no"); dump_tgts(ce); } @@ -284,25 +279,34 @@ static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs) */ int dfs_cache_init(void) { + int rc; int i; - dfs_cache_slab = kmem_cache_create("cifs_dfs_cache", - sizeof(struct dfs_cache_entry), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!dfs_cache_slab) + dfscache_wq = alloc_workqueue("cifs-dfscache", + WQ_FREEZABLE | WQ_MEM_RECLAIM, 1); + if (!dfscache_wq) return -ENOMEM; - for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) - INIT_HLIST_HEAD(&dfs_cache_htable[i]); + cache_slab = kmem_cache_create("cifs_dfs_cache", + sizeof(struct cache_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!cache_slab) { + rc = -ENOMEM; + goto out_destroy_wq; + } + + for (i = 0; i < CACHE_HTABLE_SIZE; i++) + INIT_HLIST_HEAD(&cache_htable[i]); - INIT_LIST_HEAD(&dfs_cache.dc_vol_list); - mutex_init(&dfs_cache.dc_lock); - INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker); - dfs_cache.dc_ttl = -1; - dfs_cache.dc_nlsc = load_nls_default(); + atomic_set(&cache_count, 0); + cache_nlsc = load_nls_default(); cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__); return 0; + +out_destroy_wq: + destroy_workqueue(dfscache_wq); + return rc; } static inline unsigned int cache_entry_hash(const void *data, int size) @@ -310,7 +314,7 @@ static inline unsigned int cache_entry_hash(const void *data, int size) unsigned int h; h = jhash(data, size, 0); - return h & (DFS_CACHE_HTABLE_SIZE - 1); + return h & (CACHE_HTABLE_SIZE - 1); } /* Check whether second path component of @path is SYSVOL or NETLOGON */ @@ -325,11 +329,11 @@ static inline bool is_sysvol_or_netlogon(const char *path) } /* Return target hint of a DFS cache entry */ -static inline char *get_tgt_name(const struct dfs_cache_entry *ce) +static inline char *get_tgt_name(const struct cache_entry *ce) { - struct dfs_cache_tgt *t = ce->ce_tgthint; + struct cache_dfs_tgt *t = ce->tgthint; - return t ? t->t_name : ERR_PTR(-ENOENT); + return t ? t->name : ERR_PTR(-ENOENT); } /* Return expire time out of a new entry's TTL */ @@ -346,19 +350,19 @@ static inline struct timespec64 get_expire_time(int ttl) } /* Allocate a new DFS target */ -static inline struct dfs_cache_tgt *alloc_tgt(const char *name) +static struct cache_dfs_tgt *alloc_target(const char *name) { - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; - t = kmalloc(sizeof(*t), GFP_KERNEL); + t = kmalloc(sizeof(*t), GFP_ATOMIC); if (!t) return ERR_PTR(-ENOMEM); - t->t_name = kstrndup(name, strlen(name), GFP_KERNEL); - if (!t->t_name) { + t->name = kstrndup(name, strlen(name), GFP_ATOMIC); + if (!t->name) { kfree(t); return ERR_PTR(-ENOMEM); } - INIT_LIST_HEAD(&t->t_list); + INIT_LIST_HEAD(&t->list); return t; } @@ -367,180 +371,184 @@ static inline struct dfs_cache_tgt *alloc_tgt(const char *name) * target hint. 
*/ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, - struct dfs_cache_entry *ce, const char *tgthint) + struct cache_entry *ce, const char *tgthint) { int i; - ce->ce_ttl = refs[0].ttl; - ce->ce_etime = get_expire_time(ce->ce_ttl); - ce->ce_srvtype = refs[0].server_type; - ce->ce_flags = refs[0].ref_flag; - ce->ce_path_consumed = refs[0].path_consumed; + ce->ttl = refs[0].ttl; + ce->etime = get_expire_time(ce->ttl); + ce->srvtype = refs[0].server_type; + ce->flags = refs[0].ref_flag; + ce->path_consumed = refs[0].path_consumed; for (i = 0; i < numrefs; i++) { - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; - t = alloc_tgt(refs[i].node_name); + t = alloc_target(refs[i].node_name); if (IS_ERR(t)) { free_tgts(ce); return PTR_ERR(t); } - if (tgthint && !strcasecmp(t->t_name, tgthint)) { - list_add(&t->t_list, &ce->ce_tlist); + if (tgthint && !strcasecmp(t->name, tgthint)) { + list_add(&t->list, &ce->tlist); tgthint = NULL; } else { - list_add_tail(&t->t_list, &ce->ce_tlist); + list_add_tail(&t->list, &ce->tlist); } - ce->ce_numtgts++; + ce->numtgts++; } - ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist, - struct dfs_cache_tgt, t_list); + ce->tgthint = list_first_entry_or_null(&ce->tlist, + struct cache_dfs_tgt, list); return 0; } /* Allocate a new cache entry */ -static struct dfs_cache_entry * -alloc_cache_entry(const char *path, const struct dfs_info3_param *refs, - int numrefs) +static struct cache_entry *alloc_cache_entry(const char *path, + const struct dfs_info3_param *refs, + int numrefs) { - struct dfs_cache_entry *ce; + struct cache_entry *ce; int rc; - ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL); + ce = kmem_cache_zalloc(cache_slab, GFP_KERNEL); if (!ce) return ERR_PTR(-ENOMEM); - ce->ce_path = kstrdup_const(path, GFP_KERNEL); - if (!ce->ce_path) { - kmem_cache_free(dfs_cache_slab, ce); + ce->path = kstrndup(path, strlen(path), GFP_KERNEL); + if (!ce->path) { + kmem_cache_free(cache_slab, ce); return ERR_PTR(-ENOMEM); } - INIT_HLIST_NODE(&ce->ce_hlist); - INIT_LIST_HEAD(&ce->ce_tlist); + INIT_HLIST_NODE(&ce->hlist); + INIT_LIST_HEAD(&ce->tlist); rc = copy_ref_data(refs, numrefs, ce, NULL); if (rc) { - kfree_const(ce->ce_path); - kmem_cache_free(dfs_cache_slab, ce); + kfree(ce->path); + kmem_cache_free(cache_slab, ce); ce = ERR_PTR(rc); } return ce; } +/* Must be called with htable_rw_lock held */ static void remove_oldest_entry(void) { - int bucket; - struct dfs_cache_entry *ce; - struct dfs_cache_entry *to_del = NULL; - - rcu_read_lock(); - hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) { - if (!to_del || timespec64_compare(&ce->ce_etime, - &to_del->ce_etime) < 0) - to_del = ce; + int i; + struct cache_entry *ce; + struct cache_entry *to_del = NULL; + + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; + + hlist_for_each_entry(ce, l, hlist) { + if (hlist_unhashed(&ce->hlist)) + continue; + if (!to_del || timespec64_compare(&ce->etime, + &to_del->etime) < 0) + to_del = ce; + } } + if (!to_del) { cifs_dbg(FYI, "%s: no entry to remove", __func__); - goto out; + return; } + cifs_dbg(FYI, "%s: removing entry", __func__); dump_ce(to_del); flush_cache_ent(to_del); -out: - rcu_read_unlock(); } /* Add a new DFS cache entry */ -static inline struct dfs_cache_entry * -add_cache_entry(unsigned int hash, const char *path, - const struct dfs_info3_param *refs, int numrefs) +static int add_cache_entry(const char *path, unsigned int hash, + struct dfs_info3_param *refs, int numrefs) { - struct 
dfs_cache_entry *ce; + struct cache_entry *ce; ce = alloc_cache_entry(path, refs, numrefs); if (IS_ERR(ce)) - return ce; + return PTR_ERR(ce); - hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]); - - mutex_lock(&dfs_cache.dc_lock); - if (dfs_cache.dc_ttl < 0) { - dfs_cache.dc_ttl = ce->ce_ttl; - queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh, - dfs_cache.dc_ttl * HZ); + spin_lock(&cache_ttl_lock); + if (!cache_ttl) { + cache_ttl = ce->ttl; + queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); } else { - dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl); - mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh, - dfs_cache.dc_ttl * HZ); + cache_ttl = min_t(int, cache_ttl, ce->ttl); + mod_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); } - mutex_unlock(&dfs_cache.dc_lock); + spin_unlock(&cache_ttl_lock); - return ce; + down_write(&htable_rw_lock); + hlist_add_head(&ce->hlist, &cache_htable[hash]); + dump_ce(ce); + up_write(&htable_rw_lock); + + return 0; } -static struct dfs_cache_entry *__find_cache_entry(unsigned int hash, - const char *path) +/* + * Find a DFS cache entry in hash table and optionally check prefix path against + * @path. + * Use whole path components in the match. + * Must be called with htable_rw_lock held. + * + * Return ERR_PTR(-ENOENT) if the entry is not found. + */ +static struct cache_entry *lookup_cache_entry(const char *path, + unsigned int *hash) { - struct dfs_cache_entry *ce; + struct cache_entry *ce; + unsigned int h; bool found = false; - rcu_read_lock(); - hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) { - if (!strcasecmp(path, ce->ce_path)) { -#ifdef CONFIG_CIFS_DEBUG2 - char *name = get_tgt_name(ce); + h = cache_entry_hash(path, strlen(path)); - if (IS_ERR(name)) { - rcu_read_unlock(); - return ERR_CAST(name); - } - cifs_dbg(FYI, "%s: cache hit\n", __func__); - cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name); -#endif + hlist_for_each_entry(ce, &cache_htable[h], hlist) { + if (!strcasecmp(path, ce->path)) { found = true; + dump_ce(ce); break; } } - rcu_read_unlock(); - return found ? ce : ERR_PTR(-ENOENT); -} -/* - * Find a DFS cache entry in hash table and optionally check prefix path against - * @path. - * Use whole path components in the match. - * Return ERR_PTR(-ENOENT) if the entry is not found. 
- */ -static inline struct dfs_cache_entry *find_cache_entry(const char *path, - unsigned int *hash) -{ - *hash = cache_entry_hash(path, strlen(path)); - return __find_cache_entry(*hash, path); + if (!found) + ce = ERR_PTR(-ENOENT); + if (hash) + *hash = h; + + return ce; } -static inline void destroy_slab_cache(void) +static void __vol_release(struct vol_info *vi) { - rcu_barrier(); - kmem_cache_destroy(dfs_cache_slab); + kfree(vi->fullpath); + kfree(vi->mntdata); + cifs_cleanup_volume_info_contents(&vi->smb_vol); + kfree(vi); } -static inline void free_vol(struct dfs_cache_vol_info *vi) +static void vol_release(struct kref *kref) { - list_del(&vi->vi_list); - kfree(vi->vi_fullpath); - kfree(vi->vi_mntdata); - cifs_cleanup_volume_info_contents(&vi->vi_vol); - kfree(vi); + struct vol_info *vi = container_of(kref, struct vol_info, refcnt); + + spin_lock(&vol_list_lock); + list_del(&vi->list); + spin_unlock(&vol_list_lock); + __vol_release(vi); } static inline void free_vol_list(void) { - struct dfs_cache_vol_info *vi, *nvi; + struct vol_info *vi, *nvi; - list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list) - free_vol(vi); + list_for_each_entry_safe(vi, nvi, &vol_list, list) { + list_del_init(&vi->list); + __vol_release(vi); + } } /** @@ -548,83 +556,78 @@ static inline void free_vol_list(void) */ void dfs_cache_destroy(void) { - cancel_delayed_work_sync(&dfs_cache.dc_refresh); - unload_nls(dfs_cache.dc_nlsc); + cancel_delayed_work_sync(&refresh_task); + unload_nls(cache_nlsc); free_vol_list(); - mutex_destroy(&dfs_cache.dc_lock); - flush_cache_ents(); - destroy_slab_cache(); - mutex_destroy(&dfs_cache_list_lock); + kmem_cache_destroy(cache_slab); + destroy_workqueue(dfscache_wq); cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__); } -static inline struct dfs_cache_entry * -__update_cache_entry(const char *path, const struct dfs_info3_param *refs, - int numrefs) +/* Must be called with htable_rw_lock held */ +static int __update_cache_entry(const char *path, + const struct dfs_info3_param *refs, + int numrefs) { int rc; - unsigned int h; - struct dfs_cache_entry *ce; + struct cache_entry *ce; char *s, *th = NULL; - ce = find_cache_entry(path, &h); + ce = lookup_cache_entry(path, NULL); if (IS_ERR(ce)) - return ce; + return PTR_ERR(ce); - if (ce->ce_tgthint) { - s = ce->ce_tgthint->t_name; - th = kstrndup(s, strlen(s), GFP_KERNEL); + if (ce->tgthint) { + s = ce->tgthint->name; + th = kstrndup(s, strlen(s), GFP_ATOMIC); if (!th) - return ERR_PTR(-ENOMEM); + return -ENOMEM; } free_tgts(ce); - ce->ce_numtgts = 0; + ce->numtgts = 0; rc = copy_ref_data(refs, numrefs, ce, th); - kfree(th); - if (rc) - ce = ERR_PTR(rc); + kfree(th); - return ce; + return rc; } -/* Update an expired cache entry by getting a new DFS referral from server */ -static struct dfs_cache_entry * -update_cache_entry(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_cache_entry *ce) +static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, struct dfs_info3_param **refs, + int *numrefs) { - int rc; - struct dfs_info3_param *refs = NULL; - int numrefs = 0; + cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path); - cifs_dbg(FYI, "%s: update expired cache entry\n", __func__); - /* - * Check if caller provided enough parameters to update an expired - * entry. 
- */ if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) - return ERR_PTR(-ETIME); + return -EOPNOTSUPP; if (unlikely(!nls_codepage)) - return ERR_PTR(-ETIME); + return -EINVAL; - cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, path); + *refs = NULL; + *numrefs = 0; - rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, &numrefs, - nls_codepage, remap); - if (rc) - ce = ERR_PTR(rc); - else - ce = __update_cache_entry(path, refs, numrefs); + return ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, + nls_codepage, remap); +} - dump_refs(refs, numrefs); - free_dfs_info_array(refs, numrefs); +/* Update an expired cache entry by getting a new DFS referral from server */ +static int update_cache_entry(const char *path, + const struct dfs_info3_param *refs, + int numrefs) +{ - return ce; + int rc; + + down_write(&htable_rw_lock); + rc = __update_cache_entry(path, refs, numrefs); + up_write(&htable_rw_lock); + + return rc; } /* @@ -636,95 +639,86 @@ update_cache_entry(const unsigned int xid, struct cifs_ses *ses, * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to * handle them properly. */ -static struct dfs_cache_entry * -do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, bool noreq) +static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, bool noreq) { int rc; - unsigned int h; - struct dfs_cache_entry *ce; - struct dfs_info3_param *nrefs; - int numnrefs; + unsigned int hash; + struct cache_entry *ce; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + bool newent = false; cifs_dbg(FYI, "%s: search path: %s\n", __func__, path); - ce = find_cache_entry(path, &h); - if (IS_ERR(ce)) { - cifs_dbg(FYI, "%s: cache miss\n", __func__); - /* - * If @noreq is set, no requests will be sent to the server for - * either updating or getting a new DFS referral. - */ - if (noreq) - return ce; - /* - * No cache entry was found, so check for valid parameters that - * will be required to get a new DFS referral and then create a - * new cache entry. - */ - if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) { - ce = ERR_PTR(-EOPNOTSUPP); - return ce; - } - if (unlikely(!nls_codepage)) { - ce = ERR_PTR(-EINVAL); - return ce; - } + down_read(&htable_rw_lock); - nrefs = NULL; - numnrefs = 0; + ce = lookup_cache_entry(path, &hash); - cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, - path); + /* + * If @noreq is set, no requests will be sent to the server. Just return + * the cache entry. + */ + if (noreq) { + up_read(&htable_rw_lock); + return PTR_ERR_OR_ZERO(ce); + } - rc = ses->server->ops->get_dfs_refer(xid, ses, path, &nrefs, - &numnrefs, nls_codepage, - remap); - if (rc) { - ce = ERR_PTR(rc); - return ce; + if (!IS_ERR(ce)) { + if (!cache_entry_expired(ce)) { + dump_ce(ce); + up_read(&htable_rw_lock); + return 0; } + } else { + newent = true; + } - dump_refs(nrefs, numnrefs); + up_read(&htable_rw_lock); - cifs_dbg(FYI, "%s: new cache entry\n", __func__); + /* + * No entry was found. + * + * Request a new DFS referral in order to create a new cache entry, or + * updating an existing one. 
+ */ + rc = get_dfs_referral(xid, ses, nls_codepage, remap, path, + &refs, &numrefs); + if (rc) + return rc; - if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) { - cifs_dbg(FYI, "%s: reached max cache size (%d)", - __func__, DFS_CACHE_MAX_ENTRIES); - remove_oldest_entry(); - } - ce = add_cache_entry(h, path, nrefs, numnrefs); - free_dfs_info_array(nrefs, numnrefs); + dump_refs(refs, numrefs); - if (IS_ERR(ce)) - return ce; + if (!newent) { + rc = update_cache_entry(path, refs, numrefs); + goto out_free_refs; + } - dfs_cache_count++; + if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) { + cifs_dbg(FYI, "%s: reached max cache size (%d)", __func__, + CACHE_MAX_ENTRIES); + down_write(&htable_rw_lock); + remove_oldest_entry(); + up_write(&htable_rw_lock); } - dump_ce(ce); + rc = add_cache_entry(path, hash, refs, numrefs); + if (!rc) + atomic_inc(&cache_count); - /* Just return the found cache entry in case @noreq is set */ - if (noreq) - return ce; - - if (cache_entry_expired(ce)) { - cifs_dbg(FYI, "%s: expired cache entry\n", __func__); - ce = update_cache_entry(xid, ses, nls_codepage, remap, path, - ce); - if (IS_ERR(ce)) { - cifs_dbg(FYI, "%s: failed to update expired entry\n", - __func__); - } - } - return ce; +out_free_refs: + free_dfs_info_array(refs, numrefs); + return rc; } -/* Set up a new DFS referral from a given cache entry */ -static int setup_ref(const char *path, const struct dfs_cache_entry *ce, - struct dfs_info3_param *ref, const char *tgt) +/* + * Set up a DFS referral from a given cache entry. + * + * Must be called with htable_rw_lock held. + */ +static int setup_referral(const char *path, struct cache_entry *ce, + struct dfs_info3_param *ref, const char *target) { int rc; @@ -732,21 +726,20 @@ static int setup_ref(const char *path, const struct dfs_cache_entry *ce, memset(ref, 0, sizeof(*ref)); - ref->path_name = kstrndup(path, strlen(path), GFP_KERNEL); + ref->path_name = kstrndup(path, strlen(path), GFP_ATOMIC); if (!ref->path_name) return -ENOMEM; - ref->path_consumed = ce->ce_path_consumed; - - ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL); + ref->node_name = kstrndup(target, strlen(target), GFP_ATOMIC); if (!ref->node_name) { rc = -ENOMEM; goto err_free_path; } - ref->ttl = ce->ce_ttl; - ref->server_type = ce->ce_srvtype; - ref->ref_flag = ce->ce_flags; + ref->path_consumed = ce->path_consumed; + ref->ttl = ce->ttl; + ref->server_type = ce->srvtype; + ref->ref_flag = ce->flags; return 0; @@ -757,38 +750,37 @@ err_free_path: } /* Return target list of a DFS cache entry */ -static int get_tgt_list(const struct dfs_cache_entry *ce, - struct dfs_cache_tgt_list *tl) +static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl) { int rc; struct list_head *head = &tl->tl_list; - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; struct dfs_cache_tgt_iterator *it, *nit; memset(tl, 0, sizeof(*tl)); INIT_LIST_HEAD(head); - list_for_each_entry(t, &ce->ce_tlist, t_list) { - it = kzalloc(sizeof(*it), GFP_KERNEL); + list_for_each_entry(t, &ce->tlist, list) { + it = kzalloc(sizeof(*it), GFP_ATOMIC); if (!it) { rc = -ENOMEM; goto err_free_it; } - it->it_name = kstrndup(t->t_name, strlen(t->t_name), - GFP_KERNEL); + it->it_name = kstrndup(t->name, strlen(t->name), GFP_ATOMIC); if (!it->it_name) { kfree(it); rc = -ENOMEM; goto err_free_it; } - if (ce->ce_tgthint == t) + if (ce->tgthint == t) list_add(&it->it_list, head); else list_add_tail(&it->it_list, head); } - tl->tl_numtgts = ce->ce_numtgts; + + tl->tl_numtgts = ce->numtgts; return 0; @@ -829,28 +821,35 
@@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, { int rc; char *npath; - struct dfs_cache_entry *ce; - - if (unlikely(!is_path_valid(path))) - return -EINVAL; + struct cache_entry *ce; rc = get_normalized_path(path, &npath); if (rc) return rc; - mutex_lock(&dfs_cache_list_lock); - ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); - if (!IS_ERR(ce)) { - if (ref) - rc = setup_ref(path, ce, ref, get_tgt_name(ce)); - else - rc = 0; - if (!rc && tgt_list) - rc = get_tgt_list(ce, tgt_list); - } else { + rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + if (rc) + goto out_free_path; + + down_read(&htable_rw_lock); + + ce = lookup_cache_entry(npath, NULL); + if (IS_ERR(ce)) { + up_read(&htable_rw_lock); rc = PTR_ERR(ce); + goto out_free_path; } - mutex_unlock(&dfs_cache_list_lock); + + if (ref) + rc = setup_referral(path, ce, ref, get_tgt_name(ce)); + else + rc = 0; + if (!rc && tgt_list) + rc = get_targets(ce, tgt_list); + + up_read(&htable_rw_lock); + +out_free_path: free_normalized_path(path, npath); return rc; } @@ -876,31 +875,33 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, { int rc; char *npath; - struct dfs_cache_entry *ce; - - if (unlikely(!is_path_valid(path))) - return -EINVAL; + struct cache_entry *ce; rc = get_normalized_path(path, &npath); if (rc) return rc; - mutex_lock(&dfs_cache_list_lock); - ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true); + cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + + down_read(&htable_rw_lock); + + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } if (ref) - rc = setup_ref(path, ce, ref, get_tgt_name(ce)); + rc = setup_referral(path, ce, ref, get_tgt_name(ce)); else rc = 0; if (!rc && tgt_list) - rc = get_tgt_list(ce, tgt_list); -out: - mutex_unlock(&dfs_cache_list_lock); + rc = get_targets(ce, tgt_list); + +out_unlock: + up_read(&htable_rw_lock); free_normalized_path(path, npath); + return rc; } @@ -929,44 +930,46 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, { int rc; char *npath; - struct dfs_cache_entry *ce; - struct dfs_cache_tgt *t; - - if (unlikely(!is_path_valid(path))) - return -EINVAL; + struct cache_entry *ce; + struct cache_dfs_tgt *t; rc = get_normalized_path(path, &npath); if (rc) return rc; - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath); - mutex_lock(&dfs_cache_list_lock); - ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + if (rc) + goto out_free_path; + + down_write(&htable_rw_lock); + + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } - rc = 0; - - t = ce->ce_tgthint; + t = ce->tgthint; - if (likely(!strcasecmp(it->it_name, t->t_name))) - goto out; + if (likely(!strcasecmp(it->it_name, t->name))) + goto out_unlock; - list_for_each_entry(t, &ce->ce_tlist, t_list) { - if (!strcasecmp(t->t_name, it->it_name)) { - ce->ce_tgthint = t; + list_for_each_entry(t, &ce->tlist, list) { + if (!strcasecmp(t->name, it->it_name)) { + ce->tgthint = t; cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, it->it_name); break; } } -out: - mutex_unlock(&dfs_cache_list_lock); +out_unlock: + up_write(&htable_rw_lock); +out_free_path: free_normalized_path(path, npath); + return rc; } @@ -989,10 +992,10 @@ int dfs_cache_noreq_update_tgthint(const char *path, 
{ int rc; char *npath; - struct dfs_cache_entry *ce; - struct dfs_cache_tgt *t; + struct cache_entry *ce; + struct cache_dfs_tgt *t; - if (unlikely(!is_path_valid(path)) || !it) + if (!it) return -EINVAL; rc = get_normalized_path(path, &npath); @@ -1001,33 +1004,33 @@ int dfs_cache_noreq_update_tgthint(const char *path, cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); - mutex_lock(&dfs_cache_list_lock); + down_write(&htable_rw_lock); - ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true); + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } rc = 0; + t = ce->tgthint; - t = ce->ce_tgthint; + if (unlikely(!strcasecmp(it->it_name, t->name))) + goto out_unlock; - if (unlikely(!strcasecmp(it->it_name, t->t_name))) - goto out; - - list_for_each_entry(t, &ce->ce_tlist, t_list) { - if (!strcasecmp(t->t_name, it->it_name)) { - ce->ce_tgthint = t; + list_for_each_entry(t, &ce->tlist, list) { + if (!strcasecmp(t->name, it->it_name)) { + ce->tgthint = t; cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, it->it_name); break; } } -out: - mutex_unlock(&dfs_cache_list_lock); +out_unlock: + up_write(&htable_rw_lock); free_normalized_path(path, npath); + return rc; } @@ -1047,13 +1050,10 @@ int dfs_cache_get_tgt_referral(const char *path, { int rc; char *npath; - struct dfs_cache_entry *ce; - unsigned int h; + struct cache_entry *ce; if (!it || !ref) return -EINVAL; - if (unlikely(!is_path_valid(path))) - return -EINVAL; rc = get_normalized_path(path, &npath); if (rc) @@ -1061,21 +1061,22 @@ int dfs_cache_get_tgt_referral(const char *path, cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); - mutex_lock(&dfs_cache_list_lock); + down_read(&htable_rw_lock); - ce = find_cache_entry(npath, &h); + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } cifs_dbg(FYI, "%s: target name: %s\n", __func__, it->it_name); - rc = setup_ref(path, ce, ref, it->it_name); + rc = setup_referral(path, ce, ref, it->it_name); -out: - mutex_unlock(&dfs_cache_list_lock); +out_unlock: + up_read(&htable_rw_lock); free_normalized_path(path, npath); + return rc; } @@ -1085,7 +1086,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new) if (vol->username) { new->username = kstrndup(vol->username, strlen(vol->username), - GFP_KERNEL); + GFP_KERNEL); if (!new->username) return -ENOMEM; } @@ -1103,7 +1104,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new) } if (vol->domainname) { new->domainname = kstrndup(vol->domainname, - strlen(vol->domainname), GFP_KERNEL); + strlen(vol->domainname), GFP_KERNEL); if (!new->domainname) goto err_free_unc; } @@ -1150,7 +1151,7 @@ err_free_username: int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath) { int rc; - struct dfs_cache_vol_info *vi; + struct vol_info *vi; if (!vol || !fullpath || !mntdata) return -EINVAL; @@ -1161,38 +1162,41 @@ int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath) if (!vi) return -ENOMEM; - vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); - if (!vi->vi_fullpath) { + vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + if (!vi->fullpath) { rc = -ENOMEM; goto err_free_vi; } - rc = dup_vol(vol, &vi->vi_vol); + rc = dup_vol(vol, &vi->smb_vol); if (rc) goto err_free_fullpath; - vi->vi_mntdata = mntdata; + vi->mntdata = mntdata; + spin_lock_init(&vi->smb_vol_lock); + kref_init(&vi->refcnt); + + spin_lock(&vol_list_lock); + list_add_tail(&vi->list, &vol_list); + 
spin_unlock(&vol_list_lock); - mutex_lock(&dfs_cache.dc_lock); - list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list); - mutex_unlock(&dfs_cache.dc_lock); return 0; err_free_fullpath: - kfree(vi->vi_fullpath); + kfree(vi->fullpath); err_free_vi: kfree(vi); return rc; } -static inline struct dfs_cache_vol_info *find_vol(const char *fullpath) +/* Must be called with vol_list_lock held */ +static struct vol_info *find_vol(const char *fullpath) { - struct dfs_cache_vol_info *vi; + struct vol_info *vi; - list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) { - cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__, - vi->vi_fullpath); - if (!strcasecmp(vi->vi_fullpath, fullpath)) + list_for_each_entry(vi, &vol_list, list) { + cifs_dbg(FYI, "%s: vi->fullpath: %s\n", __func__, vi->fullpath); + if (!strcasecmp(vi->fullpath, fullpath)) return vi; } return ERR_PTR(-ENOENT); @@ -1208,30 +1212,31 @@ static inline struct dfs_cache_vol_info *find_vol(const char *fullpath) */ int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server) { - int rc; - struct dfs_cache_vol_info *vi; + struct vol_info *vi; if (!fullpath || !server) return -EINVAL; cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - mutex_lock(&dfs_cache.dc_lock); - + spin_lock(&vol_list_lock); vi = find_vol(fullpath); if (IS_ERR(vi)) { - rc = PTR_ERR(vi); - goto out; + spin_unlock(&vol_list_lock); + return PTR_ERR(vi); } + kref_get(&vi->refcnt); + spin_unlock(&vol_list_lock); cifs_dbg(FYI, "%s: updating volume info\n", __func__); - memcpy(&vi->vi_vol.dstaddr, &server->dstaddr, - sizeof(vi->vi_vol.dstaddr)); - rc = 0; + spin_lock(&vi->smb_vol_lock); + memcpy(&vi->smb_vol.dstaddr, &server->dstaddr, + sizeof(vi->smb_vol.dstaddr)); + spin_unlock(&vi->smb_vol_lock); -out: - mutex_unlock(&dfs_cache.dc_lock); - return rc; + kref_put(&vi->refcnt, vol_release); + + return 0; } /** @@ -1241,18 +1246,18 @@ out: */ void dfs_cache_del_vol(const char *fullpath) { - struct dfs_cache_vol_info *vi; + struct vol_info *vi; if (!fullpath || !*fullpath) return; cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - mutex_lock(&dfs_cache.dc_lock); + spin_lock(&vol_list_lock); vi = find_vol(fullpath); - if (!IS_ERR(vi)) - free_vol(vi); - mutex_unlock(&dfs_cache.dc_lock); + spin_unlock(&vol_list_lock); + + kref_put(&vi->refcnt, vol_release); } /* Get all tcons that are within a DFS namespace and can be refreshed */ @@ -1280,7 +1285,7 @@ static void get_tcons(struct TCP_Server_Info *server, struct list_head *head) spin_unlock(&cifs_tcp_ses_lock); } -static inline bool is_dfs_link(const char *path) +static bool is_dfs_link(const char *path) { char *s; @@ -1290,7 +1295,7 @@ static inline bool is_dfs_link(const char *path) return !!strchr(s + 1, '\\'); } -static inline char *get_dfs_root(const char *path) +static char *get_dfs_root(const char *path) { char *s, *npath; @@ -1309,31 +1314,67 @@ static inline char *get_dfs_root(const char *path) return npath; } +static inline void put_tcp_server(struct TCP_Server_Info *server) +{ + cifs_put_tcp_session(server, 0); +} + +static struct TCP_Server_Info *get_tcp_server(struct smb_vol *vol) +{ + struct TCP_Server_Info *server; + + server = cifs_find_tcp_session(vol); + if (IS_ERR_OR_NULL(server)) + return NULL; + + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus != CifsGood) { + spin_unlock(&GlobalMid_Lock); + put_tcp_server(server); + return NULL; + } + spin_unlock(&GlobalMid_Lock); + + return server; +} + /* Find root SMB session out of a DFS link path */ -static struct cifs_ses 
*find_root_ses(struct dfs_cache_vol_info *vi, - struct cifs_tcon *tcon, const char *path) +static struct cifs_ses *find_root_ses(struct vol_info *vi, + struct cifs_tcon *tcon, + const char *path) { char *rpath; int rc; + struct cache_entry *ce; struct dfs_info3_param ref = {0}; char *mdata = NULL, *devname = NULL; struct TCP_Server_Info *server; struct cifs_ses *ses; - struct smb_vol vol; + struct smb_vol vol = {NULL}; rpath = get_dfs_root(path); if (IS_ERR(rpath)) return ERR_CAST(rpath); - memset(&vol, 0, sizeof(vol)); + down_read(&htable_rw_lock); + + ce = lookup_cache_entry(rpath, NULL); + if (IS_ERR(ce)) { + up_read(&htable_rw_lock); + ses = ERR_CAST(ce); + goto out; + } - rc = dfs_cache_noreq_find(rpath, &ref, NULL); + rc = setup_referral(path, ce, &ref, get_tgt_name(ce)); if (rc) { + up_read(&htable_rw_lock); ses = ERR_PTR(rc); goto out; } - mdata = cifs_compose_mount_options(vi->vi_mntdata, rpath, &ref, + up_read(&htable_rw_lock); + + mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref, &devname); free_dfs_info_param(&ref); @@ -1351,13 +1392,8 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi, goto out; } - server = cifs_find_tcp_session(&vol); - if (IS_ERR_OR_NULL(server)) { - ses = ERR_PTR(-EHOSTDOWN); - goto out; - } - if (server->tcpStatus != CifsGood) { - cifs_put_tcp_session(server, 0); + server = get_tcp_server(&vol); + if (!server) { ses = ERR_PTR(-EHOSTDOWN); goto out; } @@ -1373,17 +1409,15 @@ out: } /* Refresh DFS cache entry from a given tcon */ -static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi, - struct cifs_tcon *tcon) +static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon) { int rc = 0; unsigned int xid; char *path, *npath; - unsigned int h; - struct dfs_cache_entry *ce; + struct cache_entry *ce; + struct cifs_ses *root_ses = NULL, *ses; struct dfs_info3_param *refs = NULL; int numrefs = 0; - struct cifs_ses *root_ses = NULL, *ses; xid = get_xid(); @@ -1391,19 +1425,23 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi, rc = get_normalized_path(path, &npath); if (rc) - goto out; + goto out_free_xid; - mutex_lock(&dfs_cache_list_lock); - ce = find_cache_entry(npath, &h); - mutex_unlock(&dfs_cache_list_lock); + down_read(&htable_rw_lock); + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + up_read(&htable_rw_lock); + goto out_free_path; } - if (!cache_entry_expired(ce)) - goto out; + if (!cache_entry_expired(ce)) { + up_read(&htable_rw_lock); + goto out_free_path; + } + + up_read(&htable_rw_lock); /* If it's a DFS Link, then use root SMB session for refreshing it */ if (is_dfs_link(npath)) { @@ -1411,35 +1449,29 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi, if (IS_ERR(ses)) { rc = PTR_ERR(ses); root_ses = NULL; - goto out; + goto out_free_path; } } else { ses = tcon->ses; } - if (unlikely(!ses->server->ops->get_dfs_refer)) { - rc = -EOPNOTSUPP; - } else { - rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, - &numrefs, dc->dc_nlsc, - tcon->remap); - if (!rc) { - mutex_lock(&dfs_cache_list_lock); - ce = __update_cache_entry(npath, refs, numrefs); - mutex_unlock(&dfs_cache_list_lock); - dump_refs(refs, numrefs); - free_dfs_info_array(refs, numrefs); - if (IS_ERR(ce)) - rc = PTR_ERR(ce); - } + rc = get_dfs_referral(xid, ses, cache_nlsc, tcon->remap, npath, &refs, + &numrefs); + if (!rc) { + dump_refs(refs, numrefs); + rc = update_cache_entry(npath, refs, numrefs); + 
free_dfs_info_array(refs, numrefs); } -out: if (root_ses) cifs_put_smb_ses(root_ses); - free_xid(xid); +out_free_path: free_normalized_path(path, npath); + +out_free_xid: + free_xid(xid); + return rc; } /* @@ -1448,30 +1480,61 @@ out: */ static void refresh_cache_worker(struct work_struct *work) { - struct dfs_cache *dc = container_of(work, struct dfs_cache, - dc_refresh.work); - struct dfs_cache_vol_info *vi; + struct vol_info *vi, *nvi; struct TCP_Server_Info *server; - LIST_HEAD(list); + LIST_HEAD(vols); + LIST_HEAD(tcons); struct cifs_tcon *tcon, *ntcon; + int rc; - mutex_lock(&dc->dc_lock); - - list_for_each_entry(vi, &dc->dc_vol_list, vi_list) { - server = cifs_find_tcp_session(&vi->vi_vol); - if (IS_ERR_OR_NULL(server)) + /* + * Find SMB volumes that are eligible (server->tcpStatus == CifsGood) + * for refreshing. + */ + spin_lock(&vol_list_lock); + list_for_each_entry(vi, &vol_list, list) { + server = get_tcp_server(&vi->smb_vol); + if (!server) continue; - if (server->tcpStatus != CifsGood) - goto next; - get_tcons(server, &list); - list_for_each_entry_safe(tcon, ntcon, &list, ulist) { - do_refresh_tcon(dc, vi, tcon); + + kref_get(&vi->refcnt); + list_add_tail(&vi->rlist, &vols); + put_tcp_server(server); + } + spin_unlock(&vol_list_lock); + + /* Walk through all TCONs and refresh any expired cache entry */ + list_for_each_entry_safe(vi, nvi, &vols, rlist) { + spin_lock(&vi->smb_vol_lock); + server = get_tcp_server(&vi->smb_vol); + spin_unlock(&vi->smb_vol_lock); + + if (!server) + goto next_vol; + + get_tcons(server, &tcons); + rc = 0; + + list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) { + /* + * Skip tcp server if any of its tcons failed to refresh + * (possibily due to reconnects). + */ + if (!rc) + rc = refresh_tcon(vi, tcon); + list_del_init(&tcon->ulist); cifs_put_tcon(tcon); } -next: - cifs_put_tcp_session(server, 0); + + put_tcp_server(server); + +next_vol: + list_del_init(&vi->rlist); + kref_put(&vi->refcnt, vol_release); } - queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ); - mutex_unlock(&dc->dc_lock); + + spin_lock(&cache_ttl_lock); + queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); + spin_unlock(&cache_ttl_lock); } diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index 76c732943f5f..99ee44f8ad07 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -24,7 +24,7 @@ struct dfs_cache_tgt_iterator { extern int dfs_cache_init(void); extern void dfs_cache_destroy(void); -extern const struct file_operations dfscache_proc_fops; +extern const struct proc_ops dfscache_proc_ops; extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_codepage, int remap, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 043288b5c728..a4e8f7d445ac 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2921,7 +2921,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, "direct_writev couldn't get user pages " "(rc=%zd) iter type %d iov_offset %zd " "count %zd\n", - result, from->type, + result, iov_iter_type(from), from->iov_offset, from->count); dump_stack(); @@ -3132,7 +3132,7 @@ static ssize_t __cifs_writev( * In this case, fall back to non-direct write function. 
* this could be improved by getting pages directly in ITER_KVEC */ - if (direct && from->type & ITER_KVEC) { + if (direct && iov_iter_is_kvec(from)) { cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n"); direct = false; } @@ -3652,7 +3652,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, "couldn't get user pages (rc=%zd)" " iter type %d" " iov_offset %zd count %zd\n", - result, direct_iov.type, + result, iov_iter_type(&direct_iov), direct_iov.iov_offset, direct_iov.count); dump_stack(); @@ -3863,7 +3863,7 @@ static ssize_t __cifs_readv( * fall back to data copy read path * this could be improved by getting pages directly in ITER_KVEC */ - if (direct && to->type & ITER_KVEC) { + if (direct && iov_iter_is_kvec(to)) { cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n"); direct = false; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 026ed49e8aa4..676e96a7a9f0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2229,7 +2229,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, return -ENOTSUPP; } -static int cifs_truncate_page(struct address_space *mapping, loff_t from) +int cifs_truncate_page(struct address_space *mapping, loff_t from) { pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE - 1); @@ -2246,7 +2246,7 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) return rc; } -static void cifs_setsize(struct inode *inode, loff_t offset) +void cifs_setsize(struct inode *inode, loff_t offset) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 3925a7bfc74d..d17587c2c4ab 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -139,6 +139,28 @@ retry: dput(dentry); } +static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) +{ + if (!(fattr->cf_cifsattrs & ATTR_REPARSE)) + return false; + /* + * The DFS tags should be only intepreted by server side as per + * MS-FSCC 2.1.2.1, but let's include them anyway. + * + * Besides, if cf_cifstag is unset (0), then we still need it to be + * revalidated to know exactly what reparse point it is. + */ + switch (fattr->cf_cifstag) { + case IO_REPARSE_TAG_DFS: + case IO_REPARSE_TAG_DFSR: + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_NFS: + case 0: + return true; + } + return false; +} + static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -158,7 +180,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) * is a symbolic link, DFS referral or a reparse point with a direct * access like junctions, deduplicated files, NFS symlinks. 
*/ - if (fattr->cf_cifsattrs & ATTR_REPARSE) + if (reparse_file_needs_reval(fattr)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; /* non-unix readdir doesn't provide nlink */ @@ -194,19 +216,37 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } } +static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info) +{ + const FILE_DIRECTORY_INFO *fi = info; + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(fi->ExtFileAttributes); + fattr->cf_eof = le64_to_cpu(fi->EndOfFile); + fattr->cf_bytes = le64_to_cpu(fi->AllocationSize); + fattr->cf_createtime = le64_to_cpu(fi->CreationTime); + fattr->cf_atime = cifs_NTtimeToUnix(fi->LastAccessTime); + fattr->cf_ctime = cifs_NTtimeToUnix(fi->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(fi->LastWriteTime); +} + void cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, struct cifs_sb_info *cifs_sb) { - memset(fattr, 0, sizeof(*fattr)); - fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); - fattr->cf_eof = le64_to_cpu(info->EndOfFile); - fattr->cf_bytes = le64_to_cpu(info->AllocationSize); - fattr->cf_createtime = le64_to_cpu(info->CreationTime); - fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); - fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + __dir_info_to_fattr(fattr, info); + cifs_fill_common_info(fattr, cifs_sb); +} +static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, + SEARCH_ID_FULL_DIR_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + __dir_info_to_fattr(fattr, info); + + /* See MS-FSCC 2.4.18 FileIdFullDirectoryInformation */ + if (fattr->cf_cifsattrs & ATTR_REPARSE) + fattr->cf_cifstag = le32_to_cpu(info->EaSize); cifs_fill_common_info(fattr, cifs_sb); } @@ -755,6 +795,11 @@ static int cifs_filldir(char *find_entry, struct file *file, (FIND_FILE_STANDARD_INFO *)find_entry, cifs_sb); break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fulldir_info_to_fattr(&fattr, + (SEARCH_ID_FULL_DIR_INFO *)find_entry, + cifs_sb); + break; default: cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)find_entry, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 8b0b512c5792..afe1f03aabe3 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -67,7 +67,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, goto out; - if (oparms->tcon->use_resilient) { + if (oparms->tcon->use_resilient) { /* default timeout is 0, servers pick default (120 seconds) */ nr_ioctl_req.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 18c7a33adceb..5ef5e97a6d13 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -95,6 +95,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } + memset(&oparms, 0, sizeof(struct cifs_open_parms)); oparms.tcon = tcon; oparms.desired_access = desired_access; oparms.disposition = create_disposition; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 0516fc482d43..0511aaf451d4 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -743,7 +743,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid, { struct close_cancelled_open *cancelled; - cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); + cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC); if (!cancelled) return -ENOMEM; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a5c96bc522cb..6787fce26f20 100644 --- a/fs/cifs/smb2ops.c +++ 
b/fs/cifs/smb2ops.c @@ -12,6 +12,7 @@ #include <linux/uuid.h> #include <linux/sort.h> #include <crypto/aead.h> +#include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@ -616,6 +617,7 @@ smb2_close_cached_fid(struct kref *ref) cfid->fid->volatile_fid); cfid->is_valid = false; cfid->file_all_info_is_valid = false; + cfid->has_lease = false; } } @@ -626,13 +628,28 @@ void close_shroot(struct cached_fid *cfid) mutex_unlock(&cfid->fid_mutex); } +void close_shroot_lease_locked(struct cached_fid *cfid) +{ + if (cfid->has_lease) { + cfid->has_lease = false; + kref_put(&cfid->refcount, smb2_close_cached_fid); + } +} + +void close_shroot_lease(struct cached_fid *cfid) +{ + mutex_lock(&cfid->fid_mutex); + close_shroot_lease_locked(cfid); + mutex_unlock(&cfid->fid_mutex); +} + void smb2_cached_lease_break(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_shroot(cfid); + close_shroot_lease(cfid); } /* @@ -773,6 +790,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); + tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL); @@ -787,7 +805,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) sizeof(struct smb2_file_all_info), &rsp_iov[1], sizeof(struct smb2_file_all_info), (char *)&tcon->crfid.file_all_info)) - tcon->crfid.file_all_info_is_valid = 1; + tcon->crfid.file_all_info_is_valid = true; oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); @@ -1506,7 +1524,9 @@ smb2_ioctl_query_info(const unsigned int xid, COMPOUND_FID, COMPOUND_FID, qi.info_type, true, buffer, qi.output_buffer_length, - CIFSMaxBufSize); + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); } } else if (qi.flags == PASSTHRU_SET_INFO) { /* Can eventually relax perm check since server enforces too */ @@ -2036,14 +2056,33 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_search_info *srch_inf) { __le16 *utf16_path; - int rc; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + int resp_buftype[2]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qd_iov[SMB2_QUERY_DIRECTORY_IOV_SIZE]; + int rc, flags = 0; + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; + struct smb2_query_directory_rsp *qd_rsp = NULL; + struct smb2_create_rsp *op_rsp = NULL; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; oparms.disposition = FILE_OPEN; @@ -2054,22 +2093,75 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); - kfree(utf16_path); - if (rc) { - cifs_dbg(FYI, "open dir failed rc=%d\n", rc); - return rc; - } + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, 
utf16_path); + if (rc) + goto qdf_free; + smb2_set_next_command(tcon, &rqst[0]); + /* Query directory */ srch_inf->entries_in_buffer = 0; srch_inf->index_of_last_entry = 2; - rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, - fid->volatile_fid, 0, srch_inf); - if (rc) { - cifs_dbg(FYI, "query directory failed rc=%d\n", rc); + memset(&qd_iov, 0, sizeof(qd_iov)); + rqst[1].rq_iov = qd_iov; + rqst[1].rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE; + + rc = SMB2_query_directory_init(xid, tcon, &rqst[1], + COMPOUND_FID, COMPOUND_FID, + 0, srch_inf->info_level); + if (rc) + goto qdf_free; + + smb2_set_related(&rqst[1]); + + rc = compound_send_recv(xid, tcon->ses, flags, 2, rqst, + resp_buftype, rsp_iov); + + /* If the open failed there is nothing to do */ + op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; + if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) { + cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); + goto qdf_free; + } + fid->persistent_fid = op_rsp->PersistentFileId; + fid->volatile_fid = op_rsp->VolatileFileId; + + /* Anything else than ENODATA means a genuine error */ + if (rc && rc != -ENODATA) { SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); + cifs_dbg(FYI, "query_dir_first: query directory failed rc=%d\n", rc); + trace_smb3_query_dir_err(xid, fid->persistent_fid, + tcon->tid, tcon->ses->Suid, 0, 0, rc); + goto qdf_free; } + + qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base; + if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + trace_smb3_query_dir_done(xid, fid->persistent_fid, + tcon->tid, tcon->ses->Suid, 0, 0); + srch_inf->endOfSearch = true; + rc = 0; + goto qdf_free; + } + + rc = smb2_parse_query_directory(tcon, &rsp_iov[1], resp_buftype[1], + srch_inf); + if (rc) { + trace_smb3_query_dir_err(xid, fid->persistent_fid, tcon->tid, + tcon->ses->Suid, 0, 0, rc); + goto qdf_free; + } + resp_buftype[1] = CIFS_NO_BUFFER; + + trace_smb3_query_dir_done(xid, fid->persistent_fid, tcon->tid, + tcon->ses->Suid, 0, srch_inf->entries_in_buffer); + + qdf_free: + kfree(utf16_path); + SMB2_open_free(&rqst[0]); + SMB2_query_directory_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -2680,7 +2772,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl_init(tcon, &rqst[1], fid.persistent_fid, fid.volatile_fid, FSCTL_GET_REPARSE_POINT, - true /* is_fctl */, NULL, 0, CIFSMaxBufSize); + true /* is_fctl */, NULL, 0, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); if (rc) goto querty_exit; @@ -3078,28 +3173,32 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, } /* + * Extending the file + */ + if ((keep_size == false) && i_size_read(inode) < off + len) { + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) + smb2_set_sparse(xid, tcon, cfile, inode, false); + + eof = cpu_to_le64(off + len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc == 0) { + cifsi->server_eof = off + len; + cifs_setsize(inode, off + len); + cifs_truncate_page(inode->i_mapping, inode->i_size); + truncate_setsize(inode, off + len); + } + goto out; + } + + /* * Files are non-sparse by default so falloc may be a no-op - * Must check if file sparse. If not sparse, and not extending - * then no need to do anything since file already allocated + * Must check if file sparse. 
If not sparse, and since we are not + * extending then no need to do anything since file already allocated */ if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) { - if (keep_size == true) - rc = 0; - /* check if extending file */ - else if (i_size_read(inode) >= off + len) - /* not extending file and already not sparse */ - rc = 0; - /* BB: in future add else clause to extend file */ - else - rc = -EOPNOTSUPP; - if (rc) - trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, off, len, rc); - else - trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, off, len); - free_xid(xid); - return rc; + rc = 0; + goto out; } if ((keep_size == true) || (i_size_read(inode) >= off + len)) { @@ -3113,25 +3212,14 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { rc = -EOPNOTSUPP; - trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, off, len, rc); - free_xid(xid); - return rc; - } - - smb2_set_sparse(xid, tcon, cfile, inode, false); - rc = 0; - } else { - smb2_set_sparse(xid, tcon, cfile, inode, false); - rc = 0; - if (i_size_read(inode) < off + len) { - eof = cpu_to_le64(off + len); - rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, - &eof); + goto out; } } + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + +out: if (rc) trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, off, len, rc); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0ab6b1200288..14f209f7376f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -312,7 +312,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (server->tcpStatus != CifsNeedReconnect) break; - if (--retries) + if (retries && --retries) continue; /* @@ -1847,7 +1847,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot(&tcon->crfid); + close_shroot_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, &total_len); @@ -2199,13 +2199,14 @@ create_sd_buf(umode_t mode, unsigned int *len) struct cifs_ace *pace; unsigned int sdlen, acelen; - *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace), 8); + *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace) * 2, + 8); buf = kzalloc(*len, GFP_KERNEL); if (buf == NULL) return buf; sdlen = sizeof(struct smb3_sd) + sizeof(struct smb3_acl) + - sizeof(struct cifs_ace); + 2 * sizeof(struct cifs_ace); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct crt_sd_ctxt, sd)); @@ -2232,8 +2233,12 @@ create_sd_buf(umode_t mode, unsigned int *len) /* create one ACE to hold the mode embedded in reserved special SID */ pace = (struct cifs_ace *)(sizeof(struct crt_sd_ctxt) + (char *)buf); acelen = setup_special_mode_ACE(pace, (__u64)mode); + /* and one more ACE to allow access for authenticated users */ + pace = (struct cifs_ace *)(acelen + (sizeof(struct crt_sd_ctxt) + + (char *)buf)); + acelen += setup_authusers_ACE(pace); buf->acl.AclSize = cpu_to_le16(sizeof(struct cifs_acl) + acelen); - buf->acl.AceCount = cpu_to_le16(1); + buf->acl.AceCount = cpu_to_le16(2); return buf; } @@ -4296,56 +4301,38 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) /* * Readdir/FindFirst */ -int -SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, - u64 
persistent_fid, u64 volatile_fid, int index, - struct cifs_search_info *srch_inf) +int SMB2_query_directory_init(const unsigned int xid, + struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, + int index, int info_level) { - struct smb_rqst rqst; + struct TCP_Server_Info *server = tcon->ses->server; struct smb2_query_directory_req *req; - struct smb2_query_directory_rsp *rsp = NULL; - struct kvec iov[2]; - struct kvec rsp_iov; - int rc = 0; - int len; - int resp_buftype = CIFS_NO_BUFFER; unsigned char *bufptr; - struct TCP_Server_Info *server; - struct cifs_ses *ses = tcon->ses; __le16 asteriks = cpu_to_le16('*'); - char *end_of_smb; - unsigned int output_size = CIFSMaxBufSize; - size_t info_buf_size; - int flags = 0; + unsigned int output_size = CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE; unsigned int total_len; - - if (ses && (ses->server)) - server = ses->server; - else - return -EIO; + struct kvec *iov = rqst->rq_iov; + int len, rc; rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req, &total_len); if (rc) return rc; - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - switch (srch_inf->info_level) { + switch (info_level) { case SMB_FIND_FILE_DIRECTORY_INFO: req->FileInformationClass = FILE_DIRECTORY_INFORMATION; - info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; break; case SMB_FIND_FILE_ID_FULL_DIR_INFO: req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION; - info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", - srch_inf->info_level); - rc = -EINVAL; - goto qdir_exit; + info_level); + return -EINVAL; } req->FileIndex = cpu_to_le32(index); @@ -4374,40 +4361,50 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_base = (char *)(req->Buffer); iov[1].iov_len = len; - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = iov; - rqst.rq_nvec = 2; - trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, output_size); - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); - rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; + return 0; +} - if (rc) { - if (rc == -ENODATA && - rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { - trace_smb3_query_dir_done(xid, persistent_fid, - tcon->tid, tcon->ses->Suid, index, 0); - srch_inf->endOfSearch = true; - rc = 0; - } else { - trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, - tcon->ses->Suid, index, 0, rc); - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); - } - goto qdir_exit; +void SMB2_query_directory_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) { + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + } +} + +int +smb2_parse_query_directory(struct cifs_tcon *tcon, + struct kvec *rsp_iov, + int resp_buftype, + struct cifs_search_info *srch_inf) +{ + struct smb2_query_directory_rsp *rsp; + size_t info_buf_size; + char *end_of_smb; + int rc; + + rsp = (struct smb2_query_directory_rsp *)rsp_iov->iov_base; + + switch (srch_inf->info_level) { + case SMB_FIND_FILE_DIRECTORY_INFO: + info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; + break; + default: + cifs_tcon_dbg(VFS, "info level %u isn't supported\n", + srch_inf->info_level); + return -EINVAL; } rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), - 
le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, + le32_to_cpu(rsp->OutputBufferLength), rsp_iov, info_buf_size); - if (rc) { - trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, - tcon->ses->Suid, index, 0, rc); - goto qdir_exit; - } + if (rc) + return rc; srch_inf->unicode = true; @@ -4420,7 +4417,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, srch_inf->ntwrk_buf_start = (char *)rsp; srch_inf->srch_entries_start = srch_inf->last_entry = (char *)rsp + le16_to_cpu(rsp->OutputBufferOffset); - end_of_smb = rsp_iov.iov_len + (char *)rsp; + end_of_smb = rsp_iov->iov_len + (char *)rsp; srch_inf->entries_in_buffer = num_entries(srch_inf->srch_entries_start, end_of_smb, &srch_inf->last_entry, info_buf_size); @@ -4435,11 +4432,72 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, else cifs_tcon_dbg(VFS, "illegal search buffer type\n"); + return 0; +} + +int +SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int index, + struct cifs_search_info *srch_inf) +{ + struct smb_rqst rqst; + struct kvec iov[SMB2_QUERY_DIRECTORY_IOV_SIZE]; + struct smb2_query_directory_rsp *rsp = NULL; + int resp_buftype = CIFS_NO_BUFFER; + struct kvec rsp_iov; + int rc = 0; + struct cifs_ses *ses = tcon->ses; + int flags = 0; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); + rqst.rq_iov = iov; + rqst.rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE; + + rc = SMB2_query_directory_init(xid, tcon, &rqst, persistent_fid, + volatile_fid, index, + srch_inf->info_level); + if (rc) + goto qdir_exit; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; + + if (rc) { + if (rc == -ENODATA && + rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + trace_smb3_query_dir_done(xid, persistent_fid, + tcon->tid, tcon->ses->Suid, index, 0); + srch_inf->endOfSearch = true; + rc = 0; + } else { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } + goto qdir_exit; + } + + rc = smb2_parse_query_directory(tcon, &rsp_iov, resp_buftype, + srch_inf); + if (rc) { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); + goto qdir_exit; + } + resp_buftype = CIFS_NO_BUFFER; + trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, srch_inf->entries_in_buffer); - return rc; qdir_exit: + SMB2_query_directory_free(&rqst); free_rsp_buf(resp_buftype, rsp); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 7b1c379fdf7a..4c43dbd1e089 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1282,6 +1282,8 @@ struct smb2_echo_rsp { #define SMB2_INDEX_SPECIFIED 0x04 #define SMB2_REOPEN 0x10 +#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 + struct smb2_query_directory_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a18272c987fe..6c678e00046f 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,6 +70,8 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid); extern void close_shroot(struct cached_fid *cfid); +extern void close_shroot_lease(struct cached_fid *cfid); 
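[Editorial aside, not part of the patch] The SMB2_query_directory() rework above splits the monolithic call into SMB2_query_directory_init(), smb2_parse_query_directory() and SMB2_query_directory_free(), so a caller can build, send and parse the request as separate steps (for example as part of a compound chain). A minimal sketch of the non-compounded sequence, assuming the usual cifs types and omitting encryption flags, tracing and the STATUS_NO_MORE_FILES handling:

static int example_query_dir(const unsigned int xid, struct cifs_tcon *tcon,
			     u64 persistent_fid, u64 volatile_fid, int index,
			     struct cifs_search_info *srch_inf)
{
	struct kvec iov[SMB2_QUERY_DIRECTORY_IOV_SIZE] = {};
	struct kvec rsp_iov = {};
	struct smb_rqst rqst = {
		.rq_iov  = iov,
		.rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE,
	};
	int resp_buftype = CIFS_NO_BUFFER;
	int rc;

	/* step 1: fill the request iovecs */
	rc = SMB2_query_directory_init(xid, tcon, &rqst, persistent_fid,
				       volatile_fid, index,
				       srch_inf->info_level);
	if (rc)
		goto out;

	/* step 2: send (a compound caller would batch several rqsts here) */
	rc = cifs_send_recv(xid, tcon->ses, &rqst, &resp_buftype, 0, &rsp_iov);
	if (rc)
		goto out_free_rsp;

	/*
	 * step 3: parse; on success srch_inf keeps pointers into the response
	 * buffer, so it must not be freed here.
	 */
	rc = smb2_parse_query_directory(tcon, &rsp_iov, resp_buftype, srch_inf);
	if (rc == 0)
		resp_buftype = CIFS_NO_BUFFER;

out_free_rsp:
	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
out:
	SMB2_query_directory_free(&rqst);
	return rc;
}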
+extern void close_shroot_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, @@ -195,6 +197,11 @@ extern int SMB2_echo(struct TCP_Server_Info *server); extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf); +extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon, + struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, + int index, int info_level); +extern void SMB2_query_directory_free(struct smb_rqst *rqst); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, __le64 *eof); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 387c88704c52..fe6acfce3390 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -685,6 +685,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, * The default is for the mid to be synchronous, so the * default callback just wakes up the current task. */ + get_task_struct(current); + temp->creator = current; temp->callback = cifs_wake_up_task; temp->callback_data = current; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3d2e11f85cba..cb3ee916f527 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -76,6 +76,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) * The default is for the mid to be synchronous, so the * default callback just wakes up the current task. */ + get_task_struct(current); + temp->creator = current; temp->callback = cifs_wake_up_task; temp->callback_data = current; @@ -158,6 +160,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) } } #endif + put_task_struct(midEntry->creator); mempool_free(midEntry, cifs_mid_poolp); } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index db4ba8f6077e..b8299173ea7e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -32,7 +32,8 @@ #include "cifs_unicode.h" #define MAX_EA_VALUE_SIZE CIFSMaxBufSize -#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */ +#define CIFS_XATTR_CIFS_NTSD "system.cifs_ntsd" /* owner plus DACL */ #define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ #define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ /* @@ -40,12 +41,62 @@ * confusing users and using the 20+ year old term 'cifs' when it is no longer * secure, replaced by SMB2 (then even more highly secure SMB3) many years ago */ -#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" +#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */ +#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */ #define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */ #define SMB3_XATTR_CREATETIME "smb3.creationtime" /* user.smb3.creationtime */ /* BB need to add server (Samba e.g) support for security and trusted prefix */ -enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; +enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT, + XATTR_CIFS_NTSD }; + +static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, + struct inode *inode, char *full_path, + const void *value, size_t size) +{ + ssize_t rc = -EOPNOTSUPP; + __u32 *pattrib = (__u32 *)value; + __u32 attrib; + FILE_BASIC_INFO info_buf; + + if 
((value == NULL) || (size != sizeof(__u32))) + return -ERANGE; + + memset(&info_buf, 0, sizeof(info_buf)); + attrib = *pattrib; + info_buf.Attributes = cpu_to_le32(attrib); + if (pTcon->ses->server->ops->set_file_info) + rc = pTcon->ses->server->ops->set_file_info(inode, full_path, + &info_buf, xid); + if (rc == 0) + CIFS_I(inode)->cifsAttrs = attrib; + + return rc; +} + +static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, + struct inode *inode, char *full_path, + const void *value, size_t size) +{ + ssize_t rc = -EOPNOTSUPP; + __u64 *pcreation_time = (__u64 *)value; + __u64 creation_time; + FILE_BASIC_INFO info_buf; + + if ((value == NULL) || (size != sizeof(__u64))) + return -ERANGE; + + memset(&info_buf, 0, sizeof(info_buf)); + creation_time = *pcreation_time; + info_buf.CreationTime = cpu_to_le64(creation_time); + if (pTcon->ses->server->ops->set_file_info) + rc = pTcon->ses->server->ops->set_file_info(inode, full_path, + &info_buf, xid); + if (rc == 0) + CIFS_I(inode)->createtime = creation_time; + + return rc; +} static int cifs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, @@ -86,6 +137,23 @@ static int cifs_xattr_set(const struct xattr_handler *handler, switch (handler->flags) { case XATTR_USER: + cifs_dbg(FYI, "%s:setting user xattr %s\n", __func__, name); + if ((strcmp(name, CIFS_XATTR_ATTRIB) == 0) || + (strcmp(name, SMB3_XATTR_ATTRIB) == 0)) { + rc = cifs_attrib_set(xid, pTcon, inode, full_path, + value, size); + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(inode)->time = 0; + break; + } else if ((strcmp(name, CIFS_XATTR_CREATETIME) == 0) || + (strcmp(name, SMB3_XATTR_CREATETIME) == 0)) { + rc = cifs_creation_time_set(xid, pTcon, inode, + full_path, value, size); + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(inode)->time = 0; + break; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto out; @@ -95,7 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler, cifs_sb->local_nls, cifs_sb); break; - case XATTR_CIFS_ACL: { + case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD: { struct cifs_ntsd *pacl; if (!value) @@ -106,12 +175,25 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } else { memcpy(pacl, value, size); if (value && - pTcon->ses->server->ops->set_acl) - rc = pTcon->ses->server->ops->set_acl(pacl, - size, inode, - full_path, CIFS_ACL_DACL); - else + pTcon->ses->server->ops->set_acl) { + rc = 0; + if (handler->flags == XATTR_CIFS_NTSD) { + /* set owner and DACL */ + rc = pTcon->ses->server->ops->set_acl( + pacl, size, inode, + full_path, + CIFS_ACL_OWNER); + } + if (rc == 0) { + /* set DACL */ + rc = pTcon->ses->server->ops->set_acl( + pacl, size, inode, + full_path, + CIFS_ACL_DACL); + } + } else { rc = -EOPNOTSUPP; + } if (rc == 0) /* force revalidate of the inode */ CIFS_I(inode)->time = 0; kfree(pacl); @@ -179,7 +261,7 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, void *value, size_t size) { ssize_t rc; - __u64 * pcreatetime; + __u64 *pcreatetime; rc = cifs_revalidate_dentry_attr(dentry); if (rc) @@ -244,7 +326,9 @@ static int cifs_xattr_get(const struct xattr_handler *handler, full_path, name, value, size, cifs_sb); break; - case XATTR_CIFS_ACL: { + case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD: { + /* the whole ntsd is fetched regardless */ u32 acllen; struct cifs_ntsd *pacl; @@ -382,6 +466,26 @@ static const struct xattr_handler smb3_acl_xattr_handler = { .set = cifs_xattr_set, }; +static const struct 
xattr_handler cifs_cifs_ntsd_xattr_handler = { + .name = CIFS_XATTR_CIFS_NTSD, + .flags = XATTR_CIFS_NTSD, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +/* + * Although this is just an alias for the above, need to move away from + * confusing users and using the 20 year old term 'cifs' when it is no + * longer secure and was replaced by SMB2/SMB3 a long time ago, and + * SMB3 and later are highly secure. + */ +static const struct xattr_handler smb3_ntsd_xattr_handler = { + .name = SMB3_XATTR_CIFS_NTSD, + .flags = XATTR_CIFS_NTSD, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + static const struct xattr_handler cifs_posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = XATTR_ACL_ACCESS, @@ -401,6 +505,8 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &cifs_os2_xattr_handler, &cifs_cifs_acl_xattr_handler, &smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */ + &cifs_cifs_ntsd_xattr_handler, + &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_posix_acl_access_xattr_handler, &cifs_posix_acl_default_xattr_handler, NULL diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c deleted file mode 100644 index 358ea2ecf36b..000000000000 --- a/fs/compat_ioctl.c +++ /dev/null @@ -1,261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. - */ - -#include <linux/types.h> -#include <linux/compat.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/compiler.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/ioctl.h> -#include <linux/if.h> -#include <linux/raid/md_u.h> -#include <linux/falloc.h> -#include <linux/file.h> -#include <linux/ppp-ioctl.h> -#include <linux/if_pppox.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/blkdev.h> -#include <linux/serial.h> -#include <linux/ctype.h> -#include <linux/syscalls.h> -#include <linux/gfp.h> -#include <linux/cec.h> - -#include "internal.h" - -#ifdef CONFIG_BLOCK -#include <linux/cdrom.h> -#include <linux/fd.h> -#include <scsi/scsi.h> -#include <scsi/scsi_ioctl.h> -#include <scsi/sg.h> -#endif - -#include <linux/uaccess.h> -#include <linux/watchdog.h> - -#include <linux/hiddev.h> - - -#include <linux/sort.h> - -/* - * simple reversible transform to make our table more evenly - * distributed after sorting. 
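[Editorial aside, not part of the patch] With cifs_attrib_set() and cifs_creation_time_set() above, the DOS attribute bits and the create time become settable through the user xattrs whose full names the comments give (user.cifs.dosattrib / user.smb3.dosattrib and user.cifs.creationtime / user.smb3.creationtime). The value sizes are strict: exactly 4 bytes for the attributes and 8 bytes for the creation time, otherwise the setters return -ERANGE. A rough userspace sketch (the 0x2 "hidden" attribute value is the protocol-defined FILE_ATTRIBUTE_HIDDEN, not something taken from this patch):

#include <stdint.h>
#include <stdio.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	uint32_t dosattrib = 0x2;	/* FILE_ATTRIBUTE_HIDDEN; must be exactly 4 bytes */

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-cifs-mount>\n", argv[0]);
		return 1;
	}
	if (setxattr(argv[1], "user.smb3.dosattrib", &dosattrib,
		     sizeof(dosattrib), 0) != 0) {
		perror("setxattr");
		return 1;
	}
	return 0;
}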
- */ -#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) - -#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), -static unsigned int ioctl_pointer[] = { -#ifdef CONFIG_BLOCK -/* Big S */ -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) -COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) -COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) -#endif -#ifdef CONFIG_BLOCK -/* SG stuff */ -COMPATIBLE_IOCTL(SG_IO) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_TIMEOUT) -COMPATIBLE_IOCTL(SG_GET_TIMEOUT) -COMPATIBLE_IOCTL(SG_EMULATED_HOST) -COMPATIBLE_IOCTL(SG_GET_TRANSFORM) -COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_SCSI_ID) -COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) -COMPATIBLE_IOCTL(SG_GET_LOW_DMA) -COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) -COMPATIBLE_IOCTL(SG_SET_DEBUG) -COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) -COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) -COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) -COMPATIBLE_IOCTL(SG_SCSI_RESET) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) -COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) -#endif -}; - -/* - * Convert common ioctl arguments based on their command number - * - * Please do not add any code in here. Instead, implement - * a compat_ioctl operation in the place that handleÑ• the - * ioctl for the native case. - */ -static long do_ioctl_trans(unsigned int cmd, - unsigned long arg, struct file *file) -{ - return -ENOIOCTLCMD; -} - -static int compat_ioctl_check_table(unsigned int xcmd) -{ -#ifdef CONFIG_BLOCK - int i; - const int max = ARRAY_SIZE(ioctl_pointer) - 1; - - BUILD_BUG_ON(max >= (1 << 16)); - - /* guess initial offset into table, assuming a - normalized distribution */ - i = ((xcmd >> 16) * max) >> 16; - - /* do linear search up first, until greater or equal */ - while (ioctl_pointer[i] < xcmd && i < max) - i++; - - /* then do linear search down */ - while (ioctl_pointer[i] > xcmd && i > 0) - i--; - - return ioctl_pointer[i] == xcmd; -#else - return 0; -#endif -} - -COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg32) -{ - unsigned long arg = arg32; - struct fd f = fdget(fd); - int error = -EBADF; - if (!f.file) - goto out; - - /* RED-PEN how should LSM module know it's handling 32bit? */ - error = security_file_ioctl(f.file, cmd, arg); - if (error) - goto out_fput; - - switch (cmd) { - /* these are never seen by ->ioctl(), no argument or int argument */ - case FIOCLEX: - case FIONCLEX: - case FIFREEZE: - case FITHAW: - case FICLONE: - goto do_ioctl; - /* these are never seen by ->ioctl(), pointer argument */ - case FIONBIO: - case FIOASYNC: - case FIOQSIZE: - case FS_IOC_FIEMAP: - case FIGETBSZ: - case FICLONERANGE: - case FIDEDUPERANGE: - goto found_handler; - /* - * The next group is the stuff handled inside file_ioctl(). - * For regular files these never reach ->ioctl(); for - * devices, sockets, etc. they do and one (FIONREAD) is - * even accepted in some cases. In all those cases - * argument has the same type, so we can handle these - * here, shunting them towards do_vfs_ioctl(). - * ->compat_ioctl() will never see any of those. 
- */ - /* pointer argument, never actually handled by ->ioctl() */ - case FIBMAP: - goto found_handler; - /* handled by some ->ioctl(); always a pointer to int */ - case FIONREAD: - goto found_handler; - /* these get messy on amd64 due to alignment differences */ -#if defined(CONFIG_X86_64) - case FS_IOC_RESVSP_32: - case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); - goto out_fput; - case FS_IOC_UNRESVSP_32: - case FS_IOC_UNRESVSP64_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, - compat_ptr(arg)); - goto out_fput; - case FS_IOC_ZERO_RANGE_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, - compat_ptr(arg)); - goto out_fput; -#else - case FS_IOC_RESVSP: - case FS_IOC_RESVSP64: - case FS_IOC_UNRESVSP: - case FS_IOC_UNRESVSP64: - case FS_IOC_ZERO_RANGE: - goto found_handler; -#endif - - default: - if (f.file->f_op->compat_ioctl) { - error = f.file->f_op->compat_ioctl(f.file, cmd, arg); - if (error != -ENOIOCTLCMD) - goto out_fput; - } - - if (!f.file->f_op->unlocked_ioctl) - goto do_ioctl; - break; - } - - if (compat_ioctl_check_table(XFORM(cmd))) - goto found_handler; - - error = do_ioctl_trans(cmd, arg, f.file); - if (error == -ENOIOCTLCMD) - error = -ENOTTY; - - goto out_fput; - - found_handler: - arg = (unsigned long)compat_ptr(arg); - do_ioctl: - error = do_vfs_ioctl(f.file, fd, cmd, arg); - out_fput: - fdput(f); - out: - return error; -} - -static int __init init_sys32_ioctl_cmp(const void *p, const void *q) -{ - unsigned int a, b; - a = *(unsigned int *)p; - b = *(unsigned int *)q; - if (a > b) - return 1; - if (a < b) - return -1; - return 0; -} - -static int __init init_sys32_ioctl(void) -{ - sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), - init_sys32_ioctl_cmp, NULL); - return 0; -} -__initcall(init_sys32_ioctl); diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index ff5a1746cbae..8046d7c7a3e9 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -2,13 +2,8 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_SHA512 - select CRYPTO_HMAC + select CRYPTO_HASH + select CRYPTO_SKCIPHER select KEYS help Enable encryption of files and directories. This @@ -16,3 +11,16 @@ config FS_ENCRYPTION efficient since it avoids caching the encrypted and decrypted pages in the page cache. Currently Ext4, F2FS and UBIFS make use of this feature. + +# Filesystems supporting encryption must select this if FS_ENCRYPTION. This +# allows the algorithms to be built as modules when all the filesystems are. +config FS_ENCRYPTION_ALGS + tristate + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_ECB + select CRYPTO_HMAC + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_XTS diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 1f4b8a277060..4fa18fff9c4e 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -41,53 +41,101 @@ void fscrypt_decrypt_bio(struct bio *bio) } EXPORT_SYMBOL(fscrypt_decrypt_bio); +/** + * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file + * @inode: the file's inode + * @lblk: the first file logical block to zero out + * @pblk: the first filesystem physical block to zero out + * @len: number of blocks to zero out + * + * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write + * ciphertext blocks which decrypt to the all-zeroes block. 
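[Editorial aside] To make the batching introduced below concrete: with 1 KiB filesystem blocks on 4 KiB pages, blocks_per_page is 4, so zeroing len = 100 blocks asks for DIV_ROUND_UP(100, 4) = 25 bounce pages, which the pages[16] array caps at 16; each bio then carries 16 * 4 = 64 encrypted blocks, and the outer loop submits ceil(100 / 64) = 2 bios instead of the 100 single-block bios the old per-block loop would have issued.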
The blocks must be + * both logically and physically contiguous. It's also assumed that the + * filesystem only uses a single block device, ->s_bdev. + * + * Note that since each block uses a different IV, this involves writing a + * different ciphertext to each block; we can't simply reuse the same one. + * + * Return: 0 on success; -errno on failure. + */ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) + sector_t pblk, unsigned int len) { const unsigned int blockbits = inode->i_blkbits; const unsigned int blocksize = 1 << blockbits; - struct page *ciphertext_page; + const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits; + const unsigned int blocks_per_page = 1 << blocks_per_page_bits; + struct page *pages[16]; /* write up to 16 pages at a time */ + unsigned int nr_pages; + unsigned int i; + unsigned int offset; struct bio *bio; - int ret, err = 0; + int ret, err; - ciphertext_page = fscrypt_alloc_bounce_page(GFP_NOWAIT); - if (!ciphertext_page) - return -ENOMEM; + if (len == 0) + return 0; - while (len--) { - err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - blocksize, 0, GFP_NOFS); - if (err) - goto errout; + BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES); + nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), + (len + blocks_per_page - 1) >> blocks_per_page_bits); - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } + /* + * We need at least one page for ciphertext. Allocate the first one + * from a mempool, with __GFP_DIRECT_RECLAIM set so that it can't fail. + * + * Any additional page allocations are allowed to fail, as they only + * help performance, and waiting on the mempool for them could deadlock. + */ + for (i = 0; i < nr_pages; i++) { + pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS : + GFP_NOWAIT | __GFP_NOWARN); + if (!pages[i]) + break; + } + nr_pages = i; + if (WARN_ON(nr_pages <= 0)) + return -EINVAL; + + /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ + bio = bio_alloc(GFP_NOFS, nr_pages); + + do { bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - 9); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - ret = bio_add_page(bio, ciphertext_page, blocksize, 0); - if (WARN_ON(ret != blocksize)) { - /* should never happen! 
*/ - bio_put(bio); - err = -EIO; - goto errout; - } + + i = 0; + offset = 0; + do { + err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), pages[i], + blocksize, offset, GFP_NOFS); + if (err) + goto out; + lblk++; + pblk++; + len--; + offset += blocksize; + if (offset == PAGE_SIZE || len == 0) { + ret = bio_add_page(bio, pages[i++], offset, 0); + if (WARN_ON(ret != offset)) { + err = -EIO; + goto out; + } + offset = 0; + } + } while (i != nr_pages && len != 0); + err = submit_bio_wait(bio); - if (err == 0 && bio->bi_status) - err = -EIO; - bio_put(bio); if (err) - goto errout; - lblk++; - pblk++; - } + goto out; + bio_reset(bio); + } while (len != 0); err = 0; -errout: - fscrypt_free_bounce_page(ciphertext_page); +out: + bio_put(bio); + for (i = 0; i < nr_pages; i++) + fscrypt_free_bounce_page(pages[i]); return err; } EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 3719efa546c6..1ecaac7ee3cb 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -25,8 +25,6 @@ #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/dcache.h> -#include <linux/namei.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" @@ -140,7 +138,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, * multiple of the filesystem's block size. * @offs: Byte offset within @page of the first block to encrypt. Must be * a multiple of the filesystem's block size. - * @gfp_flags: Memory allocation flags + * @gfp_flags: Memory allocation flags. See details below. * * A new bounce page is allocated, and the specified block(s) are encrypted into * it. In the bounce page, the ciphertext block(s) will be located at the same @@ -150,6 +148,11 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, * * This is for use by the filesystem's ->writepages() method. * + * The bounce page allocation is mempool-backed, so it will always succeed when + * @gfp_flags includes __GFP_DIRECT_RECLAIM, e.g. when it's GFP_NOFS. However, + * only the first page of each bio can be allocated this way. To prevent + * deadlocks, for any additional pages a mask like GFP_NOWAIT must be used. + * * Return: the new encrypted bounce page on success; an ERR_PTR() on failure */ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, @@ -286,54 +289,6 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); -/* - * Validate dentries in encrypted directories to make sure we aren't potentially - * caching stale dentries after a key has been added. - */ -static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct dentry *dir; - int err; - int valid; - - /* - * Plaintext names are always valid, since fscrypt doesn't support - * reverting to ciphertext names without evicting the directory's inode - * -- which implies eviction of the dentries in the directory. - */ - if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) - return 1; - - /* - * Ciphertext name; valid if the directory's key is still unavailable. - * - * Although fscrypt forbids rename() on ciphertext names, we still must - * use dget_parent() here rather than use ->d_parent directly. That's - * because a corrupted fs image may contain directory hard links, which - * the VFS handles by moving the directory's dentry tree in the dcache - * each time ->lookup() finds the directory and it already has a dentry - * elsewhere. 
Thus ->d_parent can be changing, and we must safely grab - * a reference to some ->d_parent to prevent it from being freed. - */ - - if (flags & LOOKUP_RCU) - return -ECHILD; - - dir = dget_parent(dentry); - err = fscrypt_get_encryption_info(d_inode(dir)); - valid = !fscrypt_has_encryption_key(d_inode(dir)); - dput(dir); - - if (err < 0) - return err; - - return valid; -} - -const struct dentry_operations fscrypt_d_ops = { - .d_revalidate = fscrypt_d_revalidate, -}; - /** * fscrypt_initialize() - allocate major buffers for fs encryption. * @cop_flags: fscrypt operations flags diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 3da3707c10e3..4c212442a8f7 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -11,10 +11,87 @@ * This has not yet undergone a rigorous security audit. */ +#include <linux/namei.h> #include <linux/scatterlist.h> +#include <crypto/hash.h> +#include <crypto/sha.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" +/** + * struct fscrypt_nokey_name - identifier for directory entry when key is absent + * + * When userspace lists an encrypted directory without access to the key, the + * filesystem must present a unique "no-key name" for each filename that allows + * it to find the directory entry again if requested. Naively, that would just + * mean using the ciphertext filenames. However, since the ciphertext filenames + * can contain illegal characters ('\0' and '/'), they must be encoded in some + * way. We use base64. But that can cause names to exceed NAME_MAX (255 + * bytes), so we also need to use a strong hash to abbreviate long names. + * + * The filesystem may also need another kind of hash, the "dirhash", to quickly + * find the directory entry. Since filesystems normally compute the dirhash + * over the on-disk filename (i.e. the ciphertext), it's not computable from + * no-key names that abbreviate the ciphertext using the strong hash to fit in + * NAME_MAX. It's also not computable if it's a keyed hash taken over the + * plaintext (but it may still be available in the on-disk directory entry); + * casefolded directories use this type of dirhash. At least in these cases, + * each no-key name must include the name's dirhash too. + * + * To meet all these requirements, we base64-encode the following + * variable-length structure. It contains the dirhash, or 0's if the filesystem + * didn't provide one; up to 149 bytes of the ciphertext name; and for + * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes. + * + * This ensures that each no-key name contains everything needed to find the + * directory entry again, contains only legal characters, doesn't exceed + * NAME_MAX, is unambiguous unless there's a SHA-256 collision, and that we only + * take the performance hit of SHA-256 on very long filenames (which are rare). + */ +struct fscrypt_nokey_name { + u32 dirhash[2]; + u8 bytes[149]; + u8 sha256[SHA256_DIGEST_SIZE]; +}; /* 189 bytes => 252 bytes base64-encoded, which is <= NAME_MAX (255) */ + +/* + * Decoded size of max-size nokey name, i.e. a name that was abbreviated using + * the strong hash and thus includes the 'sha256' field. This isn't simply + * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included. 
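[Editorial note] Concretely: the fields occupy 2*4 + 149 + 32 = 189 bytes, which is what offsetofend() reports, whereas sizeof(struct fscrypt_nokey_name) would round up to 192 for the 4-byte alignment demanded by the u32 dirhash member, so using sizeof would drag three meaningless padding bytes into the encoded name.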
+ */ +#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) + +static struct crypto_shash *sha256_hash_tfm; + +static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) +{ + struct crypto_shash *tfm = READ_ONCE(sha256_hash_tfm); + + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + fscrypt_err(NULL, + "Error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&sha256_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + { + SHASH_DESC_ON_STACK(desc, tfm); + + desc->tfm = tfm; + + return crypto_shash_digest(desc, data, data_len, result); + } +} + static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -27,19 +104,19 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) } /** - * fname_encrypt() - encrypt a filename + * fscrypt_fname_encrypt() - encrypt a filename * * The output buffer must be at least as large as the input buffer. * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ -int fname_encrypt(struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen) +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; union fscrypt_iv iv; struct scatterlist sg; @@ -85,14 +162,14 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, * * Return: 0 on success, -errno on failure */ -static int fname_decrypt(struct inode *inode, - const struct fscrypt_str *iname, - struct fscrypt_str *oname) +static int fname_decrypt(const struct inode *inode, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; union fscrypt_iv iv; int res; @@ -206,9 +283,7 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 max_encrypted_len, struct fscrypt_str *crypto_str) { - const u32 max_encoded_len = - max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), - 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); + const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX); u32 max_presented_len; max_presented_len = max(max_encoded_len, max_encrypted_len); @@ -241,19 +316,21 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * - * If the key is available, we'll decrypt the disk name; otherwise, we'll encode - * it for presentation. Short names are directly base64-encoded, while long - * names are encoded in fscrypt_digested_name format. + * If the key is available, we'll decrypt the disk name. Otherwise, we'll + * encode it for presentation in fscrypt_nokey_name format. + * See struct fscrypt_nokey_name for details. 
* * Return: 0 on success, -errno on failure */ -int fscrypt_fname_disk_to_usr(struct inode *inode, - u32 hash, u32 minor_hash, - const struct fscrypt_str *iname, - struct fscrypt_str *oname) +int fscrypt_fname_disk_to_usr(const struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - struct fscrypt_digested_name digested_name; + struct fscrypt_nokey_name nokey_name; + u32 size; /* size of the unencoded no-key name */ + int err; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -268,24 +345,37 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (fscrypt_has_encryption_key(inode)) return fname_decrypt(inode, iname, oname); - if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { - oname->len = base64_encode(iname->name, iname->len, - oname->name); - return 0; - } + /* + * Sanity check that struct fscrypt_nokey_name doesn't have padding + * between fields and that its encoded size never exceeds NAME_MAX. + */ + BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, dirhash) != + offsetof(struct fscrypt_nokey_name, bytes)); + BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) != + offsetof(struct fscrypt_nokey_name, sha256)); + BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX); + if (hash) { - digested_name.hash = hash; - digested_name.minor_hash = minor_hash; + nokey_name.dirhash[0] = hash; + nokey_name.dirhash[1] = minor_hash; + } else { + nokey_name.dirhash[0] = 0; + nokey_name.dirhash[1] = 0; + } + if (iname->len <= sizeof(nokey_name.bytes)) { + memcpy(nokey_name.bytes, iname->name, iname->len); + size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); } else { - digested_name.hash = 0; - digested_name.minor_hash = 0; + memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); + /* Compute strong hash of remaining part of name. */ + err = fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)], + iname->len - sizeof(nokey_name.bytes), + nokey_name.sha256); + if (err) + return err; + size = FSCRYPT_NOKEY_NAME_MAX; } - memcpy(digested_name.digest, - FSCRYPT_FNAME_DIGEST(iname->name, iname->len), - FSCRYPT_FNAME_DIGEST_SIZE); - oname->name[0] = '_'; - oname->len = 1 + base64_encode((const u8 *)&digested_name, - sizeof(digested_name), oname->name + 1); + oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -306,8 +396,7 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); * get the disk_name. * * Else, for keyless @lookup operations, @iname is the presented ciphertext, so - * we decode it to get either the ciphertext disk_name (for short names) or the - * fscrypt_digested_name (for long names). Non-@lookup operations will be + * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be * impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. 
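[Editorial aside] A self-contained check of the length bound asserted above, assuming fscrypt's BASE64_CHARS(nbytes) is DIV_ROUND_UP(nbytes * 4, 3) (four output characters per three input bytes):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define BASE64_CHARS(nbytes)	DIV_ROUND_UP((nbytes) * 4, 3)

int main(void)
{
	unsigned int unencoded = 2 * 4 + 149 + 32;	/* dirhash + bytes + sha256 */

	/* prints: 189 bytes -> 252 base64 chars (NAME_MAX is 255) */
	printf("%u bytes -> %u base64 chars (NAME_MAX is 255)\n",
	       unencoded, BASE64_CHARS(unencoded));
	return 0;
}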
@@ -317,8 +406,8 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { + struct fscrypt_nokey_name *nokey_name; int ret; - int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -342,8 +431,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (!fname->crypto_buf.name) return -ENOMEM; - ret = fname_encrypt(dir, iname, fname->crypto_buf.name, - fname->crypto_buf.len); + ret = fscrypt_fname_encrypt(dir, iname, fname->crypto_buf.name, + fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -358,40 +447,31 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') { - if (iname->len != - 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) - return -ENOENT; - digested = 1; - } else { - if (iname->len > - BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) - return -ENOENT; - digested = 0; - } - fname->crypto_buf.name = - kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, - sizeof(struct fscrypt_digested_name)), - GFP_KERNEL); + if (iname->len > BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX)) + return -ENOENT; + + fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = base64_decode(iname->name + digested, iname->len - digested, - fname->crypto_buf.name); - if (ret < 0) { + ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name); + if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || + (ret > offsetof(struct fscrypt_nokey_name, sha256) && + ret != FSCRYPT_NOKEY_NAME_MAX)) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (digested) { - const struct fscrypt_digested_name *n = - (const void *)fname->crypto_buf.name; - fname->hash = n->hash; - fname->minor_hash = n->minor_hash; - } else { - fname->disk_name.name = fname->crypto_buf.name; - fname->disk_name.len = fname->crypto_buf.len; + + nokey_name = (void *)fname->crypto_buf.name; + fname->hash = nokey_name->dirhash[0]; + fname->minor_hash = nokey_name->dirhash[1]; + if (ret != FSCRYPT_NOKEY_NAME_MAX) { + /* The full ciphertext filename is available. */ + fname->disk_name.name = nokey_name->bytes; + fname->disk_name.len = + ret - offsetof(struct fscrypt_nokey_name, bytes); } return 0; @@ -400,3 +480,109 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and the name we're + * looking for is very long, then we won't have the full disk_name and instead + * we'll need to match against a fscrypt_nokey_name that includes a strong hash. + * + * Return: %true if the name matches, otherwise %false. 
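[Editorial note] Working out the decoded-length check in fscrypt_setup_filename() above: offsetof(struct fscrypt_nokey_name, bytes[1]) is 9, offsetof(..., sha256) is 157, and FSCRYPT_NOKEY_NAME_MAX is 189, so a base64-decoded no-key name is accepted only if it is 9 to 157 bytes long (dirhash plus 1 to 149 ciphertext bytes) or exactly 189 bytes (dirhash, 149 ciphertext bytes, and the SHA-256 of the remainder).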
+ */ +bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + const struct fscrypt_nokey_name *nokey_name = + (const void *)fname->crypto_buf.name; + u8 sha256[SHA256_DIGEST_SIZE]; + + if (likely(fname->disk_name.name)) { + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, de_name_len); + } + if (de_name_len <= sizeof(nokey_name->bytes)) + return false; + if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) + return false; + if (fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)], + de_name_len - sizeof(nokey_name->bytes), sha256)) + return false; + return !memcmp(sha256, nokey_name->sha256, sizeof(sha256)); +} +EXPORT_SYMBOL_GPL(fscrypt_match_name); + +/** + * fscrypt_fname_siphash() - calculate the SipHash of a filename + * @dir: the parent directory + * @name: the filename to calculate the SipHash of + * + * Given a plaintext filename @name and a directory @dir which uses SipHash as + * its dirhash method and has had its fscrypt key set up, this function + * calculates the SipHash of that name using the directory's secret dirhash key. + * + * Return: the SipHash of @name using the hash key of @dir + */ +u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) +{ + const struct fscrypt_info *ci = dir->i_crypt_info; + + WARN_ON(!ci->ci_dirhash_key_initialized); + + return siphash(name->name, name->len, &ci->ci_dirhash_key); +} +EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); + +/* + * Validate dentries in encrypted directories to make sure we aren't potentially + * caching stale dentries after a key has been added. + */ +static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + int err; + int valid; + + /* + * Plaintext names are always valid, since fscrypt doesn't support + * reverting to ciphertext names without evicting the directory's inode + * -- which implies eviction of the dentries in the directory. + */ + if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) + return 1; + + /* + * Ciphertext name; valid if the directory's key is still unavailable. + * + * Although fscrypt forbids rename() on ciphertext names, we still must + * use dget_parent() here rather than use ->d_parent directly. That's + * because a corrupted fs image may contain directory hard links, which + * the VFS handles by moving the directory's dentry tree in the dcache + * each time ->lookup() finds the directory and it already has a dentry + * elsewhere. Thus ->d_parent can be changing, and we must safely grab + * a reference to some ->d_parent to prevent it from being freed. 
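[Editorial aside, not part of the patch] A hypothetical sketch of how a filesystem's lookup path strings these helpers together; the directory-entry iteration is elided since it is filesystem specific:

static int example_lookup_by_name(struct inode *dir, const struct qstr *name)
{
	struct fscrypt_name fname;
	int err;

	err = fscrypt_setup_filename(dir, name, 1 /* lookup */, &fname);
	if (err)
		return err;

	/*
	 * For each on-disk entry (de_name, de_name_len) in dir:
	 *
	 *	if (fscrypt_match_name(&fname, de_name, de_name_len))
	 *		... found it ...
	 *
	 * fscrypt_match_name() compares the raw ciphertext when
	 * fname.disk_name is set, and otherwise falls back to the
	 * 149-byte prefix plus SHA-256 comparison shown above.
	 */

	fscrypt_free_filename(&fname);
	return 0;
}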
+ */ + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + err = fscrypt_get_encryption_info(d_inode(dir)); + valid = !fscrypt_has_encryption_key(d_inode(dir)); + dput(dir); + + if (err < 0) + return err; + + return valid; +} + +const struct dentry_operations fscrypt_d_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 130b50e5a011..9aae851409e5 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,6 +12,7 @@ #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> +#include <linux/siphash.h> #include <crypto/hash.h> #define CONST_STRLEN(str) (sizeof(str) - 1) @@ -136,12 +137,6 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } -static inline bool -fscrypt_is_direct_key_policy(const union fscrypt_policy *policy) -{ - return fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAG_DIRECT_KEY; -} - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. @@ -194,6 +189,14 @@ struct fscrypt_info { */ struct fscrypt_direct_key *ci_direct_key; + /* + * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 + * key. This is only set for directories that use a keyed dirhash over + * the plaintext filenames -- currently just casefolded directories. + */ + siphash_key_t ci_dirhash_key; + bool ci_dirhash_key_initialized; + /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; @@ -206,24 +209,6 @@ typedef enum { FS_ENCRYPT, } fscrypt_direction_t; -static inline bool fscrypt_valid_enc_modes(u32 contents_mode, - u32 filenames_mode) -{ - if (contents_mode == FSCRYPT_MODE_AES_128_CBC && - filenames_mode == FSCRYPT_MODE_AES_128_CTS) - return true; - - if (contents_mode == FSCRYPT_MODE_AES_256_XTS && - filenames_mode == FSCRYPT_MODE_AES_256_CTS) - return true; - - if (contents_mode == FSCRYPT_MODE_ADIANTUM && - filenames_mode == FSCRYPT_MODE_ADIANTUM) - return true; - - return false; -} - /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); @@ -233,7 +218,6 @@ extern int fscrypt_crypt_block(const struct inode *inode, unsigned int len, unsigned int offs, gfp_t gfp_flags); extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); -extern const struct dentry_operations fscrypt_d_ops; extern void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); @@ -260,11 +244,13 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -extern int fname_encrypt(struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen); +extern int fscrypt_fname_encrypt(const struct inode *inode, + const struct qstr *iname, + u8 *out, unsigned int olen); extern bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); +extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -283,11 +269,12 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, * output doesn't reveal another. 
*/ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 -#define HKDF_CONTEXT_PER_FILE_KEY 2 +#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 #define HKDF_CONTEXT_DIRECT_KEY 3 #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 +#define HKDF_CONTEXT_DIRHASH_KEY 5 -extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, +extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); @@ -448,18 +435,17 @@ struct fscrypt_mode { int logged_impl_name; }; -static inline bool -fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode) -{ - return mode->ivsize >= offsetofend(union fscrypt_iv, nonce); -} +extern struct fscrypt_mode fscrypt_modes[]; extern struct crypto_skcipher * fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, const struct inode *inode); -extern int fscrypt_set_derived_key(struct fscrypt_info *ci, - const u8 *derived_key); +extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, + const u8 *raw_key); + +extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); /* keysetup_v1.c */ diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index f21873e1b467..efb95bd19a89 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -112,7 +112,7 @@ out: * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) */ -int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, +int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen) { diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index bb3b7fcfdd48..5ef861742921 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -5,6 +5,8 @@ * Encryption hooks for higher-level filesystem operations. */ +#include <linux/key.h> + #include "fscrypt_private.h" /** @@ -122,6 +124,48 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); +/** + * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS + * @inode: the inode on which flags are being changed + * @oldflags: the old flags + * @flags: the new flags + * + * The caller should be holding i_rwsem for write. + * + * Return: 0 on success; -errno if the flags change isn't allowed or if + * another error occurs. + */ +int fscrypt_prepare_setflags(struct inode *inode, + unsigned int oldflags, unsigned int flags) +{ + struct fscrypt_info *ci; + struct fscrypt_master_key *mk; + int err; + + /* + * When the CASEFOLD flag is set on an encrypted directory, we must + * derive the secret key needed for the dirhash. This is only possible + * if the directory uses a v2 encryption policy. 
+ */ + if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) { + err = fscrypt_require_key(inode); + if (err) + return err; + ci = inode->i_crypt_info; + if (ci->ci_policy.version != FSCRYPT_POLICY_V2) + return -EINVAL; + mk = ci->ci_master_key->payload.data[0]; + down_read(&mk->mk_secret_sem); + if (is_master_key_secret_present(&mk->mk_secret)) + err = fscrypt_derive_dirhash_key(ci, mk); + else + err = -ENOKEY; + up_read(&mk->mk_secret_sem); + return err; + } + return 0; +} + int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) @@ -188,7 +232,8 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, ciphertext_len = disk_link->len - sizeof(*sd); sd->len = cpu_to_le16(ciphertext_len); - err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); + err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, + ciphertext_len); if (err) goto err_free_sd; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 040df1f5e1c8..ab41b25d4fa1 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -151,7 +151,7 @@ static struct key *search_fscrypt_keyring(struct key *keyring, } #define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ - (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id)) + (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id)) #define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) @@ -465,6 +465,109 @@ out_unlock: return err; } +static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) +{ + const struct fscrypt_provisioning_key_payload *payload = prep->data; + + if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE || + prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE) + return -EINVAL; + + if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) + return -EINVAL; + + if (payload->__reserved) + return -EINVAL; + + prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL); + if (!prep->payload.data[0]) + return -ENOMEM; + + prep->quotalen = prep->datalen; + return 0; +} + +static void fscrypt_provisioning_key_free_preparse( + struct key_preparsed_payload *prep) +{ + kzfree(prep->payload.data[0]); +} + +static void fscrypt_provisioning_key_describe(const struct key *key, + struct seq_file *m) +{ + seq_puts(m, key->description); + if (key_is_positive(key)) { + const struct fscrypt_provisioning_key_payload *payload = + key->payload.data[0]; + + seq_printf(m, ": %u [%u]", key->datalen, payload->type); + } +} + +static void fscrypt_provisioning_key_destroy(struct key *key) +{ + kzfree(key->payload.data[0]); +} + +static struct key_type key_type_fscrypt_provisioning = { + .name = "fscrypt-provisioning", + .preparse = fscrypt_provisioning_key_preparse, + .free_preparse = fscrypt_provisioning_key_free_preparse, + .instantiate = generic_key_instantiate, + .describe = fscrypt_provisioning_key_describe, + .destroy = fscrypt_provisioning_key_destroy, +}; + +/* + * Retrieve the raw key from the Linux keyring key specified by 'key_id', and + * store it into 'secret'. + * + * The key must be of type "fscrypt-provisioning" and must have the field + * fscrypt_provisioning_key_payload::type set to 'type', indicating that it's + * only usable with fscrypt with the particular KDF version identified by + * 'type'. 
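[Editorial aside, not part of the patch] The new fscrypt_prepare_setflags() above is meant to be called from a filesystem's FS_IOC_SETFLAGS path before the flag change is committed, so that enabling FS_CASEFOLD_FL on an encrypted directory can derive the dirhash key (or fail with -ENOKEY, or -EINVAL for non-v2 policies). A sketch, where example_read_flags()/example_write_flags() are hypothetical filesystem-specific helpers:

static int example_ioc_setflags(struct inode *inode, unsigned int newflags)
{
	unsigned int oldflags = example_read_flags(inode);	/* hypothetical */
	int err;

	/* caller is assumed to hold i_rwsem for write, as required above */
	err = fscrypt_prepare_setflags(inode, oldflags, newflags);
	if (err)
		return err;

	example_write_flags(inode, newflags);			/* hypothetical */
	return 0;
}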
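[Editorial aside, not part of the patch] From userspace, the key_id path added to fscrypt_ioctl_add_key() lets the raw key be staged once as an "fscrypt-provisioning" keyring key via add_key(2) and then referenced by serial number when re-adding it to a filesystem. A rough sketch of the ioctl half, assuming the v5.6 <linux/fscrypt.h> UAPI definitions:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/fscrypt.h>

static int add_enc_key_by_id(int mnt_fd, int key_id,
			     const unsigned char identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
	struct fscrypt_add_key_arg arg;

	memset(&arg, 0, sizeof(arg));
	arg.key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
	memcpy(arg.key_spec.u.identifier, identifier,
	       FSCRYPT_KEY_IDENTIFIER_SIZE);
	arg.raw_size = 0;	/* must be 0 when key_id is set */
	arg.key_id = key_id;	/* serial of the "fscrypt-provisioning" key */

	/* mnt_fd: any open fd on the target filesystem */
	return ioctl(mnt_fd, FS_IOC_ADD_ENCRYPTION_KEY, &arg);
}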
We don't use the "logon" key type because there's no way to + * completely restrict the use of such keys; they can be used by any kernel API + * that accepts "logon" keys and doesn't require a specific service prefix. + * + * The ability to specify the key via Linux keyring key is intended for cases + * where userspace needs to re-add keys after the filesystem is unmounted and + * re-mounted. Most users should just provide the raw key directly instead. + */ +static int get_keyring_key(u32 key_id, u32 type, + struct fscrypt_master_key_secret *secret) +{ + key_ref_t ref; + struct key *key; + const struct fscrypt_provisioning_key_payload *payload; + int err; + + ref = lookup_user_key(key_id, 0, KEY_NEED_SEARCH); + if (IS_ERR(ref)) + return PTR_ERR(ref); + key = key_ref_to_ptr(ref); + + if (key->type != &key_type_fscrypt_provisioning) + goto bad_key; + payload = key->payload.data[0]; + + /* Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. */ + if (payload->type != type) + goto bad_key; + + secret->size = key->datalen - sizeof(*payload); + memcpy(secret->raw, payload->raw, secret->size); + err = 0; + goto out_put; + +bad_key: + err = -EKEYREJECTED; +out_put: + key_ref_put(ref); + return err; +} + /* * Add a master encryption key to the filesystem, causing all files which were * encrypted with it to appear "unlocked" (decrypted) when accessed. @@ -503,18 +606,25 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) if (!valid_key_spec(&arg.key_spec)) return -EINVAL; - if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || - arg.raw_size > FSCRYPT_MAX_KEY_SIZE) - return -EINVAL; - if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; memset(&secret, 0, sizeof(secret)); - secret.size = arg.raw_size; - err = -EFAULT; - if (copy_from_user(secret.raw, uarg->raw, secret.size)) - goto out_wipe_secret; + if (arg.key_id) { + if (arg.raw_size != 0) + return -EINVAL; + err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret); + if (err) + goto out_wipe_secret; + } else { + if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || + arg.raw_size > FSCRYPT_MAX_KEY_SIZE) + return -EINVAL; + secret.size = arg.raw_size; + err = -EFAULT; + if (copy_from_user(secret.raw, uarg->raw, secret.size)) + goto out_wipe_secret; + } switch (arg.key_spec.type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: @@ -666,9 +776,6 @@ static int check_for_busy_inodes(struct super_block *sb, struct list_head *pos; size_t busy_count = 0; unsigned long ino; - struct dentry *dentry; - char _path[256]; - char *path = NULL; spin_lock(&mk->mk_decrypted_inodes_lock); @@ -687,22 +794,14 @@ static int check_for_busy_inodes(struct super_block *sb, struct fscrypt_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; - dentry = d_find_alias(inode); } spin_unlock(&mk->mk_decrypted_inodes_lock); - if (dentry) { - path = dentry_path(dentry, _path, sizeof(_path)); - dput(dentry); - } - if (IS_ERR_OR_NULL(path)) - path = "(unknown)"; - fscrypt_warn(NULL, - "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)", + "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, - ino, path); + ino); return -EBUSY; } @@ -978,8 +1077,14 @@ int __init fscrypt_init_keyring(void) if (err) goto err_unregister_fscrypt; + err = register_key_type(&key_type_fscrypt_provisioning); + if (err) + goto err_unregister_fscrypt_user; + return 0; +err_unregister_fscrypt_user: + 
unregister_key_type(&key_type_fscrypt_user); err_unregister_fscrypt: unregister_key_type(&key_type_fscrypt); return err; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index f577bb6613f9..65cb09fa6ead 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -13,7 +13,7 @@ #include "fscrypt_private.h" -static struct fscrypt_mode available_modes[] = { +struct fscrypt_mode fscrypt_modes[] = { [FSCRYPT_MODE_AES_256_XTS] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", @@ -51,10 +51,10 @@ select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { if (S_ISREG(inode->i_mode)) - return &available_modes[fscrypt_policy_contents_mode(policy)]; + return &fscrypt_modes[fscrypt_policy_contents_mode(policy)]; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - return &available_modes[fscrypt_policy_fnames_mode(policy)]; + return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); @@ -89,8 +89,11 @@ struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, * first time a mode is used. */ pr_info("fscrypt: %s using implementation \"%s\"\n", - mode->friendly_name, - crypto_skcipher_alg(tfm)->base.cra_driver_name); + mode->friendly_name, crypto_skcipher_driver_name(tfm)); + } + if (WARN_ON(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { + err = -EINVAL; + goto err_free_tfm; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); @@ -104,12 +107,12 @@ err_free_tfm: return ERR_PTR(err); } -/* Given the per-file key, set up the file's crypto transform object */ -int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key) +/* Given a per-file encryption key, set up the file's crypto transform object */ +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key) { struct crypto_skcipher *tfm; - tfm = fscrypt_allocate_skcipher(ci->ci_mode, derived_key, ci->ci_inode); + tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); @@ -118,15 +121,15 @@ int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key) return 0; } -static int setup_per_mode_key(struct fscrypt_info *ci, - struct fscrypt_master_key *mk, - struct crypto_skcipher **tfms, - u8 hkdf_context, bool include_fs_uuid) +static int setup_per_mode_enc_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk, + struct crypto_skcipher **tfms, + u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; - u8 mode_num = mode - available_modes; + const u8 mode_num = mode - fscrypt_modes; struct crypto_skcipher *tfm, *prev_tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; @@ -171,29 +174,37 @@ done: return 0; } +int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk) +{ + int err; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, + (u8 *)&ci->ci_dirhash_key, + sizeof(ci->ci_dirhash_key)); + if (err) + return err; + ci->ci_dirhash_key_initialized = true; + return 0; +} + static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { - u8 
derived_key[FSCRYPT_MAX_KEY_SIZE]; int err; if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* - * DIRECT_KEY: instead of deriving per-file keys, the per-file - * nonce will be included in all the IVs. But unlike v1 - * policies, for v2 policies in this case we don't encrypt with - * the master key directly but rather derive a per-mode key. - * This ensures that the master key is consistently used only - * for HKDF, avoiding key reuse issues. + * DIRECT_KEY: instead of deriving per-file encryption keys, the + * per-file nonce will be included in all the IVs. But unlike + * v1 policies, for v2 policies in this case we don't encrypt + * with the master key directly but rather derive a per-mode + * encryption key. This ensures that the master key is + * consistently used only for HKDF, avoiding key reuse issues. */ - if (!fscrypt_mode_supports_direct_key(ci->ci_mode)) { - fscrypt_warn(ci->ci_inode, - "Direct key flag not allowed with %s", - ci->ci_mode->friendly_name); - return -EINVAL; - } - return setup_per_mode_key(ci, mk, mk->mk_direct_tfms, - HKDF_CONTEXT_DIRECT_KEY, false); + err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms, + HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { /* @@ -202,21 +213,34 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * the IVs. This format is optimized for use with inline * encryption hardware compliant with the UFS or eMMC standards. */ - return setup_per_mode_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, - HKDF_CONTEXT_IV_INO_LBLK_64_KEY, - true); + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + HKDF_CONTEXT_IV_INO_LBLK_64_KEY, + true); + } else { + u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_ENC_KEY, + ci->ci_nonce, + FS_KEY_DERIVATION_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); + if (err) + return err; + + err = fscrypt_set_per_file_enc_key(ci, derived_key); + memzero_explicit(derived_key, ci->ci_mode->keysize); } - - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_PER_FILE_KEY, - ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, - derived_key, ci->ci_mode->keysize); if (err) return err; - err = fscrypt_set_derived_key(ci, derived_key); - memzero_explicit(derived_key, ci->ci_mode->keysize); - return err; + /* Derive a secret dirhash key for directories that need it. 
*/ + if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) { + err = fscrypt_derive_dirhash_key(ci, mk); + if (err) + return err; + } + + return 0; } /* diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 5298ef22aa85..801b48c0cd7f 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -9,7 +9,7 @@ * This file implements compatibility functions for the original encryption * policy version ("v1"), including: * - * - Deriving per-file keys using the AES-128-ECB based KDF + * - Deriving per-file encryption keys using the AES-128-ECB based KDF * (rather than the new method of using HKDF-SHA512) * * - Retrieving fscrypt master keys from process-subscribed keyrings @@ -253,23 +253,8 @@ err_free_dk: static int setup_v1_file_key_direct(struct fscrypt_info *ci, const u8 *raw_master_key) { - const struct fscrypt_mode *mode = ci->ci_mode; struct fscrypt_direct_key *dk; - if (!fscrypt_mode_supports_direct_key(mode)) { - fscrypt_warn(ci->ci_inode, - "Direct key mode not allowed with %s", - mode->friendly_name); - return -EINVAL; - } - - if (ci->ci_policy.v1.contents_encryption_mode != - ci->ci_policy.v1.filenames_encryption_mode) { - fscrypt_warn(ci->ci_inode, - "Direct key mode not allowed with different contents and filenames modes"); - return -EINVAL; - } - dk = fscrypt_get_direct_key(ci, raw_master_key); if (IS_ERR(dk)) return PTR_ERR(dk); @@ -298,7 +283,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci, if (err) goto out; - err = fscrypt_set_derived_key(ci, derived_key); + err = fscrypt_set_per_file_enc_key(ci, derived_key); out: kzfree(derived_key); return err; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 96f528071bed..cf2a9d26ef7d 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -29,6 +29,43 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) +{ + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_CTS) + return true; + + if (contents_mode == FSCRYPT_MODE_AES_128_CBC && + filenames_mode == FSCRYPT_MODE_AES_128_CTS) + return true; + + if (contents_mode == FSCRYPT_MODE_ADIANTUM && + filenames_mode == FSCRYPT_MODE_ADIANTUM) + return true; + + return false; +} + +static bool supported_direct_key_modes(const struct inode *inode, + u32 contents_mode, u32 filenames_mode) +{ + const struct fscrypt_mode *mode; + + if (contents_mode != filenames_mode) { + fscrypt_warn(inode, + "Direct key flag not allowed with different contents and filenames modes"); + return false; + } + mode = &fscrypt_modes[contents_mode]; + + if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) { + fscrypt_warn(inode, "Direct key flag not allowed with %s", + mode->friendly_name); + return false; + } + return true; +} + static bool supported_iv_ino_lblk_64_policy( const struct fscrypt_policy_v2 *policy, const struct inode *inode) @@ -63,13 +100,82 @@ static bool supported_iv_ino_lblk_64_policy( return true; } +static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, + const struct inode *inode) +{ + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + fscrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | 
+ FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { + fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && + !supported_direct_key_modes(inode, policy->contents_encryption_mode, + policy->filenames_encryption_mode)) + return false; + + if (IS_CASEFOLDED(inode)) { + /* With v1, there's no way to derive dirhash keys. */ + fscrypt_warn(inode, + "v1 policies can't be used on casefolded directories"); + return false; + } + + return true; +} + +static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, + const struct inode *inode) +{ + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + fscrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && + !supported_direct_key_modes(inode, policy->contents_encryption_mode, + policy->filenames_encryption_mode)) + return false; + + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && + !supported_iv_ino_lblk_64_policy(policy, inode)) + return false; + + if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { + fscrypt_warn(inode, "Reserved bits set in encryption policy"); + return false; + } + + return true; +} + /** * fscrypt_supported_policy - check whether an encryption policy is supported * * Given an encryption policy, check whether all its encryption modes and other - * settings are supported by this kernel. (But we don't currently don't check - * for crypto API support here, so attempting to use an algorithm not configured - * into the crypto API will still fail later.) + * settings are supported by this kernel on the given inode. (But we don't + * currently don't check for crypto API support here, so attempting to use an + * algorithm not configured into the crypto API will still fail later.) 
* * Return: %true if supported, else %false */ @@ -77,60 +183,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode) { switch (policy_u->version) { - case FSCRYPT_POLICY_V1: { - const struct fscrypt_policy_v1 *policy = &policy_u->v1; - - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, - policy->filenames_encryption_mode)) { - fscrypt_warn(inode, - "Unsupported encryption modes (contents %d, filenames %d)", - policy->contents_encryption_mode, - policy->filenames_encryption_mode); - return false; - } - - if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | - FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { - fscrypt_warn(inode, - "Unsupported encryption flags (0x%02x)", - policy->flags); - return false; - } - - return true; - } - case FSCRYPT_POLICY_V2: { - const struct fscrypt_policy_v2 *policy = &policy_u->v2; - - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, - policy->filenames_encryption_mode)) { - fscrypt_warn(inode, - "Unsupported encryption modes (contents %d, filenames %d)", - policy->contents_encryption_mode, - policy->filenames_encryption_mode); - return false; - } - - if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { - fscrypt_warn(inode, - "Unsupported encryption flags (0x%02x)", - policy->flags); - return false; - } - - if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_64_policy(policy, inode)) - return false; - - if (memchr_inv(policy->__reserved, 0, - sizeof(policy->__reserved))) { - fscrypt_warn(inode, - "Reserved bits set in encryption policy"); - return false; - } - - return true; - } + case FSCRYPT_POLICY_V1: + return fscrypt_supported_v1_policy(&policy_u->v1, inode); + case FSCRYPT_POLICY_V2: + return fscrypt_supported_v2_policy(&policy_u->v2, inode); } return false; } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index dede25247b81..634b09d18b77 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -142,18 +142,21 @@ EXPORT_SYMBOL_GPL(debugfs_file_put); * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ -static bool debugfs_is_locked_down(struct inode *inode, - struct file *filp, - const struct file_operations *real_fops) +static int debugfs_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) { if ((inode->i_mode & 07777) == 0444 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap) - return false; + return 0; - return security_locked_down(LOCKDOWN_DEBUGFS); + if (security_locked_down(LOCKDOWN_DEBUGFS)) + return -EPERM; + + return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) @@ -168,7 +171,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) real_fops = debugfs_real_fops(filp); - r = debugfs_is_locked_down(inode, filp, real_fops); + r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; @@ -298,7 +301,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) real_fops = debugfs_real_fops(filp); - r = debugfs_is_locked_down(inode, filp, real_fops); + r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; @@ -496,10 +499,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. 
This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_u32(const char *name, umode_t mode, @@ -581,10 +584,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_ulong(const char *name, umode_t mode, @@ -846,10 +849,10 @@ static const struct file_operations fops_bool_wo = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_bool(const char *name, umode_t mode, @@ -899,10 +902,10 @@ static const struct file_operations fops_blob = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, @@ -1091,10 +1094,10 @@ static const struct file_operations fops_regset32 = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. 
*/ struct dentry *debugfs_create_regset32(const char *name, umode_t mode, @@ -1158,4 +1161,3 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); - diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f4d8df5e4714..dc6cffc4feba 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -423,7 +423,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -502,7 +502,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -627,7 +627,7 @@ EXPORT_SYMBOL(debugfs_create_automount); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is - * unloaded, you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) + * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -906,4 +906,3 @@ static int __init debugfs_init(void) return retval; } core_initcall(debugfs_init); - diff --git a/fs/direct-io.c b/fs/direct-io.c index 0ec4f270139f..00b4d15bb811 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,6 +39,8 @@ #include <linux/atomic.h> #include <linux/prefetch.h> +#include "internal.h" + /* * How many user pages to map in one call to get_user_pages(). 
This determines * the size of a structure in the slab cache diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3951d39b9b75..cdfaf4f0e11a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1035,7 +1035,7 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; - struct timeval tv = { .tv_sec = 5, .tv_usec = 0 }; + struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -1087,12 +1087,12 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv, + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, sizeof(tv)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv, + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, sizeof(tv)); if (result == -EINPROGRESS) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d31b6c72b476..dc1a1d5d825b 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -35,11 +35,11 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); - cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f91db24bbf3b..db1ef144c63a 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1586,7 +1586,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm, } crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); if (*key_size == 0) - *key_size = crypto_skcipher_default_keysize(*key_tfm); + *key_size = crypto_skcipher_max_keysize(*key_tfm); get_random_bytes(dummy_key, *key_size); rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size); if (rc) { diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 216fbe6a4837..7d326aa0308e 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -2204,9 +2204,9 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (mount_crypt_stat->global_default_cipher_key_size == 0) { printk(KERN_WARNING "No key size specified at mount; " "defaulting to [%d]\n", - crypto_skcipher_default_keysize(tfm)); + crypto_skcipher_max_keysize(tfm)); mount_crypt_stat->global_default_cipher_key_size = - crypto_skcipher_default_keysize(tfm); + crypto_skcipher_max_keysize(tfm); } if (crypt_stat->key_size == 0) crypt_stat->key_size = diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2890a67a1ded..5779a15c2cd6 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -306,24 +306,22 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, } src = kmap_atomic(*rq->in); - if (!rq->out[0]) { - dst = NULL; - } else { + if (rq->out[0]) { dst = kmap_atomic(rq->out[0]); memcpy(dst + rq->pageofs_out, src, righthalf); + kunmap_atomic(dst); } - if (rq->out[1] == *rq->in) { - memmove(src, src + righthalf, rq->pageofs_out); - } else if (nrpages_out == 2) { - if (dst) - kunmap_atomic(dst); + if (nrpages_out == 2) { DBG_BUGON(!rq->out[1]); - dst = kmap_atomic(rq->out[1]); - memcpy(dst, src + 
righthalf, rq->pageofs_out); + if (rq->out[1] == *rq->in) { + memmove(src, src + righthalf, rq->pageofs_out); + } else { + dst = kmap_atomic(rq->out[1]); + memcpy(dst, src + righthalf, rq->pageofs_out); + kunmap_atomic(dst); + } } - if (dst) - kunmap_atomic(dst); kunmap_atomic(src); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1ed5beff7d11..c4c6dcdc89ad 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -401,9 +401,9 @@ static inline void *erofs_get_pcpubuf(unsigned int pagenr) #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index, bool *tag); + pgoff_t index); int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp, bool tag); + struct erofs_workgroup *grp); void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); void erofs_shrinker_register(struct super_block *sb); void erofs_shrinker_unregister(struct super_block *sb); diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 1e8e1450d5b0..fddc5059c930 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -59,7 +59,7 @@ repeat: } struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index, bool *tag) + pgoff_t index) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_workgroup *grp; @@ -68,9 +68,6 @@ repeat: rcu_read_lock(); grp = radix_tree_lookup(&sbi->workstn_tree, index); if (grp) { - *tag = xa_pointer_tag(grp); - grp = xa_untag_pointer(grp); - if (erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ rcu_read_unlock(); @@ -84,8 +81,7 @@ repeat: } int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp, - bool tag) + struct erofs_workgroup *grp) { struct erofs_sb_info *sbi; int err; @@ -103,8 +99,6 @@ int erofs_register_workgroup(struct super_block *sb, sbi = EROFS_SB(sb); xa_lock(&sbi->workstn_tree); - grp = xa_tag_pointer(grp, tag); - /* * Bump up reference count before making this workgroup * visible to other users in order to avoid potential UAF @@ -175,8 +169,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. 
*/ - DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree, - grp->index)) != grp); + DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp); /* * If managed cache is on, last refcount should indicate @@ -201,7 +194,7 @@ repeat: batch, first_index, PAGEVEC_SIZE); for (i = 0; i < found; ++i) { - struct erofs_workgroup *grp = xa_untag_pointer(batch[i]); + struct erofs_workgroup *grp = batch[i]; first_index = grp->index + 1; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a13a78725c57..b766c3ee5fa8 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -649,6 +649,8 @@ ssize_t erofs_listxattr(struct dentry *dentry, struct listxattr_iter it; ret = init_inode_xattrs(d_inode(dentry)); + if (ret == -ENOATTR) + return 0; if (ret) return ret; diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 3585b84d2f20..50966f1c676e 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -46,18 +46,19 @@ extern const struct xattr_handler erofs_xattr_security_handler; static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) { -static const struct xattr_handler *xattr_handler_map[] = { - [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, + static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL - [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = + &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, #endif - [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY - [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, #endif -}; + }; return idx && idx < ARRAY_SIZE(xattr_handler_map) ? xattr_handler_map[idx] : NULL; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ca99425a4536..80e47f07d946 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -345,9 +345,8 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; unsigned int length; - bool tag; - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (!grp) return -ENOENT; @@ -438,7 +437,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, */ mutex_trylock(&cl->lock); - err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0); + err = erofs_register_workgroup(inode->i_sb, &pcl->obj); if (err) { mutex_unlock(&cl->lock); kmem_cache_free(pcluster_cachep, pcl); @@ -1149,21 +1148,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static bool postsubmit_is_all_bypassed(struct z_erofs_decompressqueue *q[], - unsigned int nr_bios, bool force_fg) -{ - /* - * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. 
- */ - if (force_fg || nr_bios) - return false; - - kvfree(q[JQ_SUBMIT]); - return true; -} - -static bool z_erofs_submit_queue(struct super_block *sb, +static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t owned_head, struct list_head *pagepool, struct z_erofs_decompressqueue *fgq, @@ -1172,19 +1157,12 @@ static bool z_erofs_submit_queue(struct super_block *sb, struct erofs_sb_info *const sbi = EROFS_SB(sb); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - struct bio *bio; void *bi_private; /* since bio will be NULL, no need to initialize last_index */ pgoff_t uninitialized_var(last_index); - bool force_submit = false; - unsigned int nr_bios; - - if (owned_head == Z_EROFS_PCLUSTER_TAIL) - return false; + unsigned int nr_bios = 0; + struct bio *bio = NULL; - force_submit = false; - bio = NULL; - nr_bios = 0; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; @@ -1194,11 +1172,9 @@ static bool z_erofs_submit_queue(struct super_block *sb, do { struct z_erofs_pcluster *pcl; - unsigned int clusterpages; - pgoff_t first_index; - struct page *page; - unsigned int i = 0, bypass = 0; - int err; + pgoff_t cur, end; + unsigned int i = 0; + bool bypass = true; /* no possible 'owned_head' equals the following */ DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); @@ -1206,55 +1182,50 @@ static bool z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - clusterpages = BIT(pcl->clusterbits); + cur = pcl->obj.index; + end = cur + BIT(pcl->clusterbits); /* close the main owned chain at first */ owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, Z_EROFS_PCLUSTER_TAIL_CLOSED); - first_index = pcl->obj.index; - force_submit |= (first_index != last_index + 1); + do { + struct page *page; + int err; -repeat: - page = pickup_page_for_submission(pcl, i, pagepool, - MNGD_MAPPING(sbi), - GFP_NOFS); - if (!page) { - force_submit = true; - ++bypass; - goto skippage; - } + page = pickup_page_for_submission(pcl, i++, pagepool, + MNGD_MAPPING(sbi), + GFP_NOFS); + if (!page) + continue; - if (bio && force_submit) { + if (bio && cur != last_index + 1) { submit_bio_retry: - submit_bio(bio); - bio = NULL; - } - - if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + submit_bio(bio); + bio = NULL; + } - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); - bio->bi_iter.bi_sector = (sector_t)(first_index + i) << - LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; - bio->bi_opf = REQ_OP_READ; + if (!bio) { + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - ++nr_bios; - } + bio->bi_end_io = z_erofs_decompressqueue_endio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = (sector_t)cur << + LOG_SECTORS_PER_BLOCK; + bio->bi_private = bi_private; + bio->bi_opf = REQ_OP_READ; + ++nr_bios; + } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) - goto submit_bio_retry; + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (err < PAGE_SIZE) + goto submit_bio_retry; - force_submit = false; - last_index = first_index + i; -skippage: - if (++i < clusterpages) - goto repeat; + last_index = cur; + bypass = false; + } while (++cur < end); - if (bypass < clusterpages) + if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else move_to_bypass_jobqueue(pcl, qtail, owned_head); @@ -1263,11 +1234,15 @@ skippage: if (bio) submit_bio(bio); - if (postsubmit_is_all_bypassed(q, nr_bios, 
*force_fg)) - return true; - + /* + * although background is preferred, no one is pending for submission. + * don't issue workqueue for decompression but drop it directly instead. + */ + if (!*force_fg && !nr_bios) { + kvfree(q[JQ_SUBMIT]); + return; + } z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); - return true; } static void z_erofs_runqueue(struct super_block *sb, @@ -1276,9 +1251,9 @@ static void z_erofs_runqueue(struct super_block *sb, { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (!z_erofs_submit_queue(sb, clt->owned_head, - pagepool, io, &force_fg)) + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) return; + z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 67a395039268..b041b66002db 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) return container_of(p, struct ep_pqueue, pt)->epi; } -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ -static inline int ep_op_has_event(int op) -{ - return op != EPOLL_CTL_DEL; -} - /* Initialize the poll safe wake up structure */ static void ep_nested_calls_init(struct nested_calls *ncalls) { @@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size) return do_epoll_create(0); } -/* - * The following function implements the controller interface for - * the eventpoll file that enables the insertion/removal/change of - * file descriptors inside the interest set. - */ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +static inline int epoll_mutex_lock(struct mutex *mutex, int depth, + bool nonblock) +{ + if (!nonblock) { + mutex_lock_nested(mutex, depth); + return 0; + } + if (mutex_trylock(mutex)) + return 0; + return -EAGAIN; +} + +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; struct eventpoll *tep = NULL; - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; - error = -EBADF; f = fdget(epfd); if (!f.file) @@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) - ep_take_care_of_epollwakeup(&epds); + ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor @@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ - if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) { + if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || - (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } @@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. 
*/ - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) + goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || is_file_epoll(tf.file)) { - full_check = 1; mutex_unlock(&ep->mtx); - mutex_lock(&epmutex); + error = epoll_mutex_lock(&epmutex, 0, nonblock); + if (error) + goto error_tgt_fput; + full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; if (ep_loop_check(ep, tf.file) != 0) { @@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else list_add(&tf.file->f_tfile_llink, &tfile_check_list); - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) { +out_del: + list_del(&tf.file->f_tfile_llink); + goto error_tgt_fput; + } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); + error = epoll_mutex_lock(&tep->mtx, 1, nonblock); + if (error) { + mutex_unlock(&ep->mtx); + goto out_del; + } } } } @@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, &epds, tf.file, fd, full_check); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; if (full_check) @@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_modify(ep, epi, epds); } } else error = -ENOENT; @@ -2232,6 +2240,23 @@ error_return: } /* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + struct epoll_event epds; + + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + return -EFAULT; + + return do_epoll_ctl(epfd, op, fd, &epds, false); +} + +/* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ diff --git a/fs/exec.c b/fs/exec.c index 74d88dab98dd..db17be51b112 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -272,7 +272,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) goto err; mm->stack_vm = mm->total_vm = 1; - arch_bprm_mm_init(mm, vma); up_write(&mm->mmap_sem); bprm->p = vma->vm_end - sizeof(void *); return 0; @@ -761,6 +760,11 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; BUG_ON(prev != vma); + if (unlikely(vm_flags & VM_EXEC)) { + pr_warn_once("process '%pD4' started with executable stack\n", + bprm->file); + } + /* Move stack pages down in memory. 
*/ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index bcffe25da2f0..4a4ab683250d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1073,9 +1073,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext2; - sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) - 1) - / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT2_BLOCKS_PER_GROUP(sb)) + 1; db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array (db_count, @@ -1138,6 +1138,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_count_dirs(sb), GFP_KERNEL); } if (err) { + ret = err; ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index ef42ab040905..2a592e38cdfe 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -4,12 +4,7 @@ # kernels after the removal of ext3 driver. config EXT3_FS tristate "The Extended 3 (ext3) filesystem" - # These must match EXT4_FS selects... select EXT4_FS - select JBD2 - select CRC16 - select CRYPTO - select CRYPTO_CRC32C help This config option is here only for backward compatibility. ext3 filesystem is now handled by the ext4 driver. @@ -33,12 +28,12 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" - # Please update EXT3_FS selects when changing these select JBD2 select CRC16 select CRYPTO select CRYPTO_CRC32C select FS_IOMAP + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION help This is the next generation of the ext3 filesystem. 
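For reference, the fs/ext2/super.c hunk earlier in this diff adds "ret = err;" before the jump to failed_mount3, so a failed percpu-counter allocation is reported to the mount caller instead of the stale default error code. A minimal compile-only sketch of the pattern, using hypothetical helper names rather than the real ext2 code:

#include <errno.h>

/*
 * Illustrative sketch only -- alloc_counters()/release_counters() are
 * hypothetical stand-ins, not ext2 functions.  The point of the fix:
 * an error path that jumps to a shared cleanup label must copy the
 * local error code into the variable the label returns, otherwise the
 * caller sees whatever stale value was left there.
 */
static int alloc_counters(void) { return -ENOMEM; }	/* pretend failure */
static void release_counters(void) { }

static int fill_super_sketch(void)
{
	int ret = -EINVAL;	/* stale default */
	int err;

	err = alloc_counters();
	if (err) {
		ret = err;	/* propagate -ENOMEM rather than -EINVAL */
		goto failed;
	}
	return 0;

failed:
	release_counters();
	return ret;
}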
@@ -108,7 +103,7 @@ config EXT4_DEBUG echo 1 > /sys/module/ext4/parameters/mballoc_debug config EXT4_KUNIT_TESTS - bool "KUnit tests for ext4" + tristate "KUnit tests for ext4" select EXT4_FS depends on KUNIT help diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 840b91d040f1..4ccb3c9189d8 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -13,5 +13,6 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-$(CONFIG_EXT4_KUNIT_TESTS) += inode-test.o +ext4-inode-test-objs += inode-test.o +obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o ext4-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0b202e00d93f..5f993a411251 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh))) { + desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, @@ -505,7 +506,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index d4d4fdfac1a6..1ee04e76bbe0 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -133,10 +133,13 @@ static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; + struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks->root); + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? 
"" : ", ", @@ -144,6 +147,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) first = 0; node = rb_next(node); } + rcu_read_unlock(); printk(KERN_CONT "\n"); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9fdd2b269d61..1f340743c9a8 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -72,6 +72,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); + const int next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; @@ -79,8 +80,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - buf) + rlen > size)) + else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && + next_offset != size)) + error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -116,7 +120,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (IS_ENCRYPTED(inode)) { err = fscrypt_get_encryption_info(inode); - if (err && err != -ENOKEY) + if (err) return err; } @@ -458,7 +462,6 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); - new_fn->name[ent_name->len] = 0; while (*p) { parent = *p; @@ -668,9 +671,11 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; - struct inode *inode = dentry->d_parent->d_inode; + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); - if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { + if (!inode || !IS_CASEFOLDED(inode) || + !EXT4_SB(inode->i_sb)->s_encoding) { if (len != name->len) return -1; return memcmp(str, name->name, len); @@ -683,10 +688,11 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) { const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; + const struct inode *inode = READ_ONCE(dentry->d_inode); unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode) || !um) + if (!inode || !IS_CASEFOLDED(inode) || !um) return 0; norm = kmalloc(PATH_MAX, GFP_ATOMIC); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f8578caba40d..9a2ee2428ecc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1052,8 +1052,6 @@ struct ext4_inode_info { /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; - ext4_lblk_t i_da_metadata_calc_last_lblock; - int i_da_metadata_calc_len; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; @@ -1343,7 +1341,8 @@ struct ext4_super_block { __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; - __u8 s_pad[2]; + __u8 s_first_error_errcode; + __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename 
charset encoding flags */ __le32 s_reserved[95]; /* Padding to the end of the block */ @@ -1556,6 +1555,9 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; struct dax_device *s_daxdev; +#ifdef CONFIG_EXT4_DEBUG + unsigned long s_simulate_fail; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1575,6 +1577,66 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) } /* + * Simulate_fail codes + */ +#define EXT4_SIM_BBITMAP_EIO 1 +#define EXT4_SIM_BBITMAP_CRC 2 +#define EXT4_SIM_IBITMAP_EIO 3 +#define EXT4_SIM_IBITMAP_CRC 4 +#define EXT4_SIM_INODE_EIO 5 +#define EXT4_SIM_INODE_CRC 6 +#define EXT4_SIM_DIRBLOCK_EIO 7 +#define EXT4_SIM_DIRBLOCK_CRC 8 + +static inline bool ext4_simulate_fail(struct super_block *sb, + unsigned long code) +{ +#ifdef CONFIG_EXT4_DEBUG + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (unlikely(sbi->s_simulate_fail == code)) { + sbi->s_simulate_fail = 0; + return true; + } +#endif + return false; +} + +static inline void ext4_simulate_fail_bh(struct super_block *sb, + struct buffer_head *bh, + unsigned long code) +{ + if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) + clear_buffer_uptodate(bh); +} + +/* + * Error number codes for s_{first,last}_error_errno + * + * Linux errno numbers are architecture specific, so we need to translate + * them into something which is architecture independent. We don't define + * codes for all errno's; just the ones which are most likely to be the cause + * of an ext4_error() call. + */ +#define EXT4_ERR_UNKNOWN 1 +#define EXT4_ERR_EIO 2 +#define EXT4_ERR_ENOMEM 3 +#define EXT4_ERR_EFSBADCRC 4 +#define EXT4_ERR_EFSCORRUPTED 5 +#define EXT4_ERR_ENOSPC 6 +#define EXT4_ERR_ENOKEY 7 +#define EXT4_ERR_EROFS 8 +#define EXT4_ERR_EFBIG 9 +#define EXT4_ERR_EEXIST 10 +#define EXT4_ERR_ERANGE 11 +#define EXT4_ERR_EOVERFLOW 12 +#define EXT4_ERR_EBUSY 13 +#define EXT4_ERR_ENOTDIR 14 +#define EXT4_ERR_ENOTEMPTY 15 +#define EXT4_ERR_ESHUTDOWN 16 +#define EXT4_ERR_EFAULT 17 + +/* * Inode dynamic state flags */ enum { @@ -2628,7 +2690,6 @@ extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, @@ -2679,8 +2740,6 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); -extern void *ext4_kvmalloc(size_t size, gfp_t flags); -extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, @@ -2688,6 +2747,7 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern void ext4_set_errno(struct super_block *sb, int err); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, 
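The EXT4_ERR_* values defined in the ext4.h hunk above exist because Linux errno numbers differ between architectures, while the new s_first_error_errcode/s_last_error_errcode superblock fields (which replace s_pad[2]) must be stable on disk. The translation is done by ext4_set_errno(), whose body is not visible in this excerpt; the hunks here only show callers such as ext4_set_errno(sb, EIO) immediately before ext4_error(). A rough compile-only sketch of the kind of mapping implied -- an illustrative switch, not the kernel's actual table:

#include <errno.h>

/* Subset of the codes repeated from the ext4.h hunk above. */
#define EXT4_ERR_UNKNOWN	1
#define EXT4_ERR_EIO		2
#define EXT4_ERR_ENOMEM		3
#define EXT4_ERR_ENOSPC		6
#define EXT4_ERR_ENOKEY		7

/*
 * Illustrative only: fold a Linux errno into an architecture-independent
 * EXT4_ERR_* code suitable for storing in the on-disk superblock.
 */
static unsigned char errno_to_ext4_err(int err)
{
	switch (err) {
	case EIO:	return EXT4_ERR_EIO;
	case ENOMEM:	return EXT4_ERR_ENOMEM;
	case ENOSPC:	return EXT4_ERR_ENOSPC;
	case ENOKEY:	return EXT4_ERR_ENOKEY;
	default:	return EXT4_ERR_UNKNOWN;
	}
}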
@@ -3254,7 +3314,6 @@ struct ext4_extent; #define EXT_MAX_BLOCKS 0xffffffff extern int ext4_ext_tree_init(handle_t *handle, struct inode *); -extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3271,14 +3330,9 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path **, struct ext4_extent *, int); @@ -3294,8 +3348,6 @@ extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); -extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); -extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, @@ -3390,6 +3442,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 98bd0e9ee7df..1c216fcc202a 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -267,10 +267,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path); - #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d3b8cdea5df7..1f53d64e42a5 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -7,6 +7,28 @@ #include <trace/events/ext4.h> +int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + BUG(); +} + /* Just increment the 
non-pointer handle value */ static handle_t *ext4_get_nojournal(void) { @@ -58,6 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb) * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { + ext4_set_errno(sb, -journal->j_errno); ext4_abort(sb, "Detected aborted journal"); return -EROFS; } @@ -249,6 +272,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_set_errno(inode->i_sb, -err); __ext4_abort(inode->i_sb, where, line, "error %d when attempting revoke", err); } @@ -320,6 +344,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_block = cpu_to_le64(bh->b_blocknr); + ext4_set_errno(inode->i_sb, EIO); ext4_error_inode(inode, where, line, bh->b_blocknr, "IO error syncing itable block"); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a6b9b66dbfad..7ea4f6fa173b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -463,27 +463,7 @@ int ext4_force_commit(struct super_block *sb); #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ -static inline int ext4_inode_journal_mode(struct inode *inode) -{ - if (EXT4_JOURNAL(inode) == NULL) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - /* We do not support data journalling with delayed allocation */ - if (!S_ISREG(inode->i_mode) || - ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { - /* We do not support data journalling for encrypted data */ - if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - } - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - BUG(); -} +int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0e8708b77da6..954013d6076b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -161,8 +161,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; @@ -179,6 +180,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, return err; } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) + static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) @@ -309,53 +313,6 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, (nofail ? 
EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); } -/* - * Calculate the number of metadata blocks needed - * to allocate @blocks - * Worse case is one block per extent - */ -int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - int idxs; - - idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) - / sizeof(struct ext4_extent_idx)); - - /* - * If the new delayed allocation block is contiguous with the - * previous da block, it can share index blocks with the - * previous block, so we only need to allocate a new index - * block every idxs leaf blocks. At ldxs**2 blocks, we need - * an additional index block, and at ldxs**3 blocks, yet - * another index blocks. - */ - if (ei->i_da_metadata_calc_len && - ei->i_da_metadata_calc_last_lblock+1 == lblock) { - int num = 0; - - if ((ei->i_da_metadata_calc_len % idxs) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { - num++; - ei->i_da_metadata_calc_len = 0; - } else - ei->i_da_metadata_calc_len++; - ei->i_da_metadata_calc_last_lblock++; - return num; - } - - /* - * In the worst case we need a new set of index blocks at - * every level of the inode's extent tree. - */ - ei->i_da_metadata_calc_len = 1; - ei->i_da_metadata_calc_last_lblock = lblock; - return ext_depth(inode) + 1; -} - static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -492,6 +449,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, return 0; corrupted: + ext4_set_errno(inode->i_sb, -err); ext4_error_inode(inode, function, line, 0, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", @@ -510,6 +468,30 @@ int ext4_ext_check_inode(struct inode *inode) return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } +static void ext4_cache_extents(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } +} + static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, ext4_fsblk_t pblk, int depth, @@ -544,26 +526,7 @@ __read_extent_tree_block(const char *function, unsigned int line, */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); - struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); - ext4_lblk_t prev = 0; - int i; - - for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { - unsigned int status = EXTENT_STATUS_WRITTEN; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - int len = ext4_ext_get_actual_len(ex); - - if (prev && (prev != lblk)) - ext4_es_cache_extent(inode, prev, - lblk - prev, ~0, - EXTENT_STATUS_HOLE); - - if (ext4_ext_is_unwritten(ex)) - status = EXTENT_STATUS_UNWRITTEN; - ext4_es_cache_extent(inode, lblk, len, - ext4_ext_pblock(ex), status); - prev = lblk + len; - } + ext4_cache_extents(inode, eh); } return bh; errout: @@ -649,8 +612,9 @@ static void ext4_ext_show_path(struct 
inode *inode, struct ext4_ext_path *path) ext_debug("path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { - ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - ext4_idx_pblock(path->p_idx)); + ext_debug(" %d->%llu", + le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), @@ -731,11 +695,12 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path) if (!path) return; depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) + for (i = 0; i <= depth; i++, path++) { if (path->p_bh) { brelse(path->p_bh); path->p_bh = NULL; } + } } /* @@ -777,8 +742,8 @@ ext4_ext_binsearch_idx(struct inode *inode, chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { - if (k != 0 && - le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + if (k != 0 && le32_to_cpu(ix->ei_block) <= + le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); @@ -911,6 +876,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, path[0].p_bh = NULL; i = depth; + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) + ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug("depth %d: num %d, max %d\n", @@ -1632,17 +1599,16 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) return EXT_MAX_BLOCKS; while (depth >= 0) { + struct ext4_ext_path *p = &path[depth]; + if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext && - path[depth].p_ext != - EXT_LAST_EXTENT(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_ext[1].ee_block); + if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) + return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ - if (path[depth].p_idx != - EXT_LAST_INDEX(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_idx[1].ei_block); + if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) + return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } @@ -1742,9 +1708,9 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -int -ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, - struct ext4_extent *ex2) +static int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; @@ -1758,11 +1724,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, le32_to_cpu(ex2->ee_block)) return 0; - /* - * To allow future support for preallocated extents to be added - * as an RO_COMPAT feature, refuse to merge to extents if - * this can result in the top bit of ee_len being set. - */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; @@ -1870,13 +1831,14 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, } /* - * This function tries to merge the @ex extent to neighbours in the tree. - * return 1 if merge left else 0. + * This function tries to merge the @ex extent to neighbours in the tree, then + * tries to collapse the extent tree into the inode. 
*/ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex) { + struct ext4_extent *ex) +{ struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; @@ -3718,9 +3680,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); - if (IS_ENCRYPTED(inode)) - max_zeroout = 0; - /* * five cases: * 1. split the extent into three extents. @@ -4706,6 +4665,10 @@ retry: return ret > 0 ? ret2 : ret; } +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); + +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); + static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { @@ -4723,9 +4686,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - /* Call ext4_force_commit to flush all data in case of data=journal. */ if (ext4_should_journal_data(inode)) { ret = ext4_force_commit(inode->i_sb); @@ -4765,7 +4725,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -4849,7 +4809,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. */ - if ((offset + len) > i_size_read(inode)) + if (offset + len > inode->i_size) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); @@ -4890,14 +4850,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). - * - * XXX It's not clear why zero range isn't working, but we'll - * leave it disabled for encrypted inodes for now. This is a - * bug we should fix.... */ if (IS_ENCRYPTED(inode) && - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | - FALLOC_FL_ZERO_RANGE))) + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ @@ -4941,7 +4896,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -5268,7 +5223,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; - bool update = 0; + bool update = false; depth = path->p_depth; while (depth >= 0) { @@ -5284,7 +5239,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, goto out; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) - update = 1; + update = true; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { @@ -5472,7 +5427,7 @@ out: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. 
*/ -int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; @@ -5489,12 +5444,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) - return -EINVAL; - - if (!S_ISREG(inode->i_mode)) + /* Collapse range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_collapse_range(inode, offset, len); @@ -5514,7 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= i_size_read(inode)) { + if (offset + len >= inode->i_size) { ret = -EINVAL; goto out_mutex; } @@ -5592,7 +5543,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_stop; } - new_size = i_size_read(inode) - len; + new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; @@ -5620,7 +5571,7 @@ out_mutex: * by len bytes. * Returns 0 on success, error otherwise. */ -int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; handle_t *handle; @@ -5639,14 +5590,10 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) + /* Insert range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -5666,14 +5613,14 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - /* Check for wrap through zero */ - if (inode->i_size + len > inode->i_sb->s_maxbytes) { + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) { ret = -EFBIG; goto out_mutex; } - /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { + /* Offset must be less than i_size */ + if (offset >= inode->i_size) { ret = -EINVAL; goto out_mutex; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 825313c59752..4ec30a798260 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -209,6 +209,12 @@ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) return es->es_pblk & ~ES_MASK; } +static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) +{ + ext4_fsblk_t pblock = ext4_es_pblock(es); + return pblock == ~ES_MASK ? 
0 : pblock; +} + static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6a7293a5cda2..5f225881176b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -88,9 +88,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - if (!inode_trylock_shared(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) return -EAGAIN; + } else { inode_lock_shared(inode); } /* @@ -165,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp) * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ -static int -ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; - int blockmask = sb->s_blocksize - 1; - - if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) - return 0; + unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) - return 1; + return true; - return 0; + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; } /* Is IO overwriting allocated and initialized blocks? */ @@ -203,7 +210,8 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); } -static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; @@ -227,11 +235,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + return iov_iter_count(from); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + ret = file_modified(iocb->ki_filp); if (ret) return ret; - - return iov_iter_count(from); + return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, @@ -363,62 +381,137 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; +/* + * The intention here is to start with shared lock acquired then see if any + * condition requires an exclusive inode lock. If yes, then we restart the + * whole operation by releasing the shared lock and acquiring exclusive lock. + * + * - For unaligned_io we never take shared lock as it may cause data corruption + * when two unaligned IO tries to modify the same block e.g. while zeroing. + * + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * + * - shared locking will only be true mostly with overwrites. Otherwise we will + * switch to exclusive i_rwsem lock. 
+ */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + +restart: + ret = ext4_generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = ret; + if (ext4_extending_io(inode, offset, count)) + *extend = true; + /* + * Determine whether the IO operation will overwrite allocated + * and initialized blocks. + * We need exclusive i_rwsem for changing security info + * in file_modified(). + */ + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || + !ext4_overwrite_io(inode, offset, count))) { + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; + } + + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; - size_t count; - loff_t offset; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); - bool extend = false, overwrite = false, unaligned_aio = false; + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + bool extend = false, unaligned_io = false; + bool ilock_shared = true; + + /* + * We initially start with shared inode lock unless it is + * unaligned IO which needs exclusive lock anyways. + */ + if (ext4_unaligned_io(inode, from, offset)) { + unaligned_io = true; + ilock_shared = false; + } + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. + */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; + } } else { - inode_lock(inode); + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); } + /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_dio_supported(inode)) { - inode_unlock(inode); - /* - * Fallback to buffered I/O if the inode does not support - * direct I/O. - */ + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } - ret = ext4_write_checks(iocb, from); - if (ret <= 0) { - inode_unlock(inode); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + if (ret <= 0) return ret; - } - /* - * Unaligned asynchronous direct I/O must be serialized among each - * other as the zeroing of partial blocks of two competing unaligned - * asynchronous direct I/O writes can result in data corruption. - */ offset = iocb->ki_pos; - count = iov_iter_count(from); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { - unaligned_aio = true; - inode_dio_wait(inode); - } + count = ret; /* - * Determine whether the I/O will overwrite allocated and initialized - * blocks. If so, check to see whether it is possible to take the - * dioread_nolock path. 
+ * Unaligned direct IO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned IOs can result in data + * corruption. + * + * So we make sure we don't allow any unaligned IO in flight. + * For IOs where we need not wait (like unaligned non-AIO DIO), + * below inode_dio_wait() may anyway become a no-op, since we start + * with exclusive lock. */ - if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && - ext4_should_dioread_nolock(inode)) { - overwrite = true; - downgrade_write(&inode->i_rwsem); - } + if (unaligned_io) + inode_dio_wait(inode); - if (offset + count > EXT4_I(inode)->i_disksize) { + if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -431,18 +524,19 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - extend = true; ext4_journal_stop(handle); } - ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_aio || extend); + if (ilock_shared) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io || extend); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: - if (overwrite) + if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); @@ -487,9 +581,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) return -EAGAIN; + } else { inode_lock(inode); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index dc333e8e51e8..c66e8f9451a2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8)) { + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); @@ -192,8 +193,10 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) get_bh(bh); submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); @@ -921,8 +924,8 @@ repeat_in_this_group: if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, - 0, 0); + handle_type, nblocks, 0, + ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); @@ -1223,6 +1226,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); + ext4_set_errno(sb, -err); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", ino, err); return inode; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3a4ab70fe9e0..569fc68e8975 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -660,32 +660,6 @@ out: } /* - * Calculate the number of metadata blocks need to 
reserve - * to allocate a new block at @lblocks for non extent file based file - */ -int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -/* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2fec62d764fa..fad82d08fca5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -98,6 +98,7 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { + ext4_set_errno(inode->i_sb, -error); ext4_error_inode(inode, __func__, __LINE__, 0, "can't get inode location %lu", inode->i_ino); @@ -849,7 +850,7 @@ out: /* * Prepare the write for the inline data. - * If the the data can be written into the inode, we just read + * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the @@ -1761,6 +1762,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { + ext4_set_errno(dir->i_sb, -err); EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", err, dir->i_ino); return true; diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 92a9da1774aa..d62d802c9c12 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -25,7 +25,7 @@ * For constructing the negative timestamp lower bound value. * binary: 10000000 00000000 00000000 00000000 */ -#define LOWER_MSB_1 (-0x80000000L) +#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */ /* * For constructing the negative timestamp upper bound value. 
* binary: 11111111 11111111 11111111 11111111 @@ -269,4 +269,6 @@ static struct kunit_suite ext4_inode_test_suite = { .test_cases = ext4_inode_test_cases, }; -kunit_test_suite(ext4_inode_test_suite); +kunit_test_suites(&ext4_inode_test_suite); + +MODULE_LICENSE("GPL v2"); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28f28de0c1b6..3313168b680f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -48,8 +48,6 @@ #include <trace/events/ext4.h> -#define MPAGE_DA_EXTENT_TAIL 0x01 - static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -271,6 +269,7 @@ void ext4_evict_inode(struct inode *inode) if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { + ext4_set_errno(inode->i_sb, -err); ext4_error(inode->i_sb, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); @@ -402,7 +401,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; - if (IS_ENCRYPTED(inode)) + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); @@ -2478,10 +2477,12 @@ update_disksize: EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); - if (err2) + if (err2) { + ext4_set_errno(inode->i_sb, -err2); ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", inode->i_ino); + } if (!err) err = err2; } @@ -3448,6 +3449,22 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } +static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, struct iomap *iomap, + struct iomap *srcmap) +{ + int ret; + + /* + * Even for writes we don't need to allocate blocks, so just pretend + * we are reading to save overhead of starting a transaction. + */ + flags &= ~IOMAP_WRITE; + ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); + WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + return ret; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { @@ -3469,6 +3486,11 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; +const struct iomap_ops ext4_iomap_overwrite_ops = { + .iomap_begin = ext4_iomap_overwrite_begin, + .iomap_end = ext4_iomap_end, +}; + static bool ext4_iomap_is_delalloc(struct inode *inode, struct ext4_map_blocks *map) { @@ -3701,8 +3723,12 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { /* We expect the key to be set. 
*/ BUG_ON(!fscrypt_has_encryption_key(inode)); - WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks( - page, blocksize, bh_offset(bh))); + err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + bh_offset(bh)); + if (err) { + clear_buffer_uptodate(bh); + goto unlock; + } } } if (ext4_should_journal_data(inode)) { @@ -3912,9 +3938,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) unsigned int credits; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length, 0); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); @@ -4240,6 +4263,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; + if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) + goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4338,6 +4363,8 @@ make_io: blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + simulate_eio: + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, block, "unable to read itable block"); brelse(bh); @@ -4551,7 +4578,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(gen)); } - if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + if (!ext4_inode_csum_verify(inode, raw_inode, ei) || + ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, function, line, 0, "iget: checksum invalid"); ret = -EFSBADCRC; @@ -5090,6 +5119,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, "IO error syncing inode"); err = -EIO; @@ -5368,7 +5398,8 @@ int ext4_getattr(const struct path *path, struct kstat *stat, struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { + if ((request_mask & STATX_BTIME) && + EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; @@ -5692,7 +5723,7 @@ int ext4_expand_extra_isize(struct inode *inode, error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); - goto out_stop; + goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, @@ -5702,8 +5733,8 @@ int ext4_expand_extra_isize(struct inode *inode, if (!error) error = rc; +out_unlock: ext4_write_unlock_xattr(inode, &no_expand); -out_stop: ext4_journal_stop(handle); return error; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e8870fff8224..a0ec750018dd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1377,6 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_FSGETXATTR: + case EXT4_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a3e2767bdf2f..f64838187559 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3895,6 +3895,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); return 0; @@ 
-4063,6 +4064,7 @@ repeat: err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; @@ -4071,6 +4073,7 @@ repeat: bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); @@ -4325,6 +4328,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 2305b4374fd3..1c44b1a32001 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -173,8 +173,10 @@ static int kmmpd(void *data) * (s_mmp_update_interval * 60) seconds. */ if (retval) { - if ((failed_writes % 60) == 0) + if ((failed_writes % 60) == 0) { + ext4_set_errno(sb, -retval); ext4_error(sb, "Error writing to MMP block"); + } failed_writes++; } @@ -205,6 +207,7 @@ static int kmmpd(void *data) retval = read_mmp_block(sb, &bh_check, mmp_block); if (retval) { + ext4_set_errno(sb, -retval); ext4_error(sb, "error reading MMP data: %d", retval); goto exit_thread; @@ -218,6 +221,7 @@ static int kmmpd(void *data) "Error while updating MMP info. " "The filesystem seems to have been" " multiply mounted."); + ext4_set_errno(sb, EBUSY); ext4_error(sb, "abort"); put_bh(bh_check); retval = -EBUSY; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a856997d87b5..129d2ebae00d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, struct ext4_dir_entry *dirent; int is_dx_block = 0; - bh = ext4_bread(NULL, inode, block, 0); + if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) + bh = ERR_PTR(-EIO); + else + bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " @@ -153,9 +156,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, * caller is sure it should be an index block. 
*/ if (is_dx_block && type == INDEX) { - if (ext4_dx_csum_verify(inode, dirent)) + if (ext4_dx_csum_verify(inode, dirent) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory index failed checksum"); brelse(bh); @@ -163,9 +168,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, } } if (!is_dx_block) { - if (ext4_dirblock_csum_verify(inode, bh)) + if (ext4_dirblock_csum_verify(inode, bh) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory block failed checksum"); brelse(bh); @@ -1002,7 +1009,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); -#ifdef CONFIG_FS_ENCRYPTION /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_get_encryption_info(dir); @@ -1017,7 +1023,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, return err; } } -#endif + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, @@ -1065,9 +1071,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, } errout: brelse(bh); -#ifdef CONFIG_FS_ENCRYPTION fscrypt_fname_free_buffer(&fname_crypto_str); -#endif return count; } @@ -1527,6 +1531,7 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); @@ -1537,6 +1542,7 @@ restart: !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { + ext4_set_errno(sb, EFSBADCRC); EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); @@ -2164,7 +2170,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; +#ifdef CONFIG_UNICODE struct ext4_sb_info *sbi; +#endif struct ext4_filename fname; int retval; int dx_fallback=0; @@ -2176,12 +2184,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; - sbi = EXT4_SB(sb); blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; #ifdef CONFIG_UNICODE + sbi = EXT4_SB(sb); if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) return -EINVAL; @@ -2822,7 +2830,7 @@ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; - struct ext4_dir_entry_2 *de, *de1; + struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { @@ -2847,19 +2855,25 @@ bool ext4_empty_dir(struct inode *inode) return true; de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de, sb->s_blocksize); - if (le32_to_cpu(de->inode) != inode->i_ino || - le32_to_cpu(de1->inode) == 0 || - strcmp(".", de->name) || strcmp("..", de1->name)) { - ext4_warning_inode(inode, "directory missing '.' 
and/or '..'"); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + 0) || + le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { + ext4_warning_inode(inode, "directory missing '.'"); + brelse(bh); + return true; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + offset) || + le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return true; } - offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + - ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); - de = ext4_next_entry(de1, sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { - if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -2870,12 +2884,11 @@ bool ext4_empty_dir(struct inode *inode) } if (IS_ERR(bh)) return true; - de = (struct ext4_dir_entry_2 *) bh->b_data; } + de = (struct ext4_dir_entry_2 *) (bh->b_data + + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset)) { - de = (struct ext4_dir_entry_2 *)(bh->b_data + - sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; continue; } @@ -2884,7 +2897,6 @@ bool ext4_empty_dir(struct inode *inode) return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); return true; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 24aeedb8fc75..68b39e75446a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -512,17 +512,26 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + /* + * Since bounce page allocation uses a mempool, we can only use + * a waiting mask (i.e. request guaranteed allocation) on the + * first page of the bio. Otherwise it can deadlock. + */ + if (io->io_bio) + gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); - if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { - if (io->io_bio) { + if (ret == -ENOMEM && + (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { + gfp_flags = GFP_NOFS; + if (io->io_bio) ext4_io_submit(io); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } - gfp_flags |= __GFP_NOFAIL; + else + gfp_flags |= __GFP_NOFAIL; + congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry_encrypt; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index fef7755300c3..c1769afbf799 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -57,6 +57,7 @@ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, + STEP_MAX, }; struct bio_post_read_ctx { @@ -106,10 +107,22 @@ static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fsverity_verify_bio(ctx->bio); + /* + * fsverity_verify_bio() may call readpages() again, and although verity + * will be disabled for that, decryption may still be needed, causing + * another bio_post_read_ctx to be allocated. 
So to guarantee that + * mempool_alloc() never deadlocks we must free the current ctx first. + * This is safe because verity is the last post-read step. + */ + BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; - bio_post_read_processing(ctx); + fsverity_verify_bio(bio); + + __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) @@ -176,12 +189,11 @@ static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } -static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, - struct bio *bio, - pgoff_t first_idx) +static void ext4_set_bio_post_read_ctx(struct bio *bio, + const struct inode *inode, + pgoff_t first_idx) { unsigned int post_read_steps = 0; - struct bio_post_read_ctx *ctx = NULL; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) post_read_steps |= 1 << STEP_DECRYPT; @@ -190,14 +202,14 @@ static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { - ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); + /* Due to the mempool, this never fails. */ + struct bio_post_read_ctx *ctx = + mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } - return ctx; } static inline loff_t ext4_readpage_limit(struct inode *inode) @@ -358,24 +370,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct bio_post_read_ctx *ctx; - /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - ctx = get_bio_post_read_ctx(inode, bio, page->index); - if (IS_ERR(ctx)) { - bio_put(bio); - bio = NULL; - goto set_error_page; - } + ext4_set_bio_post_read_ctx(bio, inode, page->index); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; - bio->bi_private = ctx; bio_set_op_attrs(bio, REQ_OP_READ, is_readahead ? 
REQ_RAHEAD : 0); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a8c0f2b5b6e1..86a2500ed292 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -824,9 +824,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto errout; - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", @@ -900,9 +899,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { brelse(gdb_bh); err = -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1d82b56d9b11..8434217549b3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -154,7 +154,7 @@ ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) if (bh == NULL) return ERR_PTR(-ENOMEM); - if (buffer_uptodate(bh)) + if (ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); wait_on_buffer(bh); @@ -204,26 +204,6 @@ void ext4_superblock_csum_set(struct super_block *sb) es->s_checksum = ext4_superblock_csum(sb, es); } -void *ext4_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -void *ext4_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -367,6 +347,8 @@ static void __save_error_info(struct super_block *sb, const char *func, ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); + if (es->s_last_error_errcode == 0) + es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED; if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; es->s_first_error_time_hi = es->s_last_error_time_hi; @@ -375,6 +357,7 @@ static void __save_error_info(struct super_block *sb, const char *func, es->s_first_error_line = cpu_to_le32(line); es->s_first_error_ino = es->s_last_error_ino; es->s_first_error_block = es->s_last_error_block; + es->s_first_error_errcode = es->s_last_error_errcode; } /* * Start the daily error reporting function if it hasn't been @@ -631,6 +614,66 @@ const char *ext4_decode_error(struct super_block *sb, int errno, return errstr; } +void ext4_set_errno(struct super_block *sb, int err) +{ + if (err < 0) + err = -err; + + switch (err) { + case EIO: + err = EXT4_ERR_EIO; + break; + case ENOMEM: + err = EXT4_ERR_ENOMEM; + break; + case EFSBADCRC: + err = EXT4_ERR_EFSBADCRC; + break; + case EFSCORRUPTED: + err = EXT4_ERR_EFSCORRUPTED; + break; + case ENOSPC: + err = EXT4_ERR_ENOSPC; + break; + case ENOKEY: + err = EXT4_ERR_ENOKEY; + break; + case EROFS: + err = EXT4_ERR_EROFS; + break; + case EFBIG: + err = EXT4_ERR_EFBIG; + break; + case EEXIST: + err = EXT4_ERR_EEXIST; + break; + case ERANGE: + err = EXT4_ERR_ERANGE; + break; + case EOVERFLOW: + err = EXT4_ERR_EOVERFLOW; + break; + case EBUSY: + err = EXT4_ERR_EBUSY; + 
break; + case ENOTDIR: + err = EXT4_ERR_ENOTDIR; + break; + case ENOTEMPTY: + err = EXT4_ERR_ENOTEMPTY; + break; + case ESHUTDOWN: + err = EXT4_ERR_ESHUTDOWN; + break; + case EFAULT: + err = EXT4_ERR_EFAULT; + break; + default: + err = EXT4_ERR_UNKNOWN; + } + EXT4_SB(sb)->s_es->s_last_error_errcode = err; +} + /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ @@ -655,6 +698,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, sb->s_id, function, line, errstr); } + ext4_set_errno(sb, -errno); save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -982,8 +1026,10 @@ static void ext4_put_super(struct super_block *sb) aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if ((err < 0) && !aborted) + if ((err < 0) && !aborted) { + ext4_set_errno(sb, -err); ext4_abort(sb, "Couldn't clean up the journal"); + } } ext4_unregister_sysfs(sb); @@ -1085,8 +1131,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_da_metadata_calc_len = 0; - ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA @@ -1548,6 +1592,7 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "nodioread_nolock"}, {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, @@ -1900,6 +1945,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { + if ((arg & 1) || + (arg < 4) || + (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) { + ext4_msg(sb, KERN_ERR, + "Invalid want_extra_isize %d", arg); + return -1; + } sbi->s_want_extra_isize = arg; } else if (token == Opt_max_batch_time) { sbi->s_max_batch_time = arg; @@ -3554,40 +3606,6 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } -static void ext4_clamp_want_extra_isize(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - unsigned def_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - - if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { - sbi->s_want_extra_isize = 0; - return; - } - if (sbi->s_want_extra_isize < 4) { - sbi->s_want_extra_isize = def_extra_isize; - if (ext4_has_feature_extra_isize(sb)) { - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_want_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_want_extra_isize); - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_min_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_min_extra_isize); - } - } - /* Check if enough inode space is available */ - if ((sbi->s_want_extra_isize > sbi->s_inode_size) || - (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size)) { - sbi->s_want_extra_isize = def_extra_isize; - ext4_msg(sb, KERN_INFO, - "required extra inode space not available"); - } -} - static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; @@ -3747,6 +3765,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ 
set_opt(sb, XATTR_USER); + set_opt(sb, DIOREAD_NOLOCK); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3795,6 +3814,68 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { + ext4_msg(sb, KERN_ERR, "invalid first ino: %u", + sbi->s_first_ino); + goto failed_mount; + } + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + /* + * i_atime_extra is the last extra field available for + * [acm]times in struct ext4_inode. Checking for that + * field should suffice to ensure we have extra space + * for all three. + */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + sb->s_time_min = EXT4_TIMESTAMP_MIN; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (ext4_has_feature_extra_isize(sb)) { + unsigned v, max = (sbi->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE); + + v = le16_to_cpu(es->s_want_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_want_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + + v = le16_to_cpu(es->s_min_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_min_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + } + } + if (sbi->s_es->s_mount_opts[0]) { char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), @@ -3852,9 +3933,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " - "with data=journal disables delayed " - "allocation and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + clear_opt(sb, DIOREAD_NOLOCK); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); @@ -4033,42 +4113,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { - sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; - sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); - if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { - ext4_msg(sb, KERN_ERR, "invalid first ino: %u", - sbi->s_first_ino); - goto failed_mount; - } - if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || - (!is_power_of_2(sbi->s_inode_size)) || - 
(sbi->s_inode_size > blocksize)) { - ext4_msg(sb, KERN_ERR, - "unsupported inode size: %d", - sbi->s_inode_size); - goto failed_mount; - } - /* - * i_atime_extra is the last extra field available for [acm]times in - * struct ext4_inode. Checking for that field should suffice to ensure - * we have extra space for all three. - */ - if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + - sizeof(((struct ext4_inode *)0)->i_atime_extra)) { - sb->s_time_gran = 1; - sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; - } else { - sb->s_time_gran = NSEC_PER_SEC; - sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; - } - - sb->s_time_min = EXT4_TIMESTAMP_MIN; - } - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || @@ -4517,8 +4561,6 @@ no_journal: } else if (ret) goto failed_mount4a; - ext4_clamp_want_extra_isize(sb); - ext4_set_resv_clusters(sb); err = ext4_setup_system_zone(sb); @@ -5306,8 +5348,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - ext4_clamp_want_extra_isize(sb); - if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " @@ -5545,9 +5585,15 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = (dquot->dq_dqb.dqb_bsoftlimit ? - dquot->dq_dqb.dqb_bsoftlimit : - dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + limit = 0; + if (dquot->dq_dqb.dqb_bsoftlimit && + (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit)) + limit = dquot->dq_dqb.dqb_bsoftlimit; + if (dquot->dq_dqb.dqb_bhardlimit && + (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) + limit = dquot->dq_dqb.dqb_bhardlimit; + limit >>= sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; @@ -5557,9 +5603,14 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit ? 
- dquot->dq_dqb.dqb_isoftlimit : - dquot->dq_dqb.dqb_ihardlimit; + limit = 0; + if (dquot->dq_dqb.dqb_isoftlimit && + (!limit || dquot->dq_dqb.dqb_isoftlimit < limit)) + limit = dquot->dq_dqb.dqb_isoftlimit; + if (dquot->dq_dqb.dqb_ihardlimit && + (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) + limit = dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = @@ -5992,7 +6043,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); - } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index eb1efad0e20a..d218ebdafa4a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,6 +29,10 @@ typedef enum { attr_last_error_time, attr_feature, attr_pointer_ui, + attr_pointer_ul, + attr_pointer_u64, + attr_pointer_u8, + attr_pointer_string, attr_pointer_atomic, attr_journal_task, } attr_id_t; @@ -46,6 +50,7 @@ struct ext4_attr { struct attribute attr; short attr_id; short attr_ptr; + unsigned short attr_size; union { int offset; void *explicit_ptr; @@ -154,12 +159,35 @@ static struct ext4_attr ext4_attr_##_name = { \ }, \ } +#define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_pointer_string, \ + .attr_size = _size, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + #define EXT4_RO_ATTR_ES_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) +#define EXT4_RO_ATTR_ES_U8(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_U64(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ + EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) +#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) + #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -194,7 +222,20 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +#ifdef CONFIG_EXT4_DEBUG +EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); +#endif EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); +EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); +EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino); +EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino); +EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block); +EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block); +EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line); +EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line); +EXT4_RO_ATTR_ES_STRING(first_error_func, 
s_first_error_func, 32); +EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); @@ -225,9 +266,22 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(errors_count), + ATTR_LIST(first_error_ino), + ATTR_LIST(last_error_ino), + ATTR_LIST(first_error_block), + ATTR_LIST(last_error_block), + ATTR_LIST(first_error_line), + ATTR_LIST(last_error_line), + ATTR_LIST(first_error_func), + ATTR_LIST(last_error_func), + ATTR_LIST(first_error_errcode), + ATTR_LIST(last_error_errcode), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), ATTR_LIST(journal_task), +#ifdef CONFIG_EXT4_DEBUG + ATTR_LIST(simulate_fail), +#endif NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -280,7 +334,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { - return snprintf(buf, PAGE_SIZE, "%lld", + return snprintf(buf, PAGE_SIZE, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } @@ -318,6 +372,30 @@ static ssize_t ext4_attr_show(struct kobject *kobj, else return snprintf(buf, PAGE_SIZE, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%lu\n", + *((unsigned long *) ptr)); + case attr_pointer_u8: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", + *((unsigned char *) ptr)); + case attr_pointer_u64: + if (!ptr) + return 0; + if (a->attr_ptr == ptr_ext4_super_block_offset) + return snprintf(buf, PAGE_SIZE, "%llu\n", + le64_to_cpup(ptr)); + else + return snprintf(buf, PAGE_SIZE, "%llu\n", + *((unsigned long long *) ptr)); + case attr_pointer_string: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size, + (char *) ptr); case attr_pointer_atomic: if (!ptr) return 0; @@ -361,6 +439,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj, else *((unsigned int *) ptr) = t; return len; + case attr_pointer_ul: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *((unsigned long *) ptr) = t; + return len; case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d0d8a9795dd6..dc5ec724d889 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -342,12 +342,55 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf, return desc_size; } +/* + * Prefetch some pages from the file's Merkle tree. + * + * This is basically a stripped-down version of __do_page_cache_readahead() + * which works on pages past i_size. 
+ */ +static void ext4_merkle_tree_readahead(struct address_space *mapping, + pgoff_t start_index, unsigned long count) +{ + LIST_HEAD(pages); + unsigned int nr_pages = 0; + struct page *page; + pgoff_t index; + struct blk_plug plug; + + for (index = start_index; index < start_index + count; index++) { + page = xa_load(&mapping->i_pages, index); + if (!page || xa_is_value(page)) { + page = __page_cache_alloc(readahead_gfp_mask(mapping)); + if (!page) + break; + page->index = index; + list_add(&page->lru, &pages); + nr_pages++; + } + } + blk_start_plug(&plug); + ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); + blk_finish_plug(&plug); +} + static struct page *ext4_read_merkle_tree_page(struct inode *inode, - pgoff_t index) + pgoff_t index, + unsigned long num_ra_pages) { + struct page *page; + index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; - return read_mapping_page(inode->i_mapping, index, NULL); + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); + if (!page || !PageUptodate(page)) { + if (page) + put_page(page); + else if (num_ra_pages > 1) + ext4_merkle_tree_readahead(inode->i_mapping, index, + num_ra_pages); + page = read_mapping_page(inode->i_mapping, index, NULL); + } + return page; } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8966a5439a22..8cac7d95c3ad 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1456,7 +1456,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, if (!ce) return NULL; - ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; @@ -2879,9 +2879,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); - if (error == -EIO) + if (error == -EIO) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + } bh = NULL; goto cleanup; } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 652fd2e2b23d..f0faada30f30 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -6,6 +6,7 @@ config F2FS_FS select CRYPTO select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -21,7 +22,7 @@ config F2FS_FS config F2FS_STAT_FS bool "F2FS Status Information" - depends on F2FS_FS && DEBUG_FS + depends on F2FS_FS default y help /sys/kernel/debug/f2fs/ contains information about all the partitions @@ -92,3 +93,28 @@ config F2FS_FAULT_INJECTION Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. If unsure, say N. + +config F2FS_FS_COMPRESSION + bool "F2FS compression feature" + depends on F2FS_FS + help + Enable filesystem-level compression on f2fs regular files, + multiple back-end compression algorithms are supported. + +config F2FS_FS_LZO + bool "LZO compression support" + depends on F2FS_FS_COMPRESSION + select LZO_COMPRESS + select LZO_DECOMPRESS + default y + help + Support LZO compress algorithm, if unsure, say Y. + +config F2FS_FS_LZ4 + bool "LZ4 compression support" + depends on F2FS_FS_COMPRESSION + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default y + help + Support LZ4 compress algorithm, if unsure, say Y. 
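For orientation, a minimal kernel configuration fragment that enables the new f2fs compression paths introduced by the Kconfig hunk above might look like the following. This is an illustrative sketch, not part of the patch: the symbol names are taken directly from the Kconfig entries above, and the LZO/LZ4 library symbols (LZO_COMPRESS/LZO_DECOMPRESS, LZ4_COMPRESS/LZ4_DECOMPRESS) are pulled in automatically by the select statements shown, so they are not listed explicitly.

    CONFIG_F2FS_FS=y
    # New umbrella option for filesystem-level compression on regular files
    CONFIG_F2FS_FS_COMPRESSION=y
    # Per-algorithm back-ends; both default to y once COMPRESSION is enabled
    CONFIG_F2FS_FS_LZO=y
    CONFIG_F2FS_FS_LZ4=y

With these options built in, the new fs/f2fs/compress.c below selects the back-end per inode via f2fs_inode_info::i_compress_algorithm, indexing into the f2fs_cops[] table that the file defines for LZO and LZ4.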
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 2aaecc63834f..ee7316b42f69 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o +f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ffdaba0c55d2..44e84ac5c941 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1509,10 +1509,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_wait_on_all_pages_writeback(sbi); /* - * invalidate intermediate page cache borrowed from meta inode - * which are used for migration of encrypted inode's blocks. + * invalidate intermediate page cache borrowed from meta inode which are + * used for migration of encrypted or verity inode's blocks. */ - if (f2fs_sb_has_encrypt(sbi)) + if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c new file mode 100644 index 000000000000..d8a64be90a50 --- /dev/null +++ b/fs/f2fs/compress.c @@ -0,0 +1,1176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs compress support + * + * Copyright (c) 2019 Chao Yu <chao@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/lzo.h> +#include <linux/lz4.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +struct f2fs_compress_ops { + int (*init_compress_ctx)(struct compress_ctx *cc); + void (*destroy_compress_ctx)(struct compress_ctx *cc); + int (*compress_pages)(struct compress_ctx *cc); + int (*decompress_pages)(struct decompress_io_ctx *dic); +}; + +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + return index & (cc->cluster_size - 1); +} + +static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index) +{ + return index >> cc->log_cluster_size; +} + +static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) +{ + return cc->cluster_idx << cc->log_cluster_size; +} + +bool f2fs_is_compressed_page(struct page *page) +{ + if (!PagePrivate(page)) + return false; + if (!page_private(page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) + return false; + f2fs_bug_on(F2FS_M_SB(page->mapping), + *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); + return true; +} + +static void f2fs_set_compressed_page(struct page *page, + struct inode *inode, pgoff_t index, void *data, refcount_t *r) +{ + SetPagePrivate(page); + set_page_private(page, (unsigned long)data); + + /* i_crypto_info and iv index */ + page->index = index; + page->mapping = inode->i_mapping; + if (r) + refcount_inc(r); +} + +static void f2fs_put_compressed_page(struct page *page) +{ + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + page->mapping = NULL; + unlock_page(page); + put_page(page); +} + +static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) +{ + int i; + + for (i = 0; i < len; i++) { + if (!cc->rpages[i]) + continue; + if (unlock) + unlock_page(cc->rpages[i]); + else + put_page(cc->rpages[i]); + } +} + +static void f2fs_put_rpages(struct compress_ctx *cc) +{ + f2fs_drop_rpages(cc, cc->cluster_size, false); +} + +static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) +{ + f2fs_drop_rpages(cc, 
len, true); +} + +static void f2fs_put_rpages_mapping(struct compress_ctx *cc, + struct address_space *mapping, + pgoff_t start, int len) +{ + int i; + + for (i = 0; i < len; i++) { + struct page *page = find_get_page(mapping, start + i); + + put_page(page); + put_page(page); + } +} + +static void f2fs_put_rpages_wbc(struct compress_ctx *cc, + struct writeback_control *wbc, bool redirty, int unlock) +{ + unsigned int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + if (redirty) + redirty_page_for_writepage(wbc, cc->rpages[i]); + f2fs_put_page(cc->rpages[i], unlock); + } +} + +struct page *f2fs_compress_control_page(struct page *page) +{ + return ((struct compress_io_ctx *)page_private(page))->rpages[0]; +} + +int f2fs_init_compress_ctx(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + + if (cc->nr_rpages) + return 0; + + cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + return cc->rpages ? 0 : -ENOMEM; +} + +void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +{ + kfree(cc->rpages); + cc->rpages = NULL; + cc->nr_rpages = 0; + cc->nr_cpages = 0; + cc->cluster_idx = NULL_CLUSTER; +} + +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) +{ + unsigned int cluster_ofs; + + if (!f2fs_cluster_can_merge_page(cc, page->index)) + f2fs_bug_on(F2FS_I_SB(cc->inode), 1); + + cluster_ofs = offset_in_cluster(cc, page->index); + cc->rpages[cluster_ofs] = page; + cc->nr_rpages++; + cc->cluster_idx = cluster_idx(cc, page->index); +} + +#ifdef CONFIG_F2FS_FS_LZO +static int lzo_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + LZO1X_MEM_COMPRESS, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lzo_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; +} + +static int lzo_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + return -EIO; + } + return 0; +} + +static int lzo_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, + dic->rbuf, &dic->rlen); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzo_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzo_compress_pages, + .decompress_pages = lzo_decompress_pages, +}; +#endif + +#ifdef CONFIG_F2FS_FS_LZ4 +static int lz4_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + LZ4_MEM_COMPRESS, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + cc->clen = LZ4_compressBound(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lz4_destroy_compress_ctx(struct compress_ctx *cc) 
+{ + kvfree(cc->private); + cc->private = NULL; +} + +static int lz4_compress_pages(struct compress_ctx *cc) +{ + int len; + + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) { + printk_ratelimited("%sF2FS-fs (%s): lz4 compress failed\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id); + return -EIO; + } + cc->clen = len; + return 0; +} + +static int lz4_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, + dic->clen, dic->rlen); + if (ret < 0) { + printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (ret != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lz4_ops = { + .init_compress_ctx = lz4_init_compress_ctx, + .destroy_compress_ctx = lz4_destroy_compress_ctx, + .compress_pages = lz4_compress_pages, + .decompress_pages = lz4_decompress_pages, +}; +#endif + +static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { +#ifdef CONFIG_F2FS_FS_LZO + &f2fs_lzo_ops, +#else + NULL, +#endif +#ifdef CONFIG_F2FS_FS_LZ4 + &f2fs_lz4_ops, +#else + NULL, +#endif +}; + +bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; +} + +static struct page *f2fs_grab_page(void) +{ + struct page *page; + + page = alloc_page(GFP_NOFS); + if (!page) + return NULL; + lock_page(page); + return page; +} + +static int f2fs_compress_pages(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + unsigned int max_len, nr_cpages; + int i, ret; + + trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, + cc->cluster_size, fi->i_compress_algorithm); + + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + + max_len = COMPRESS_HEADER_SIZE + cc->clen; + cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + + cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + cc->nr_cpages, GFP_NOFS); + if (!cc->cpages) { + ret = -ENOMEM; + goto destroy_compress_ctx; + } + + for (i = 0; i < cc->nr_cpages; i++) { + cc->cpages[i] = f2fs_grab_page(); + if (!cc->cpages[i]) { + ret = -ENOMEM; + goto out_free_cpages; + } + } + + cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO); + if (!cc->rbuf) { + ret = -ENOMEM; + goto out_free_cpages; + } + + cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL); + if (!cc->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + ret = cops->compress_pages(cc); + if (ret) + goto out_vunmap_cbuf; + + max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE; + + if (cc->clen > max_len) { + ret = -EAGAIN; + goto out_vunmap_cbuf; + } + + cc->cbuf->clen = cpu_to_le32(cc->clen); + cc->cbuf->chksum = cpu_to_le32(0); + + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) + cc->cbuf->reserved[i] = cpu_to_le32(0); + + vunmap(cc->cbuf); + vunmap(cc->rbuf); + + nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + for (i = nr_cpages; i < cc->nr_cpages; i++) { + f2fs_put_compressed_page(cc->cpages[i]); + cc->cpages[i] = NULL; + 
} + + cc->nr_cpages = nr_cpages; + + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return 0; + +out_vunmap_cbuf: + vunmap(cc->cbuf); +out_vunmap_rbuf: + vunmap(cc->rbuf); +out_free_cpages: + for (i = 0; i < cc->nr_cpages; i++) { + if (cc->cpages[i]) + f2fs_put_compressed_page(cc->cpages[i]); + } + kfree(cc->cpages); + cc->cpages = NULL; +destroy_compress_ctx: + cops->destroy_compress_ctx(cc); +out: + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return ret; +} + +void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_inode_info *fi= F2FS_I(dic->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + int ret; + + dec_page_count(sbi, F2FS_RD_DATA); + + if (bio->bi_status || PageError(page)) + dic->failed = true; + + if (refcount_dec_not_one(&dic->ref)) + return; + + trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, + dic->cluster_size, fi->i_compress_algorithm); + + /* submit partial compressed pages */ + if (dic->failed) { + ret = -EIO; + goto out_free_dic; + } + + dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); + if (!dic->rbuf) { + ret = -ENOMEM; + goto out_free_dic; + } + + dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); + if (!dic->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + dic->clen = le32_to_cpu(dic->cbuf->clen); + dic->rlen = PAGE_SIZE << dic->log_cluster_size; + + if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { + ret = -EFSCORRUPTED; + goto out_vunmap_cbuf; + } + + ret = cops->decompress_pages(dic); + +out_vunmap_cbuf: + vunmap(dic->cbuf); +out_vunmap_rbuf: + vunmap(dic->rbuf); +out_free_dic: + if (!verity) + f2fs_decompress_end_io(dic->rpages, dic->cluster_size, + ret, false); + + trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, + dic->clen, ret); + if (!verity) + f2fs_free_dic(dic); +} + +static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + if (cc->cluster_idx == NULL_CLUSTER) + return true; + return cc->cluster_idx == cluster_idx(cc, index); +} + +bool f2fs_cluster_is_empty(struct compress_ctx *cc) +{ + return cc->nr_rpages == 0; +} + +static bool f2fs_cluster_is_full(struct compress_ctx *cc) +{ + return cc->cluster_size == cc->nr_rpages; +} + +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) +{ + if (f2fs_cluster_is_empty(cc)) + return true; + return is_page_in_cluster(cc, index); +} + +static bool __cluster_may_compress(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + loff_t i_size = i_size_read(cc->inode); + unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); + int i; + + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + f2fs_bug_on(sbi, !page); + + if (unlikely(f2fs_cp_error(sbi))) + return false; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return false; + + /* beyond EOF */ + if (page->index >= nr_pages) + return false; + } + return true; +} + +/* return # of compressed block addresses */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) +{ + struct dnode_of_data dn; + int ret; + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc), + LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) + ret = 0; + goto fail; + } + + if 
(dn.data_blkaddr == COMPRESS_ADDR) { + int i; + + ret = 1; + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node + i); + if (blkaddr != NULL_ADDR) + ret++; + } + } +fail: + f2fs_put_dnode(&dn); + return ret; +} + +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + }; + + return f2fs_compressed_blocks(&cc); +} + +static bool cluster_may_compress(struct compress_ctx *cc) +{ + if (!f2fs_compressed_file(cc->inode)) + return false; + if (f2fs_is_atomic_file(cc->inode)) + return false; + if (f2fs_is_mmap_file(cc->inode)) + return false; + if (!f2fs_cluster_is_full(cc)) + return false; + return __cluster_may_compress(cc); +} + +static void set_cluster_writeback(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) + set_page_writeback(cc->rpages[i]); + } +} + +static void set_cluster_dirty(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) + if (cc->rpages[i]) + set_page_dirty(cc->rpages[i]); +} + +static int prepare_compress_overwrite(struct compress_ctx *cc, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct address_space *mapping = cc->inode->i_mapping; + struct page *page; + struct dnode_of_data dn; + sector_t last_block_in_bio; + unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i, ret; + bool prealloc; + +retry: + ret = f2fs_compressed_blocks(cc); + if (ret <= 0) + return ret; + + /* compressed case */ + prealloc = (ret < cc->cluster_size); + + ret = f2fs_init_compress_ctx(cc); + if (ret) + return ret; + + /* keep page reference to avoid page reclaim */ + for (i = 0; i < cc->cluster_size; i++) { + page = f2fs_pagecache_get_page(mapping, start_idx + i, + fgp_flag, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto unlock_pages; + } + + if (PageUptodate(page)) + unlock_page(page); + else + f2fs_compress_ctx_add_page(cc, page); + } + + if (!f2fs_cluster_is_empty(cc)) { + struct bio *bio = NULL; + + ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, + &last_block_in_bio, false); + f2fs_destroy_compress_ctx(cc); + if (ret) + goto release_pages; + if (bio) + f2fs_submit_bio(sbi, bio, DATA); + + ret = f2fs_init_compress_ctx(cc); + if (ret) + goto release_pages; + } + + for (i = 0; i < cc->cluster_size; i++) { + f2fs_bug_on(sbi, cc->rpages[i]); + + page = find_lock_page(mapping, start_idx + i); + f2fs_bug_on(sbi, !page); + + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_compress_ctx_add_page(cc, page); + f2fs_put_page(page, 0); + + if (!PageUptodate(page)) { + f2fs_unlock_rpages(cc, i + 1); + f2fs_put_rpages_mapping(cc, mapping, start_idx, + cc->cluster_size); + f2fs_destroy_compress_ctx(cc); + goto retry; + } + } + + if (prealloc) { + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + for (i = cc->cluster_size - 1; i > 0; i--) { + ret = f2fs_get_block(&dn, start_idx + i); + if (ret) { + i = cc->cluster_size; + break; + } + + if (dn.data_blkaddr != NEW_ADDR) + break; + } + + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + } + + if (likely(!ret)) { + *fsdata = cc->rpages; + *pagep = cc->rpages[offset_in_cluster(cc, 
index)]; + return cc->cluster_size; + } + +unlock_pages: + f2fs_unlock_rpages(cc, i); +release_pages: + f2fs_put_rpages_mapping(cc, mapping, start_idx, i); + f2fs_destroy_compress_ctx(cc); + return ret; +} + +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + .rpages = NULL, + .nr_rpages = 0, + }; + + return prepare_compress_overwrite(&cc, pagep, index, fsdata); +} + +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied) + +{ + struct compress_ctx cc = { + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .rpages = fsdata, + }; + bool first_index = (index == cc.rpages[0]->index); + + if (copied) + set_cluster_dirty(&cc); + + f2fs_put_rpages_wbc(&cc, NULL, false, 1); + f2fs_destroy_compress_ctx(&cc); + + return first_index; +} + +static int f2fs_write_compressed_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = cc->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NEW_ADDR, + .page = NULL, + .encrypted_page = NULL, + .compressed_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, + .io_wbc = wbc, + .encrypted = f2fs_encrypted_file(cc->inode), + }; + struct dnode_of_data dn; + struct node_info ni; + struct compress_io_ctx *cic; + pgoff_t start_idx = start_idx_of_cluster(cc); + unsigned int last_index = cc->cluster_size - 1; + loff_t psize; + int i, err; + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + f2fs_lock_op(sbi); + + err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (err) + goto out_unlock_op; + + for (i = 0; i < cc->cluster_size; i++) { + if (datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i) == NULL_ADDR) + goto out_put_dnode; + } + + psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + if (err) + goto out_put_dnode; + + fio.version = ni.version; + + cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS); + if (!cic) + goto out_put_dnode; + + cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + cic->inode = inode; + refcount_set(&cic->ref, 1); + cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + if (!cic->rpages) + goto out_put_cic; + + cic->nr_rpages = cc->cluster_size; + + for (i = 0; i < cc->nr_cpages; i++) { + f2fs_set_compressed_page(cc->cpages[i], inode, + cc->rpages[i + 1]->index, + cic, i ? 
&cic->ref : NULL); + fio.compressed_page = cc->cpages[i]; + if (fio.encrypted) { + fio.page = cc->rpages[i + 1]; + err = f2fs_encrypt_one_page(&fio); + if (err) + goto out_destroy_crypt; + cc->cpages[i] = fio.encrypted_page; + } + } + + set_cluster_writeback(cc); + + for (i = 0; i < cc->cluster_size; i++) + cic->rpages[i] = cc->rpages[i]; + + for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node); + fio.page = cic->rpages[i]; + fio.old_blkaddr = blkaddr; + + /* cluster header */ + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + fio.compr_blocks++; + if (__is_valid_data_blkaddr(blkaddr)) + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR); + goto unlock_continue; + } + + if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) + fio.compr_blocks++; + + if (i > cc->nr_cpages) { + if (__is_valid_data_blkaddr(blkaddr)) { + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } + goto unlock_continue; + } + + f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR); + + if (fio.encrypted) + fio.encrypted_page = cc->cpages[i - 1]; + else + fio.compressed_page = cc->cpages[i - 1]; + + cc->cpages[i - 1] = NULL; + f2fs_outplace_write_data(&dn, &fio); + (*submitted)++; +unlock_continue: + inode_dec_dirty_pages(cc->inode); + unlock_page(fio.page); + } + + if (fio.compr_blocks) + f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); + f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + + set_inode_flag(cc->inode, FI_APPEND_WRITE); + if (cc->cluster_idx == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + down_write(&fi->i_sem); + if (fi->last_disk_size < psize) + fi->last_disk_size = psize; + up_write(&fi->i_sem); + + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc); + return 0; + +out_destroy_crypt: + kfree(cic->rpages); + + for (--i; i >= 0; i--) + fscrypt_finalize_bounce_page(&cc->cpages[i]); + for (i = 0; i < cc->nr_cpages; i++) { + if (!cc->cpages[i]) + continue; + f2fs_put_page(cc->cpages[i], 1); + } +out_put_cic: + kfree(cic); +out_put_dnode: + f2fs_put_dnode(&dn); +out_unlock_op: + f2fs_unlock_op(sbi); + return -EAGAIN; +} + +void f2fs_compress_write_end_io(struct bio *bio, struct page *page) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct compress_io_ctx *cic = + (struct compress_io_ctx *)page_private(page); + int i; + + if (unlikely(bio->bi_status)) + mapping_set_error(cic->inode->i_mapping, -EIO); + + f2fs_put_compressed_page(page); + + dec_page_count(sbi, F2FS_WB_DATA); + + if (refcount_dec_not_one(&cic->ref)) + return; + + for (i = 0; i < cic->nr_rpages; i++) { + WARN_ON(!cic->rpages[i]); + clear_cold_data(cic->rpages[i]); + end_page_writeback(cic->rpages[i]); + } + + kfree(cic->rpages); + kfree(cic); +} + +static int f2fs_write_raw_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct address_space *mapping = cc->inode->i_mapping; + int _submitted, compr_blocks, ret; + int i = -1, err = 0; + + compr_blocks = f2fs_compressed_blocks(cc); + if (compr_blocks < 0) { + err = compr_blocks; + goto out_err; + } + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; +retry_write: + if (cc->rpages[i]->mapping != mapping) { + unlock_page(cc->rpages[i]); + continue; + } + + BUG_ON(!PageLocked(cc->rpages[i])); + + ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, + NULL, 
NULL, wbc, io_type, + compr_blocks); + if (ret) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(cc->rpages[i]); + ret = 0; + } else if (ret == -EAGAIN) { + ret = 0; + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + lock_page(cc->rpages[i]); + clear_page_dirty_for_io(cc->rpages[i]); + goto retry_write; + } + err = ret; + goto out_fail; + } + + *submitted += _submitted; + } + return 0; + +out_fail: + /* TODO: revoke partially updated block addresses */ + BUG_ON(compr_blocks); +out_err: + for (++i; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + return err; +} + +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + int err; + + *submitted = 0; + if (cluster_may_compress(cc)) { + err = f2fs_compress_pages(cc); + if (err == -EAGAIN) { + goto write; + } else if (err) { + f2fs_put_rpages_wbc(cc, wbc, true, 1); + goto destroy_out; + } + + err = f2fs_write_compressed_pages(cc, submitted, + wbc, io_type); + cops->destroy_compress_ctx(cc); + if (!err) + return 0; + f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); + } +write: + f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); + + err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); + f2fs_put_rpages_wbc(cc, wbc, false, 0); +destroy_out: + f2fs_destroy_compress_ctx(cc); + return err; +} + +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct decompress_io_ctx *dic; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i; + + dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS); + if (!dic) + return ERR_PTR(-ENOMEM); + + dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + if (!dic->rpages) { + kfree(dic); + return ERR_PTR(-ENOMEM); + } + + dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + dic->inode = cc->inode; + refcount_set(&dic->ref, 1); + dic->cluster_idx = cc->cluster_idx; + dic->cluster_size = cc->cluster_size; + dic->log_cluster_size = cc->log_cluster_size; + dic->nr_cpages = cc->nr_cpages; + dic->failed = false; + + for (i = 0; i < dic->cluster_size; i++) + dic->rpages[i] = cc->rpages[i]; + dic->nr_rpages = cc->cluster_size; + + dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + dic->nr_cpages, GFP_NOFS); + if (!dic->cpages) + goto out_free; + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page; + + page = f2fs_grab_page(); + if (!page) + goto out_free; + + f2fs_set_compressed_page(page, cc->inode, + start_idx + i + 1, + dic, i ? 
&dic->ref : NULL); + dic->cpages[i] = page; + } + + dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + dic->cluster_size, GFP_NOFS); + if (!dic->tpages) + goto out_free; + + for (i = 0; i < dic->cluster_size; i++) { + if (cc->rpages[i]) + continue; + + dic->tpages[i] = f2fs_grab_page(); + if (!dic->tpages[i]) + goto out_free; + } + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->tpages[i]) + continue; + dic->tpages[i] = cc->rpages[i]; + } + + return dic; + +out_free: + f2fs_free_dic(dic); + return ERR_PTR(-ENOMEM); +} + +void f2fs_free_dic(struct decompress_io_ctx *dic) +{ + int i; + + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + f2fs_put_page(dic->tpages[i], 1); + } + kfree(dic->tpages); + } + + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { + if (!dic->cpages[i]) + continue; + f2fs_put_compressed_page(dic->cpages[i]); + } + kfree(dic->cpages); + } + + kfree(dic->rpages); + kfree(dic); +} + +void f2fs_decompress_end_io(struct page **rpages, + unsigned int cluster_size, bool err, bool verity) +{ + int i; + + for (i = 0; i < cluster_size; i++) { + struct page *rpage = rpages[i]; + + if (!rpage) + continue; + + if (err || PageError(rpage)) { + ClearPageUptodate(rpage); + ClearPageError(rpage); + } else { + if (!verity || fsverity_verify_page(rpage)) + SetPageUptodate(rpage); + else + SetPageError(rpage); + } + unlock_page(rpage); + } +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a034cd0ce021..8bd9afa81c54 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -31,6 +31,47 @@ static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; +static struct bio_set f2fs_bioset; + +#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE + +int __init f2fs_init_bioset(void) +{ + if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_bioset(void) +{ + bioset_exit(&f2fs_bioset); +} + +static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, + unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); +} + +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail) +{ + struct bio *bio; + + if (no_fail) { + /* No failure on bio allocation */ + bio = __f2fs_bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); + return NULL; + } + + return __f2fs_bio_alloc(GFP_KERNEL, npages); +} static bool __is_cp_guaranteed(struct page *page) { @@ -41,6 +82,9 @@ static bool __is_cp_guaranteed(struct page *page) if (!mapping) return false; + if (f2fs_is_compressed_page(page)) + return false; + inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -73,19 +117,19 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_INITIAL = 0, STEP_DECRYPT, + STEP_DECOMPRESS, STEP_VERITY, }; struct bio_post_read_ctx { struct bio *bio; + struct f2fs_sb_info *sbi; struct work_struct work; - unsigned int cur_step; unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio) +static void __read_end_io(struct bio *bio, bool compr, bool verity) { struct page *page; struct bio_vec *bv; @@ -94,6 +138,13 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; +#ifdef 
CONFIG_F2FS_FS_COMPRESSION + if (compr && f2fs_is_compressed_page(page)) { + f2fs_decompress_pages(bio, page, verity); + continue; + } +#endif + /* PG_error was set if any post_read step failed */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); @@ -105,31 +156,107 @@ static void __read_end_io(struct bio *bio) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); +} + +static void f2fs_release_read_bio(struct bio *bio); +static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) +{ + if (!compr) + __read_end_io(bio, false, verity); + f2fs_release_read_bio(bio); +} + +static void f2fs_decompress_bio(struct bio *bio, bool verity) +{ + __read_end_io(bio, true, verity); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); -static void decrypt_work(struct work_struct *work) +static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) +{ + fscrypt_decrypt_bio(ctx->bio); +} + +static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) +{ + f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) +{ + f2fs_decompress_end_io(rpages, cluster_size, false, true); +} + +static void f2fs_verify_bio(struct bio *bio) +{ + struct page *page = bio_first_page_all(bio); + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_verify_pages(dic->rpages, dic->cluster_size); + f2fs_free_dic(dic); +} +#endif + +static void f2fs_verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned int enabled_steps = ctx->enabled_steps; +#endif - fscrypt_decrypt_bio(ctx->bio); + /* + * fsverity_verify_bio() may call readpages() again, and while verity + * will be disabled for this, decryption may still be needed, resulting + * in another bio_post_read_ctx being allocated. So to prevent + * deadlocks we need to release the current ctx to the mempool first. + * This assumes that verity is the last post-read step. 
+ */ + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* previous step is decompression */ + if (enabled_steps & (1 << STEP_DECOMPRESS)) { + f2fs_verify_bio(bio); + f2fs_release_read_bio(bio); + return; + } +#endif - bio_post_read_processing(ctx); + fsverity_verify_bio(bio); + __f2fs_read_end_io(bio, false, false); } -static void verity_work(struct work_struct *work) +static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - fsverity_verify_bio(ctx->bio); + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) + f2fs_decrypt_work(ctx); - bio_post_read_processing(ctx); + if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) + f2fs_decompress_work(ctx); + + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + + __f2fs_read_end_io(ctx->bio, + ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); +} + +static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, + struct work_struct *work) +{ + queue_work(sbi->post_read_wq, work); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) @@ -139,31 +266,26 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ - switch (++ctx->cur_step) { - case STEP_DECRYPT: - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { - INIT_WORK(&ctx->work, decrypt_work); - fscrypt_enqueue_decrypt_work(&ctx->work); - return; - } - ctx->cur_step++; - /* fall-through */ - case STEP_VERITY: - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - ctx->cur_step++; - /* fall-through */ - default: - __read_end_io(ctx->bio); + + if (ctx->enabled_steps & (1 << STEP_DECRYPT) || + ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); + return; } + + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + + __f2fs_read_end_io(ctx->bio, false, false); } static bool f2fs_bio_post_read_required(struct bio *bio) { - return bio->bi_private && !bio->bi_status; + return bio->bi_private; } static void f2fs_read_end_io(struct bio *bio) @@ -178,12 +300,11 @@ static void f2fs_read_end_io(struct bio *bio) if (f2fs_bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; - ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } - __read_end_io(bio); + __f2fs_read_end_io(bio, false, false); } static void f2fs_write_end_io(struct bio *bio) @@ -214,6 +335,13 @@ static void f2fs_write_end_io(struct bio *bio) fscrypt_finalize_bounce_page(&page); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_is_compressed_page(page)) { + f2fs_compress_write_end_io(bio, page); + continue; + } +#endif + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) @@ -358,6 +486,12 @@ submit_io: submit_bio(bio); } +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + __submit_bio(sbi, bio, type); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -380,7 +514,6 @@ static bool __has_merged_page(struct bio 
*bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; - struct page *target; struct bvec_iter_all iter_all; if (!bio) @@ -390,10 +523,18 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return true; bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *target = bvec->bv_page; - target = bvec->bv_page; - if (fscrypt_is_bounce_page(target)) + if (fscrypt_is_bounce_page(target)) { target = fscrypt_pagecache_page(target); + if (IS_ERR(target)) + continue; + } + if (f2fs_is_compressed_page(target)) { + target = f2fs_compress_control_page(target); + if (IS_ERR(target)) + continue; + } if (inode && inode == target->mapping->host) return true; @@ -588,7 +729,8 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, found = true; - if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { + if (bio_add_page(*bio, page, PAGE_SIZE, 0) == + PAGE_SIZE) { ret = 0; break; } @@ -728,7 +870,12 @@ next: verify_fio_blkaddr(fio); - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + if (fio->encrypted_page) + bio_page = fio->encrypted_page; + else if (fio->compressed_page) + bio_page = fio->compressed_page; + else + bio_page = fio->page; /* set submitted = true as a return value */ fio->submitted = true; @@ -797,17 +944,16 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; - + if (f2fs_compressed_file(inode)) + post_read_steps |= 1 << STEP_DECOMPRESS; if (f2fs_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { + /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); - if (!ctx) { - bio_put(bio); - return ERR_PTR(-ENOMEM); - } ctx->bio = bio; + ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } @@ -815,6 +961,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } +static void f2fs_release_read_bio(struct bio *bio) +{ + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr) @@ -1180,19 +1333,6 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) int err = 0; bool direct_io = iocb->ki_flags & IOCB_DIRECT; - /* convert inline data for Direct I/O*/ - if (direct_io) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - - if (direct_io && allow_outplace_dio(inode, iocb, from)) - return 0; - - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - return 0; - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -1872,6 +2012,144 @@ out: return ret; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead) +{ + struct dnode_of_data dn; + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio = *bio_ret; + unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; + sector_t last_block_in_file; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct decompress_io_ctx *dic = NULL; + int i; + int ret = 0; + + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); + + last_block_in_file = 
(i_size_read(inode) + blocksize - 1) >> blkbits; + + /* get rid of pages beyond EOF */ + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + if (!page) + continue; + if ((sector_t)page->index >= last_block_in_file) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + } else if (!PageUptodate(page)) { + continue; + } + unlock_page(page); + cc->rpages[i] = NULL; + cc->nr_rpages--; + } + + /* we are done since all pages are beyond EOF */ + if (f2fs_cluster_is_empty(cc)) + goto out; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + goto out; + + /* cluster was overwritten as normal cluster */ + if (dn.data_blkaddr != COMPRESS_ADDR) + goto out; + + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + ret = -EFAULT; + goto out_put_dnode; + } + cc->nr_cpages++; + } + + /* nothing to decompress */ + if (cc->nr_cpages == 0) { + ret = 0; + goto out_put_dnode; + } + + dic = f2fs_alloc_dic(cc); + if (IS_ERR(dic)) { + ret = PTR_ERR(dic); + goto out_put_dnode; + } + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page = dic->cpages[i]; + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1); + + if (bio && !page_is_mergeable(sbi, bio, + *last_block_in_bio, blkaddr)) { +submit_and_realloc: + __submit_bio(sbi, bio, DATA); + bio = NULL; + } + + if (!bio) { + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + is_readahead ? REQ_RAHEAD : 0, + page->index); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + bio = NULL; + dic->failed = true; + if (refcount_sub_and_test(dic->nr_cpages - i, + &dic->ref)) + f2fs_decompress_end_io(dic->rpages, + cc->cluster_size, true, + false); + f2fs_free_dic(dic); + f2fs_put_dnode(&dn); + *bio_ret = bio; + return ret; + } + } + + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + inc_page_count(sbi, F2FS_RD_DATA); + ClearPageError(page); + *last_block_in_bio = blkaddr; + } + + f2fs_put_dnode(&dn); + + *bio_ret = bio; + return 0; + +out_put_dnode: + f2fs_put_dnode(&dn); +out: + f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + *bio_ret = bio; + return ret; +} +#endif + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -1881,7 +2159,7 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. 
*/ -static int f2fs_mpage_readpages(struct address_space *mapping, +int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead) { @@ -1889,6 +2167,19 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; struct f2fs_map_blocks map; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .cpages = NULL, + .nr_rpages = 0, + .nr_cpages = 0, + }; +#endif + unsigned max_nr_pages = nr_pages; int ret = 0; map.m_pblk = 0; @@ -1912,9 +2203,41 @@ static int f2fs_mpage_readpages(struct address_space *mapping, goto next_page; } - ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio, - &last_block_in_bio, is_readahead); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* there are remained comressed pages, submit them */ + if (!f2fs_cluster_can_merge_page(&cc, page->index)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + is_readahead); + f2fs_destroy_compress_ctx(&cc); + if (ret) + goto set_error_page; + } + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) + goto read_single_page; + + ret = f2fs_init_compress_ctx(&cc); + if (ret) + goto set_error_page; + + f2fs_compress_ctx_add_page(&cc, page); + + goto next_page; + } +read_single_page: +#endif + + ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, + &bio, &last_block_in_bio, is_readahead); if (ret) { +#ifdef CONFIG_F2FS_FS_COMPRESSION +set_error_page: +#endif SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); @@ -1922,6 +2245,19 @@ static int f2fs_mpage_readpages(struct address_space *mapping, next_page: if (pages) put_page(page); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* last page */ + if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + is_readahead); + f2fs_destroy_compress_ctx(&cc); + } + } +#endif } BUG_ON(pages && !list_empty(pages)); if (bio) @@ -1936,6 +2272,11 @@ static int f2fs_read_data_page(struct file *file, struct page *page) trace_f2fs_readpage(page, DATA); + if (!f2fs_is_compress_backend_ready(inode)) { + unlock_page(page); + return -EOPNOTSUPP; + } + /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); @@ -1954,6 +2295,9 @@ static int f2fs_read_data_pages(struct file *file, trace_f2fs_readpages(inode, page, nr_pages); + if (!f2fs_is_compress_backend_ready(inode)) + return 0; + /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) return 0; @@ -1961,22 +2305,23 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); } -static int encrypt_one_page(struct f2fs_io_info *fio) +int f2fs_encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - struct page *mpage; + struct page *mpage, *page; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) return 0; + page = fio->compressed_page ? 
fio->compressed_page : fio->page; + /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page, - PAGE_SIZE, 0, - gfp_flags); + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { @@ -2136,7 +2481,7 @@ got_it: if (ipu_force || (__is_valid_data_blkaddr(fio->old_blkaddr) && need_inplace_update(fio))) { - err = encrypt_one_page(fio); + err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; @@ -2172,13 +2517,16 @@ got_it: fio->version = ni.version; - err = encrypt_one_page(fio); + err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; set_page_writeback(page); ClearPageError(page); + if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) + f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); + /* LFS mode write path */ f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); @@ -2193,16 +2541,17 @@ out: return err; } -static int __write_data_page(struct page *page, bool *submitted, +int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, - enum iostat_type io_type) + enum iostat_type io_type, + int compr_blocks) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); - const pgoff_t end_index = ((unsigned long long) i_size) + const pgoff_t end_index = ((unsigned long long)i_size) >> PAGE_SHIFT; loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; @@ -2218,6 +2567,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, .io_type = io_type, .io_wbc = wbc, @@ -2242,7 +2592,9 @@ static int __write_data_page(struct page *page, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (page->index < end_index || f2fs_verity_in_progress(inode)) + if (page->index < end_index || + f2fs_verity_in_progress(inode) || + compr_blocks) goto write; /* @@ -2318,7 +2670,6 @@ out: f2fs_remove_dirty_inode(inode); submitted = NULL; } - unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && !F2FS_I(inode)->cp_task) @@ -2331,7 +2682,7 @@ out: } if (submitted) - *submitted = fio.submitted; + *submitted = fio.submitted ? 
1 : 0; return 0; @@ -2352,7 +2703,23 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO); +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = page->mapping->host; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + goto out; + + if (f2fs_compressed_file(inode)) { + if (f2fs_is_compressed_cluster(inode, page->index)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } + } +out: +#endif + + return f2fs_write_single_data_page(page, NULL, NULL, NULL, + wbc, FS_DATA_IO, 0); } /* @@ -2365,11 +2732,27 @@ static int f2fs_write_cache_pages(struct address_space *mapping, enum iostat_type io_type) { int ret = 0; - int done = 0; + int done = 0, retry = 0; struct pagevec pvec; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = mapping->host; + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .nr_rpages = 0, + .cpages = NULL, + .rbuf = NULL, + .cbuf = NULL, + .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, + .private = NULL, + }; +#endif int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; @@ -2379,6 +2762,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; xa_mark_t tag; int nwritten = 0; + int submitted = 0; + int i; pagevec_init(&pvec); @@ -2408,12 +2793,11 @@ static int f2fs_write_cache_pages(struct address_space *mapping, else tag = PAGECACHE_TAG_DIRTY; retry: + retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; - while (!done && (index <= end)) { - int i; - + while (!done && !retry && (index <= end)) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag); if (nr_pages == 0) @@ -2421,15 +2805,62 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bool submitted = false; + bool need_readd; +readd: + need_readd = false; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + ret = f2fs_init_compress_ctx(&cc); + if (ret) { + done = 1; + break; + } + + if (!f2fs_cluster_can_merge_page(&cc, + page->index)) { + ret = f2fs_write_multi_pages(&cc, + &submitted, wbc, io_type); + if (!ret) + need_readd = true; + goto result; + } + if (unlikely(f2fs_cp_error(sbi))) + goto lock_page; + + if (f2fs_cluster_is_empty(&cc)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + + ret2 = f2fs_prepare_compress_overwrite( + inode, &pagep, + page->index, &fsdata); + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + !f2fs_compress_write_end(inode, + fsdata, page->index, + 1)) { + retry = 1; + break; + } + } else { + goto lock_page; + } + } +#endif /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } - +#ifdef CONFIG_F2FS_FS_COMPRESSION +lock_page: +#endif done_index = page->index; retry_write: lock_page(page); @@ -2456,45 +2887,71 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, &bio, - &last_block, wbc, io_type); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + get_page(page); + 
f2fs_compress_ctx_add_page(&cc, page); + continue; + } +#endif + ret = f2fs_write_single_data_page(page, &submitted, + &bio, &last_block, wbc, io_type, 0); + if (ret == AOP_WRITEPAGE_ACTIVATE) + unlock_page(page); +#ifdef CONFIG_F2FS_FS_COMPRESSION +result: +#endif + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to * get # of written pages. */ if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); ret = 0; - continue; + goto next; } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { cond_resched(); congestion_wait(BLK_RW_ASYNC, - HZ/50); + HZ/50); goto retry_write; } - continue; + goto next; } done_index = page->index + 1; done = 1; break; - } else if (submitted) { - nwritten++; } - if (--wbc->nr_to_write <= 0 && + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } +next: + if (need_readd) + goto readd; } pagevec_release(&pvec); cond_resched(); } - - if (!cycled && !done) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* flush remained pages in compress cluster */ + if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (ret) { + done = 1; + retry = 0; + } + } +#endif + if ((!cycled && !done) || retry) { cycled = 1; index = 0; end = writeback_index - 1; @@ -2518,6 +2975,8 @@ static inline bool __should_serialize_io(struct inode *inode, { if (!S_ISREG(inode->i_mode)) return false; + if (f2fs_compressed_file(inode)) + return true; if (IS_NOQUOTA(inode)) return false; /* to avoid deadlock in path of data flush */ @@ -2613,14 +3072,16 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); + if (IS_NOQUOTA(inode)) + return; + /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - if (!IS_NOQUOTA(inode)) - f2fs_truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2660,6 +3121,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, __do_map_lock(sbi, flag, true); locked = true; } + restart: /* check inline_data */ ipage = f2fs_get_node_page(sbi, inode->i_ino); @@ -2750,6 +3212,24 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (err) goto fail; } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret; + + *fsdata = NULL; + + ret = f2fs_prepare_compress_overwrite(inode, pagep, + index, fsdata); + if (ret < 0) { + err = ret; + goto fail; + } else if (ret) { + return 0; + } + } +#endif + repeat: /* * Do not use grab_cache_page_write_begin() to avoid deadlock due to @@ -2762,6 +3242,8 @@ repeat: goto fail; } + /* TODO: cluster can be compressed due to race with .writepage */ + *pagep = page; err = prepare_write_begin(sbi, page, pos, len, @@ -2845,6 +3327,16 @@ static int f2fs_write_end(struct file *file, else SetPageUptodate(page); } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* overwrite compressed file */ + if (f2fs_compressed_file(inode) && fsdata) { + f2fs_compress_write_end(inode, fsdata, page->index, copied); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return copied; + } 
+#endif + if (!copied) goto unlock_out; @@ -3145,7 +3637,8 @@ int f2fs_migrate_page(struct address_space *mapping, #ifdef CONFIG_SWAP /* Copied from generic_swapfile_activate() to check any holes */ -static int check_swap_activate(struct file *swap_file, unsigned int max) +static int check_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; @@ -3156,6 +3649,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) sector_t last_block; sector_t lowest_block = -1; sector_t highest_block = 0; + int nr_extents = 0; + int ret; blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; @@ -3167,7 +3662,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) probe_block = 0; page_no = 0; last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && page_no < max) { + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { unsigned block_in_page; sector_t first_block; @@ -3207,13 +3703,27 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) highest_block = first_block; } + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; page_no++; probe_block += blocks_per_page; reprobe: continue; } - return 0; - + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +out: + return ret; bad_bmap: pr_err("swapon: swapfile has holes\n"); return -EINVAL; @@ -3235,14 +3745,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) return ret; - ret = check_swap_activate(file, sis->max); - if (ret) + if (f2fs_disable_compressed_file(inode)) + return -EINVAL; + + ret = check_swap_activate(sis, file, span); + if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; + return ret; } static void f2fs_swap_deactivate(struct file *file) @@ -3319,6 +3832,27 @@ void f2fs_destroy_post_read_processing(void) kmem_cache_destroy(bio_post_read_ctx_cache); } +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (!f2fs_sb_has_encrypt(sbi) && + !f2fs_sb_has_verity(sbi) && + !f2fs_sb_has_compression(sbi)) + return 0; + + sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!sbi->post_read_wq) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (sbi->post_read_wq) + destroy_workqueue(sbi->post_read_wq); +} + int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", @@ -3328,7 +3862,7 @@ int __init f2fs_init_bio_entry_cache(void) return 0; } -void __exit f2fs_destroy_bio_entry_cache(void) +void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 9b0bedd82581..6b89eae5e4ca 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,9 +21,45 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); +#ifdef CONFIG_DEBUG_FS +static struct dentry *f2fs_debugfs_root; +#endif + +/* 
+ * This function calculates BDF of every segments + */ +void f2fs_update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = BLKS_PER_SEC(sbi); + hblks_per_sec = blks_per_sec / 2; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + vblocks = get_valid_blocks(sbi, segno, true); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div64_u64(bimodal, dist); + if (si->dirty_count) + si->avg_vblocks = div_u64(total_vblocks, ndirty); + else + si->avg_vblocks = 0; +} +#ifdef CONFIG_DEBUG_FS static void update_general_status(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); @@ -56,7 +92,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->aw_cnt = sbi->atomic_files; si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); @@ -94,6 +130,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->compr_inode = atomic_read(&sbi->compr_inode); + si->compr_blocks = atomic_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; @@ -114,7 +152,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; - si->bg_gc = sbi->bg_gc; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; @@ -146,39 +183,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) } /* - * This function calculates BDF of every segments - */ -static void update_sit_info(struct f2fs_sb_info *sbi) -{ - struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; - unsigned long long bimodal, dist; - unsigned int segno, vblocks; - int ndirty = 0; - - bimodal = 0; - total_vblocks = 0; - blks_per_sec = BLKS_PER_SEC(sbi); - hblks_per_sec = blks_per_sec / 2; - for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, true); - dist = abs(vblocks - hblks_per_sec); - bimodal += dist * dist; - - if (vblocks > 0 && vblocks < blks_per_sec) { - total_vblocks += vblocks; - ndirty++; - } - } - dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); - si->bimodal = div64_u64(bimodal, dist); - if (si->dirty_count) - si->avg_vblocks = div_u64(total_vblocks, ndirty); - else - si->avg_vblocks = 0; -} - -/* * This function calculates memory footprint. 
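f2fs_update_sit_info() above sums the squared distance of each section's valid-block count from half a section, then normalises against the worst case so the BDF figure lands on a 0..100 scale. A toy user-space version of the same arithmetic, run over a handful of made-up sections, shows the shape of the calculation; the numbers are invented for illustration and the real code walks the SIT instead.

#include <stdio.h>

int main(void)
{
	/* valid blocks per section (made-up sample), 512 blocks per section */
	unsigned long long vblocks[] = { 0, 512, 40, 470, 256, 300 };
	unsigned long long blks_per_sec = 512;
	unsigned long long hblks_per_sec = blks_per_sec / 2;
	unsigned long long nsecs = sizeof(vblocks) / sizeof(vblocks[0]);
	unsigned long long bimodal = 0, total_vblocks = 0, dist;
	unsigned int i, ndirty = 0;

	for (i = 0; i < nsecs; i++) {
		dist = vblocks[i] > hblks_per_sec ?
			vblocks[i] - hblks_per_sec : hblks_per_sec - vblocks[i];
		bimodal += dist * dist;
		if (vblocks[i] > 0 && vblocks[i] < blks_per_sec) {
			total_vblocks += vblocks[i];
			ndirty++;
		}
	}
	/* normalise to 0..100: 100 means fully bimodal (all empty or full) */
	dist = nsecs * hblks_per_sec * hblks_per_sec / 100;
	printf("BDF: %llu, avg. vblocks in dirty sections: %llu\n",
	       bimodal / dist, ndirty ? total_vblocks / ndirty : 0);
	return 0;
}

With the sample above this prints a BDF of 57, i.e. the sections sit roughly halfway between an even fill and a fully bimodal (empty-or-full) layout.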
*/ static void update_mem_info(struct f2fs_sb_info *sbi) @@ -315,6 +319,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n", + si->compr_inode, si->compr_blocks); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -441,7 +447,7 @@ static int stat_show(struct seq_file *s, void *v) si->block_count[LFS], si->segment_count[LFS]); /* segment usage info */ - update_sit_info(si->sbi); + f2fs_update_sit_info(si->sbi); seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", si->bimodal, si->avg_vblocks); @@ -461,6 +467,7 @@ static int stat_show(struct seq_file *s, void *v) } DEFINE_SHOW_ATTRIBUTE(stat); +#endif int f2fs_build_stats(struct f2fs_sb_info *sbi) { @@ -491,11 +498,12 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->compr_inode, 0); + atomic_set(&sbi->compr_blocks, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->aw_cnt, 0); atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); atomic_set(&sbi->max_vw_cnt, 0); @@ -520,14 +528,18 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) void __init f2fs_create_root_stats(void) { +#ifdef CONFIG_DEBUG_FS f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); +#endif } void f2fs_destroy_root_stats(void) { +#ifdef CONFIG_DEBUG_FS debugfs_remove_recursive(f2fs_debugfs_root); f2fs_debugfs_root = NULL; +#endif } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c967cacf979e..27d0dd7a16d6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -578,6 +578,20 @@ next: goto next; } +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + struct fscrypt_name *fname) +{ + struct f2fs_dentry_ptr d; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(fname_len(fname)); + + make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage)); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + + return bit_pos < d.max; +} + void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos) @@ -987,7 +1001,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (IS_ENCRYPTED(inode)) { err = fscrypt_get_encryption_info(inode); - if (err && err != -ENOKEY) + if (err) goto out; err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); @@ -1069,24 +1083,27 @@ static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); - if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { + if (!inode || !IS_CASEFOLDED(inode)) { if (len != name->len) return -1; - return memcmp(str, name, len); + return memcmp(str, name->name, len); } - return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); + return f2fs_ci_compare(inode, name, &qstr, false); } static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; + 
const struct inode *inode = READ_ONCE(dentry->d_inode); unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode)) + if (!inode || !IS_CASEFOLDED(inode)) return 0; norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5a888a063c7f..5355be6b6755 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -116,6 +116,8 @@ typedef u32 block_t; /* */ typedef u32 nid_t; +#define COMPRESS_EXT_NUM 16 + struct f2fs_mount_info { unsigned int opt; int write_io_size_bits; /* Write IO size bits */ @@ -140,6 +142,12 @@ struct f2fs_mount_info { block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ + + /* For compression */ + unsigned char compress_algorithm; /* algorithm type */ + unsigned compress_log_size; /* cluster log size */ + unsigned char compress_ext_cnt; /* extension count */ + unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -155,6 +163,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_VERITY 0x0400 #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_FEATURE_CASEFOLD 0x1000 +#define F2FS_FEATURE_COMPRESSION 0x2000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -712,6 +721,12 @@ struct f2fs_inode_info { int i_inline_xattr_size; /* inline xattr size */ struct timespec64 i_crtime; /* inode creation time */ struct timespec64 i_disk_time[4];/* inode disk times */ + + /* for file compress */ + u64 i_compr_blocks; /* # of compressed blocks */ + unsigned char i_compress_algorithm; /* algorithm type */ + unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned int i_cluster_size; /* cluster size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1018,6 +1033,7 @@ enum need_lock_type { enum cp_reason_type { CP_NO_NEEDED, CP_NON_REGULAR, + CP_COMPRESSED, CP_HARDLINK, CP_SB_NEED_CP, CP_WRONG_PINO, @@ -1056,12 +1072,15 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ bool is_por; /* indicate IO is from recovery or not */ bool retry; /* need to reallocate block address */ + int compr_blocks; /* # of compressed block addresses */ + bool encrypted; /* indicate file is encrypted */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ @@ -1169,6 +1188,18 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; +/* + * this value is set in page as a private data which indicate that + * the page is atomically written, and it is in inmem_pages list. 
+ */ +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) + +#define IS_ATOMIC_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) + #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) @@ -1176,6 +1207,75 @@ enum fsync_mode { #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif +/* For compression */ +enum compress_algorithm_type { + COMPRESS_LZO, + COMPRESS_LZ4, + COMPRESS_MAX, +}; + +#define COMPRESS_DATA_RESERVED_SIZE 4 +struct compress_data { + __le32 clen; /* compressed data size */ + __le32 chksum; /* checksum of compressed data */ + __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ + u8 cdata[]; /* compressed data */ +}; + +#define COMPRESS_HEADER_SIZE (sizeof(struct compress_data)) + +#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 + +/* compress context */ +struct compress_ctx { + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + void *private; /* payload buffer for specified compression algorithm */ +}; + +/* compress context for write IO path */ +struct compress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + refcount_t ref; /* referrence count of raw page */ +}; + +/* decompress io context for read IO path */ +struct decompress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + struct page **tpages; /* temp pages to pad holes in cluster */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + refcount_t ref; /* referrence count of compressed page */ + bool failed; /* indicate IO error during decompression */ +}; + +#define NULL_CLUSTER ((unsigned int)(~0)) +#define MIN_COMPRESS_LOG_SIZE 2 +#define MAX_COMPRESS_LOG_SIZE 8 + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1291,7 +1391,10 @@ struct f2fs_sb_info { struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning 
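The structures just introduced fix the geometry of a compressed cluster: struct compress_data is a small fixed header (clen, chksum, reserved words) followed by the compressed payload, and the cluster spans 1 << log_cluster_size pages with the log bounded by MIN/MAX_COMPRESS_LOG_SIZE. The sketch below works out, for each permitted log size, how large the compressed payload may be if the cluster is to occupy at least one block fewer than its raw form; the 4 KiB page size and the "must save at least one block" rule are assumptions for illustration, not taken from this patch.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define COMPRESS_DATA_RESERVED_SIZE 4

struct compress_data {
	uint32_t clen;                                  /* compressed size  */
	uint32_t chksum;                                /* payload checksum */
	uint32_t reserved[COMPRESS_DATA_RESERVED_SIZE];
	/* uint8_t cdata[]; follows on disk */
};

int main(void)
{
	unsigned int log_cluster_size, cluster_pages, header;

	header = sizeof(struct compress_data);
	for (log_cluster_size = 2; log_cluster_size <= 8; log_cluster_size++) {
		cluster_pages = 1u << log_cluster_size;
		printf("log=%u: %3u pages (%4u KiB raw), payload limit to save"
		       " one block: %u bytes\n",
		       log_cluster_size, cluster_pages,
		       cluster_pages * PAGE_SIZE / 1024,
		       (cluster_pages - 1) * PAGE_SIZE - header);
	}
	return 0;
}

The per-mount defaults feeding this geometry are the compress_algorithm, compress_log_size and extensions fields added to f2fs_mount_info earlier in this hunk.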
operations */ - struct mutex gc_mutex; /* mutex for GC */ + struct rw_semaphore gc_lock; /* + * semaphore for GC, avoid + * race between GC and GC or CP + */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ @@ -1327,11 +1430,11 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ - atomic_t aw_cnt; /* # of atomic writes */ + atomic_t compr_inode; /* # of compressed inodes */ + atomic_t compr_blocks; /* # of compressed blocks */ atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ - int bg_gc; /* background gc calls */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -1365,6 +1468,8 @@ struct f2fs_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; + + struct workqueue_struct *post_read_wq; /* post read workqueue */ }; struct f2fs_private_dio { @@ -2222,26 +2327,6 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, - int npages, bool no_fail) -{ - struct bio *bio; - - if (no_fail) { - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; - } - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return bio_alloc(GFP_KERNEL, npages); -} - static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { if (sbi->gc_mode == GC_URGENT) @@ -2378,11 +2463,13 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* * On-disk inode flags (f2fs_inode::i_flags) */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ #define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ #define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ #define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ #define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +#define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */ #define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ @@ -2391,7 +2478,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL) + F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ @@ -2443,6 +2530,8 @@ enum { FI_PIN_FILE, /* indicate file should not be gced */ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_MMAP_FILE, /* indicate file was mmapped */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2459,6 +2548,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: + case FI_COMPRESSED_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2614,16 +2704,27 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_XATTR); } +static inline int f2fs_compressed_file(struct inode *inode) +{ + return S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_COMPRESSED_FILE); +} + static inline unsigned int addrs_per_inode(struct inode *inode) { unsigned int addrs = CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode); - return ALIGN_DOWN(addrs, 1); + + if (!f2fs_compressed_file(inode)) + return addrs; + return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size); } static inline unsigned int addrs_per_block(struct inode *inode) { - return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, 1); + if (!f2fs_compressed_file(inode)) + return DEF_ADDRS_PER_BLOCK; + return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, F2FS_I(inode)->i_cluster_size); } static inline void *inline_xattr_addr(struct inode *inode, struct page *page) @@ -2656,6 +2757,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DOTS); } +static inline int f2fs_is_mmap_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_MMAP_FILE); +} + static inline bool f2fs_is_pinned_file(struct inode *inode) { return is_inode_flag_set(inode, FI_PIN_FILE); @@ -2783,7 +2889,8 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT) || + is_inode_flag_set(inode, FI_COMPRESSED_FILE)) return false; /* @@ -2903,7 +3010,8 @@ static inline void verify_blkaddr(struct f2fs_sb_info *sbi, static inline bool __is_valid_data_blkaddr(block_t blkaddr) { - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR || + blkaddr == COMPRESS_ADDR) return false; return true; } @@ -3001,6 +3109,8 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + struct fscrypt_name *fname); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); @@ -3155,6 +3265,8 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init 
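The ALIGN_DOWN changes to addrs_per_inode() and addrs_per_block() shown a little earlier make sure a compressed cluster never straddles two node blocks: any trailing block-address slots that cannot hold a whole cluster are simply left unused. A small sketch of what that rounding costs per direct node block follows; the 1018-slot figure is the usual f2fs value but is an assumption here, as the constant itself is not part of this hunk.

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) / (a) * (a))

int main(void)
{
	unsigned int def_addrs_per_block = 1018;   /* assumed slot count */
	unsigned int cluster_sizes[] = { 4, 16, 64, 256 };
	unsigned int i;

	for (i = 0; i < sizeof(cluster_sizes) / sizeof(cluster_sizes[0]); i++)
		printf("cluster of %3u pages: %u of %u slots usable per node block\n",
		       cluster_sizes[i],
		       ALIGN_DOWN(def_addrs_per_block, cluster_sizes[i]),
		       def_addrs_per_block);
	return 0;
}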
f2fs_create_segment_manager_caches(void); @@ -3205,10 +3317,13 @@ void f2fs_destroy_checkpoint_caches(void); /* * data.c */ -int f2fs_init_post_read_processing(void); -void f2fs_destroy_post_read_processing(void); +int __init f2fs_init_bioset(void); +void f2fs_destroy_bioset(void); +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3229,6 +3344,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); @@ -3242,8 +3360,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +int f2fs_encrypt_one_page(struct f2fs_io_info *fio); bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +int f2fs_write_single_data_page(struct page *page, int *submitted, + struct bio **bio, sector_t *last_block, + struct writeback_control *wbc, + enum iostat_type io_type, + int compr_blocks); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3253,6 +3377,10 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct page *page); +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); /* * gc.c @@ -3299,6 +3427,7 @@ struct f2fs_stat_info { int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int compr_inode, compr_blocks; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -3330,7 +3459,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_cp_count(si) ((si)->cp_count++) #define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) -#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_bggc_count(si) ((si)->bg_gc++) #define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) @@ -3369,6 +3498,20 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (f2fs_has_inline_dentry(inode)) \ 
(atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) +#define stat_inc_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_dec_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_add_compr_blocks(inode, blocks) \ + (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_sub_compr_blocks(inode, blocks) \ + (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3386,13 +3529,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) -#define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) -#define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int cur = F2FS_I_SB(inode)->atomic_files; \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -3444,6 +3583,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); +void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else #define stat_inc_cp_count(si) do { } while (0) #define stat_inc_bg_cp_count(si) do { } while (0) @@ -3453,8 +3593,8 @@ void f2fs_destroy_root_stats(void); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sb) do { } while (0) -#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_total_hit(sbi) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) #define stat_inc_cached_node_hit(sbi) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) @@ -3463,6 +3603,10 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) do { } while (0) #define stat_inc_inline_dir(inode) do { } while (0) #define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_compr_inode(inode) do { } while (0) +#define stat_dec_compr_inode(inode) do { } while (0) +#define stat_add_compr_blocks(inode, blocks) do { } while (0) +#define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_inc_atomic_write(inode) do { } while (0) #define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) @@ -3482,6 +3626,7 @@ static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } +static inline void update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; @@ -3510,6 +3655,7 @@ void f2fs_truncate_inline_inode(struct inode *inode, int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct 
inode *inode); +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct page *page); bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, @@ -3602,7 +3748,85 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) */ static inline bool f2fs_post_read_required(struct inode *inode) { - return f2fs_encrypted_file(inode) || fsverity_active(inode); + return f2fs_encrypted_file(inode) || fsverity_active(inode) || + f2fs_compressed_file(inode); +} + +/* + * compress.c + */ +#ifdef CONFIG_F2FS_FS_COMPRESSION +bool f2fs_is_compressed_page(struct page *page); +struct page *f2fs_compress_control_page(struct page *page); +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata); +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied); +void f2fs_compress_write_end_io(struct bio *bio, struct page *page); +bool f2fs_is_compress_backend_ready(struct inode *inode); +void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +bool f2fs_cluster_is_empty(struct compress_ctx *cc); +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type); +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead); +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); +void f2fs_free_dic(struct decompress_io_ctx *dic); +void f2fs_decompress_end_io(struct page **rpages, + unsigned int cluster_size, bool err, bool verity); +int f2fs_init_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +#else +static inline bool f2fs_is_compressed_page(struct page *page) { return false; } +static inline bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + /* not support compression */ + return false; +} +static inline struct page *f2fs_compress_control_page(struct page *page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} +#endif + +static inline void set_compress_context(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + F2FS_I(inode)->i_compress_algorithm = + F2FS_OPTION(sbi).compress_algorithm; + F2FS_I(inode)->i_log_cluster_size = + F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_cluster_size = + 1 << F2FS_I(inode)->i_log_cluster_size; + F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; + set_inode_flag(inode, FI_COMPRESSED_FILE); + stat_inc_compr_inode(inode); +} + +static inline u64 f2fs_disable_compressed_file(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (!f2fs_compressed_file(inode)) + return 0; + if (fi->i_compr_blocks) + return fi->i_compr_blocks; + + fi->i_flags &= ~F2FS_COMPR_FL; + clear_inode_flag(inode, FI_COMPRESSED_FILE); + stat_dec_compr_inode(inode); + return 0; } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3623,6 +3847,7 @@ F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, 
SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); +F2FS_FEATURE_FUNCS(compression, COMPRESSION); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, @@ -3704,6 +3929,30 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } +static inline bool f2fs_may_compress(struct inode *inode) +{ + if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || + f2fs_is_atomic_file(inode) || + f2fs_is_volatile_file(inode)) + return false; + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + +static inline void f2fs_i_compr_blocks_update(struct inode *inode, + u64 blocks, bool add) +{ + int diff = F2FS_I(inode)->i_cluster_size - blocks; + + if (add) { + F2FS_I(inode)->i_compr_blocks += diff; + stat_add_compr_blocks(inode, diff); + } else { + F2FS_I(inode)->i_compr_blocks -= diff; + stat_sub_compr_blocks(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); +} + static inline int block_unaligned_IO(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { @@ -3735,6 +3984,8 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return true; if (f2fs_is_multi_device(sbi)) return true; + if (f2fs_compressed_file(inode)) + return true; /* * for blkzoned device, fallback direct IO to buffered IO, so * all IOs can be serialized by log-structured write. diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 13aef5f28fa8..0d4da644df3b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -50,8 +50,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn = { .node_changed = false }; - int err; + struct dnode_of_data dn; + bool need_alloc = true; + int err = 0; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -63,6 +64,26 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret = f2fs_is_compressed_cluster(inode, page->index); + + if (ret < 0) { + err = ret; + goto err; + } else if (ret) { + if (ret < F2FS_I(inode)->i_cluster_size) { + err = -EAGAIN; + goto err; + } + need_alloc = false; + } + } +#endif + /* should do out of any locked page */ + if (need_alloc) + f2fs_balance_fs(sbi, true); + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -78,15 +99,17 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto out_sem; } - /* block allocation */ - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_block(&dn, page->index); - f2fs_put_dnode(&dn); - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - if (err) { - unlock_page(page); - goto out_sem; + if (need_alloc) { + /* block allocation */ + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_block(&dn, page->index); + f2fs_put_dnode(&dn); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + if (err) { + unlock_page(page); + goto out_sem; + } } /* fill the page */ @@ -120,8 +143,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); - f2fs_balance_fs(sbi, dn.node_changed); - sb_end_pagefault(inode->i_sb); err: return block_page_mkwrite_return(err); @@ -155,6 +176,8 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode)) cp_reason = 
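The f2fs_i_compr_blocks_update() helper added above moves the per-inode counter by (cluster_size - blocks), which reads as tracking how many blocks compression saved for each cluster that is written or torn down. A short user-space illustration of that bookkeeping, with invented cluster sizes, is below; it is not the kernel helper itself.

#include <stdio.h>

static long long i_compr_blocks;

static void compr_blocks_update(unsigned int cluster_size,
				unsigned int blocks, int add)
{
	int diff = (int)cluster_size - (int)blocks;

	if (add)
		i_compr_blocks += diff;
	else
		i_compr_blocks -= diff;
}

int main(void)
{
	compr_blocks_update(16, 5, 1);  /* 16-page cluster stored in 5 blocks */
	compr_blocks_update(4, 4, 1);   /* incompressible cluster: no change  */
	printf("saved after writes:   %lld blocks\n", i_compr_blocks);
	compr_blocks_update(16, 5, 0);  /* first cluster truncated away       */
	printf("saved after truncate: %lld blocks\n", i_compr_blocks);
	return 0;
}

This prints 11 saved blocks after the writes and 0 after the truncation, matching the add/subtract pairing the helper is used for.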
CP_NON_REGULAR; + else if (f2fs_compressed_file(inode)) + cp_reason = CP_COMPRESSED; else if (inode->i_nlink != 1) cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) @@ -485,6 +508,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -492,6 +518,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; + set_inode_flag(inode, FI_MMAP_FILE); return 0; } @@ -502,6 +529,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + err = fsverity_file_open(inode, filp); if (err) return err; @@ -518,6 +548,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; int base = 0; + bool compressed_cluster = false; + int cluster_index = 0, valid_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -525,26 +558,43 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - for (; count > 0; count--, addr++, dn->ofs_in_node++) { + /* Assumption: truncateion starts with cluster */ + for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); + if (f2fs_compressed_file(dn->inode) && + !(cluster_index & (cluster_size - 1))) { + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, + valid_blocks, false); + compressed_cluster = (blkaddr == COMPRESS_ADDR); + valid_blocks = 0; + } + if (blkaddr == NULL_ADDR) continue; dn->data_blkaddr = NULL_ADDR; f2fs_set_data_blkaddr(dn); - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, + if (__is_valid_data_blkaddr(blkaddr)) { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) - continue; + continue; + if (compressed_cluster) + valid_blocks++; + } - f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_invalidate_blocks(sbi, blkaddr); nr_free++; } + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, valid_blocks, false); + if (nr_free) { pgoff_t fofs; /* @@ -587,6 +637,9 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } + if (f2fs_compressed_file(inode)) + return 0; + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); @@ -602,7 +655,7 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +static int do_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -667,6 +720,28 @@ free_partial: return err; } +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +{ + u64 free_from = from; + + /* + * for compressed file, only support cluster size + * aligned truncation. 
+ */ + if (f2fs_compressed_file(inode)) { + size_t cluster_shift = PAGE_SHIFT + + F2FS_I(inode)->i_log_cluster_size; + size_t cluster_mask = (1 << cluster_shift) - 1; + + free_from = from >> cluster_shift; + if (from & cluster_mask) + free_from++; + free_from <<= cluster_shift; + } + + return do_truncate_blocks(inode, free_from, lock); +} + int f2fs_truncate(struct inode *inode) { int err; @@ -780,6 +855,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if ((attr->ia_valid & ATTR_SIZE) && + !f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + err = setattr_prepare(dentry, attr); if (err) return err; @@ -1020,8 +1099,8 @@ next_dnode: } else if (ret == -ENOENT) { if (dn.max_level == 0) return -ENOENT; - done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - dn.ofs_in_node, - len); + done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - + dn.ofs_in_node, len); blkaddr += done; do_replace += done; goto next; @@ -1184,13 +1263,13 @@ static int __exchange_data_block(struct inode *src_inode, src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), array_size(olen, sizeof(block_t)), - GFP_KERNEL); + GFP_NOFS); if (!src_blkaddr) return -ENOMEM; do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), array_size(olen, sizeof(int)), - GFP_KERNEL); + GFP_NOFS); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1557,7 +1636,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; @@ -1615,6 +1694,8 @@ static long f2fs_fallocate(struct file *file, int mode, return -EIO; if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) return -ENOSPC; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) @@ -1624,6 +1705,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; + if (f2fs_compressed_file(inode) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) @@ -1713,7 +1799,40 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return -ENOTEMPTY; } + if (iflags & (F2FS_COMPR_FL | F2FS_NOCOMP_FL)) { + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if ((iflags & F2FS_COMPR_FL) && (iflags & F2FS_NOCOMP_FL)) + return -EINVAL; + } + + if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) { + if (S_ISREG(inode->i_mode) && + (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) || + F2FS_HAS_BLOCKS(inode))) + return -EINVAL; + if (iflags & F2FS_NOCOMP_FL) + return -EINVAL; + if (iflags & F2FS_COMPR_FL) { + int err = f2fs_convert_inline_inode(inode); + + if (err) + return err; + + if (!f2fs_may_compress(inode)) + return -EINVAL; + + set_compress_context(inode); + } + } + if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) { + if (fi->i_flags & F2FS_COMPR_FL) + return -EINVAL; + } + fi->i_flags = iflags | (fi->i_flags & ~mask); + f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) && + (fi->i_flags & F2FS_NOCOMP_FL)); if (fi->i_flags & 
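The f2fs_truncate_blocks() wrapper earlier in this hunk only frees compressed files on cluster boundaries, so the byte offset is rounded up to the next cluster before do_truncate_blocks() runs. The sketch below redoes that shift-and-mask rounding in user space, assuming 4 KiB pages; the helper name is invented for the example.

#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KiB pages assumed */

static unsigned long long cluster_align_up(unsigned long long from,
					   unsigned int log_cluster_size)
{
	unsigned int cluster_shift = PAGE_SHIFT + log_cluster_size;
	unsigned long long cluster_mask = (1ULL << cluster_shift) - 1;
	unsigned long long free_from = from >> cluster_shift;

	if (from & cluster_mask)
		free_from++;
	return free_from << cluster_shift;
}

int main(void)
{
	/* log_cluster_size = 2 -> 4-page (16 KiB) clusters */
	printf("truncate to %llu starts freeing at %llu\n",
	       5000ULL, cluster_align_up(5000, 2));
	printf("truncate to %llu starts freeing at %llu\n",
	       32768ULL, cluster_align_up(32768, 2));
	return 0;
}

Offset 5000 is pushed out to 16384, while an already aligned 32768 is left alone, which is exactly the behaviour the comment about cluster-size aligned truncation describes.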
F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); @@ -1739,11 +1858,13 @@ static const struct { u32 iflag; u32 fsflag; } f2fs_fsflags_map[] = { + { F2FS_COMPR_FL, FS_COMPR_FL }, { F2FS_SYNC_FL, FS_SYNC_FL }, { F2FS_IMMUTABLE_FL, FS_IMMUTABLE_FL }, { F2FS_APPEND_FL, FS_APPEND_FL }, { F2FS_NODUMP_FL, FS_NODUMP_FL }, { F2FS_NOATIME_FL, FS_NOATIME_FL }, + { F2FS_NOCOMP_FL, FS_NOCOMP_FL }, { F2FS_INDEX_FL, FS_INDEX_FL }, { F2FS_DIRSYNC_FL, FS_DIRSYNC_FL }, { F2FS_PROJINHERIT_FL, FS_PROJINHERIT_FL }, @@ -1751,11 +1872,13 @@ static const struct { }; #define F2FS_GETTABLE_FS_FL ( \ + FS_COMPR_FL | \ FS_SYNC_FL | \ FS_IMMUTABLE_FL | \ FS_APPEND_FL | \ FS_NODUMP_FL | \ FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ FS_INDEX_FL | \ FS_DIRSYNC_FL | \ FS_PROJINHERIT_FL | \ @@ -1766,11 +1889,13 @@ static const struct { FS_CASEFOLD_FL) #define F2FS_SETTABLE_FS_FL ( \ + FS_COMPR_FL | \ FS_SYNC_FL | \ FS_IMMUTABLE_FL | \ FS_APPEND_FL | \ FS_NODUMP_FL | \ FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ FS_DIRSYNC_FL | \ FS_PROJINHERIT_FL | \ FS_CASEFOLD_FL) @@ -1891,6 +2016,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); + f2fs_disable_compressed_file(inode); + if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1929,7 +2056,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; - stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2318,12 +2444,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); @@ -2361,12 +2487,12 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) do_more: if (!range.sync) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); @@ -2797,7 +2923,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } @@ -3092,10 +3218,16 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) ret = -EAGAIN; goto out; } + ret = f2fs_convert_inline_inode(inode); if (ret) goto out; + if (f2fs_disable_compressed_file(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + set_inode_flag(inode, FI_PIN_FILE); ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; done: @@ -3344,6 +3476,17 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + return generic_file_read_iter(iocb, iter); +} + static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -3355,6 +3498,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } + if 
(!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) { ret = -EAGAIN; @@ -3383,18 +3529,41 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = -EAGAIN; goto out; } - } else { - preallocated = true; - target_size = iocb->ki_pos + iov_iter_count(from); + goto write; + } - err = f2fs_preallocate_blocks(iocb, from); - if (err) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; - } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + goto write; + + if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Convert inline data for Direct I/O before entering + * f2fs_direct_IO(). + */ + err = f2fs_convert_inline_inode(inode); + if (err) + goto out_err; + /* + * If force_buffere_io() is true, we have to allocate + * blocks all the time, since f2fs_direct_IO will fall + * back to buffered IO. + */ + if (!f2fs_force_buffered_io(inode, iocb, from) && + allow_outplace_dio(inode, iocb, from)) + goto write; + } + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + + err = f2fs_preallocate_blocks(iocb, from); + if (err) { +out_err: + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + ret = err; + goto out; } +write: ret = __generic_file_write_iter(iocb, from); clear_inode_flag(inode, FI_NO_PREALLOC); @@ -3469,7 +3638,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, - .read_iter = generic_file_read_iter, + .read_iter = f2fs_file_read_iter, .write_iter = f2fs_file_write_iter, .open = f2fs_file_open, .release = f2fs_release_file, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b3d399623290..db8725d473b5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,18 +78,18 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT) { wait_ms = gc_th->urgent_sleep_time; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); goto do_gc; } - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); stat_io_skip_bggc_count(sbi); goto next; } @@ -99,7 +99,7 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi); + stat_inc_bggc_count(sbi->stat_info); /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) @@ -1049,8 +1049,10 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) + if (IS_ERR(inode) || is_bad_inode(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; + } if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { @@ -1368,7 +1370,7 @@ stop: reserved_segments(sbi), prefree_segments(sbi)); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); put_gc_inode(&gc_list); @@ -1407,9 +1409,9 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); do_garbage_collect(sbi, segno, &gc_list, FG_GC); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); put_gc_inode(&gc_list); if (get_valid_blocks(sbi, segno, true)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 896db0416f0e..4167e5408151 
100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -368,7 +368,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_dentry_ptr src, dst; int err; - page = f2fs_grab_cache_page(dir->i_mapping, 0, false); + page = f2fs_grab_cache_page(dir->i_mapping, 0, true); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -530,7 +530,7 @@ recover: return err; } -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int do_convert_inline_dir(struct inode *dir, struct page *ipage, void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) @@ -539,6 +539,44 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); } +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + struct fscrypt_name fname; + void *inline_dentry = NULL; + int err = 0; + + if (!f2fs_has_inline_dentry(dir)) + return 0; + + f2fs_lock_op(sbi); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto out; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + if (f2fs_has_enough_room(dir, ipage, &fname)) { + f2fs_put_page(ipage, 1); + goto out; + } + + inline_dentry = inline_data_addr(dir, ipage); + + err = do_convert_inline_dir(dir, ipage, inline_dentry); + if (!err) + f2fs_put_page(ipage, 1); +out: + f2fs_unlock_op(sbi); + return err; +} + int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) @@ -562,7 +600,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { - err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); + err = do_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 502bd491336a..78c3f1d70f1d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -200,6 +200,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri = F2FS_INODE(node_page); unsigned long long iblocks; iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); @@ -286,6 +287,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + fi->i_flags & F2FS_COMPR_FL && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + if (ri->i_compress_algorithm >= COMPRESS_MAX) + return false; + if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks) + return false; + if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) + return false; + } + return true; } @@ -407,6 +421,18 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + (fi->i_flags & F2FS_COMPR_FL)) { + if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks); + fi->i_compress_algorithm = ri->i_compress_algorithm; + fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_cluster_size = 1 << fi->i_log_cluster_size; 
+ set_inode_flag(inode, FI_COMPRESSED_FILE); + } + } + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; @@ -416,6 +442,8 @@ static int do_read_inode(struct inode *inode) stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + stat_inc_compr_inode(inode); + stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); return 0; } @@ -569,6 +597,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_crtime_nsec = cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); } + + if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_log_cluster_size)) { + ri->i_compr_blocks = + cpu_to_le64(F2FS_I(inode)->i_compr_blocks); + ri->i_compress_algorithm = + F2FS_I(inode)->i_compress_algorithm; + ri->i_log_cluster_size = + F2FS_I(inode)->i_log_cluster_size; + } } __set_inode_rdev(inode, ri); @@ -711,6 +750,8 @@ no_delete: stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + stat_dec_compr_inode(inode); + stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a1c507b0b4ac..2aa035422c0f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -119,6 +119,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); + if (f2fs_sb_has_compression(sbi)) { + /* Inherit the compression flag in directory */ + if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && + f2fs_may_compress(inode)) + set_compress_context(inode); + } + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -149,6 +156,9 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) size_t sublen = strlen(sub); int i; + if (sublen == 1 && *sub == '*') + return 1; + /* * filename format of multimedia file should be defined as: * "filename + '.' + extension + (optional: '.' + temp extension)". 
@@ -262,6 +272,45 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return 0; } +static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi) || + is_inode_flag_set(inode, FI_COMPRESSED_FILE) || + F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || + !f2fs_may_compress(inode)) + return; + + down_read(&sbi->sb_lock); + + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + + for (i = cold_count; i < cold_count + hot_count; i++) { + if (is_extension_exist(name, extlist[i])) { + up_read(&sbi->sb_lock); + return; + } + } + + up_read(&sbi->sb_lock); + + ext = F2FS_OPTION(sbi).extensions; + + for (i = 0; i < ext_cnt; i++) { + if (!is_extension_exist(name, ext[i])) + continue; + + set_compress_context(inode); + return; + } +} + static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -286,6 +335,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); + set_compress_inode(sbi, inode, dentry->d_name.name); + inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -797,6 +848,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + inode->i_state |= I_LINKABLE; *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -849,12 +901,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct inode *whiteout = NULL; - struct page *old_dir_page; + struct page *old_dir_page = NULL; struct page *old_page, *new_page = NULL; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err; if (unlikely(f2fs_cp_error(sbi))) @@ -867,6 +918,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, F2FS_I(old_dentry->d_inode)->i_projid))) return -EXDEV; + /* + * If new_inode is null, the below renaming flow will + * add a link in old_dir which can conver inline_dir. + * After then, if we failed to get the entry due to other + * reasons like ENOMEM, we had to remove the new entry. + * Instead of adding such the error handling routine, let's + * simply convert first here. 
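One point worth calling out from the set_compress_inode() hunk above: the hot-extension list is consulted first, so a name that matches both lists stays uncompressed. A condensed sketch of that decision order, with booleans standing in for the individual checks:

    #include <stdbool.h>

    /* Decision order condensed from set_compress_inode(); inputs are stand-ins. */
    static bool should_set_compress_context(bool feature_on, bool already_compressed,
                                            bool nocompress_flag, bool hot_ext_match,
                                            bool compress_ext_match)
    {
            if (!feature_on || already_compressed || nocompress_flag)
                    return false;
            if (hot_ext_match)               /* hot files are never auto-compressed */
                    return false;
            return compress_ext_match;       /* otherwise follow the compress list */
    }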
+ */ + if (old_dir == new_dir && !new_inode) { + err = f2fs_try_convert_inline_dir(old_dir, new_dentry); + if (err) + return err; + } + + if (flags & RENAME_WHITEOUT) { + err = f2fs_create_whiteout(old_dir, &whiteout); + if (err) + return err; + } + err = dquot_initialize(old_dir); if (err) goto out; @@ -898,17 +969,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(old_dir, &whiteout); - if (err) - goto out_dir; - } - if (new_inode) { err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) - goto out_whiteout; + goto out_dir; err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, @@ -916,7 +981,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) { if (IS_ERR(new_page)) err = PTR_ERR(new_page); - goto out_whiteout; + goto out_dir; } f2fs_balance_fs(sbi, true); @@ -928,6 +993,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto put_out_dir; f2fs_set_link(new_dir, new_entry, new_page, old_inode); + new_page = NULL; new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); @@ -948,33 +1014,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(new_dentry, old_inode); if (err) { f2fs_unlock_op(sbi); - goto out_whiteout; + goto out_dir; } if (old_dir_entry) f2fs_i_links_write(new_dir, true); - - /* - * old entry and new entry can locate in the same inline - * dentry in inode, when attaching new entry in inline dentry, - * it could force inline dentry conversion, after that, - * old_entry and old_page will point to wrong address, in - * order to avoid this, let's do the check and update here. - */ - if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { - f2fs_put_page(old_page, 0); - old_page = NULL; - - old_entry = f2fs_find_entry(old_dir, - &old_dentry->d_name, &old_page); - if (!old_entry) { - err = -ENOENT; - if (IS_ERR(old_page)) - err = PTR_ERR(old_page); - f2fs_unlock_op(sbi); - goto out_whiteout; - } - } } down_write(&F2FS_I(old_inode)->i_sem); @@ -989,9 +1033,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + old_page = NULL; if (whiteout) { - whiteout->i_state |= I_LINKABLE; set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) @@ -1025,17 +1069,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - if (new_page) - f2fs_put_page(new_page, 0); -out_whiteout: - if (whiteout) - iput(whiteout); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); out: + if (whiteout) + iput(whiteout); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3314a0f3405e..9d02cdcdbb07 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -875,7 +875,7 @@ static int truncate_dnode(struct dnode_of_data *dn) /* get direct node */ page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); - if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) + if (PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 76477f71d4ee..763d5c0951d1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -723,6 +723,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool 
check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + bool fix_curseg_write_pointer = false; #ifdef CONFIG_QUOTA int quota_enabled; #endif @@ -774,6 +775,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) sbi->sb->s_flags = s_flags; } skip: + fix_curseg_write_pointer = !check_only || list_empty(&inode_list); + destroy_fsync_dnodes(&inode_list, err); destroy_fsync_dnodes(&tmp_inode_list, err); @@ -784,9 +787,22 @@ skip: if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); - } else { - clear_sbi_flag(sbi, SBI_POR_DOING); } + + /* + * If fsync data succeeds or there is no fsync data to recover, + * and the f2fs is not read only, check and fix zoned block devices' + * write pointer consistency. + */ + if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) && + f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_fix_curseg_write_pointer(sbi); + ret = err; + } + + if (!err) + clear_sbi_flag(sbi, SBI_POR_DOING); + mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 56e81447e2f3..cf0eb002cfd4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,6 @@ void f2fs_drop_inmem_pages(struct inode *inode) } fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - stat_dec_atomic_write(inode); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) @@ -505,7 +504,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -2225,7 +2224,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) struct sit_info *sit_i = SIT_I(sbi); f2fs_bug_on(sbi, addr == NULL_ADDR); - if (addr == NEW_ADDR) + if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); @@ -2861,9 +2860,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); if (err) goto out; @@ -3036,7 +3035,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode) || + f2fs_compressed_file(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || @@ -3289,7 +3289,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); - if (fio->bio) + if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); @@ -4368,6 +4368,263 @@ out: return 0; } +#ifdef CONFIG_BLK_DEV_ZONED + +static int check_zone_write_pointer(struct f2fs_sb_info *sbi, + struct f2fs_dev_info *fdev, + struct blk_zone *zone) +{ + unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; + block_t zone_block, wp_block, last_valid_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + int i, s, b, ret; + struct seg_entry *se; + + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = fdev->start_blk + 
(zone->wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); + zone_segno = GET_SEGNO(sbi, zone_block); + zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); + + if (zone_segno >= MAIN_SEGS(sbi)) + return 0; + + /* + * Skip check of zones cursegs point to, since + * fix_curseg_write_pointer() checks them. + */ + for (i = 0; i < NO_CHECK_TYPE; i++) + if (zone_secno == GET_SEC_FROM_SEG(sbi, + CURSEG_I(sbi, i)->segno)) + return 0; + + /* + * Get last valid block of the zone. + */ + last_valid_block = zone_block - 1; + for (s = sbi->segs_per_sec - 1; s >= 0; s--) { + segno = zone_segno + s; + se = get_seg_entry(sbi, segno); + for (b = sbi->blocks_per_seg - 1; b >= 0; b--) + if (f2fs_test_bit(b, se->cur_valid_map)) { + last_valid_block = START_BLOCK(sbi, segno) + b; + break; + } + if (last_valid_block >= zone_block) + break; + } + + /* + * If last valid block is beyond the write pointer, report the + * inconsistency. This inconsistency does not cause write error + * because the zone will not be selected for write operation until + * it get discarded. Just report it. + */ + if (last_valid_block >= wp_block) { + f2fs_notice(sbi, "Valid block beyond write pointer: " + "valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + return 0; + } + + /* + * If there is no valid block in the zone and if write pointer is + * not at zone start, reset the write pointer. + */ + if (last_valid_block + 1 == zone_block && zone->wp != zone->start) { + f2fs_notice(sbi, + "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: wp[0x%x,0x%x]", + wp_segno, wp_blkoff); + ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, + zone->len >> log_sectors_per_block); + if (ret) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + fdev->path, ret); + return ret; + } + } + + return 0; +} + +static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, + block_t zone_blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && + zone_blkaddr <= FDEV(i).end_blk)) + return &FDEV(i); + } + + return NULL; +} + +static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) { + memcpy(data, zone, sizeof(struct blk_zone)); + return 0; +} + +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *cs = CURSEG_I(sbi, type); + struct f2fs_dev_info *zbd; + struct blk_zone zone; + unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; + block_t cs_zone_block, wp_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + sector_t zone_sector; + int err; + + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + /* report zone for the sector the curseg points to */ + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = 
zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + + if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && + wp_sector_off == 0) + return 0; + + f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " + "curseg[0x%x,0x%x] wp[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); + + f2fs_notice(sbi, "Assign new section to curseg[%d]: " + "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); + allocate_segment_by_default(sbi, type, true); + + /* check consistency of the zone curseg pointed to */ + if (check_zone_write_pointer(sbi, zbd, &zone)) + return -EIO; + + /* check newly assigned zone */ + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + if (zone.wp != zone.start) { + f2fs_notice(sbi, + "New zone for curseg[%d] is not yet discarded. " + "Reset the zone: curseg[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff); + err = __f2fs_issue_discard_zone(sbi, zbd->bdev, + zone_sector >> log_sectors_per_block, + zone.len >> log_sectors_per_block); + if (err) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + zbd->path, err); + return err; + } + } + + return 0; +} + +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < NO_CHECK_TYPE; i++) { + ret = fix_curseg_write_pointer(sbi, i); + if (ret) + return ret; + } + + return 0; +} + +struct check_zone_write_pointer_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *fdev; +}; + +static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, + void *data) { + struct check_zone_write_pointer_args *args; + args = (struct check_zone_write_pointer_args *)data; + + return check_zone_write_pointer(args->sbi, args->fdev, zone); +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + struct check_zone_write_pointer_args args; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + + args.sbi = sbi; + args.fdev = &FDEV(i); + ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, + check_zone_write_pointer_cb, &args); + if (ret < 0) + return ret; + } + + return 0; +} +#else +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} +#endif + /* * Update min, max modified time for cost-benefit GC algorithm */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a95467b202ea..459dc3901a57 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -200,18 +200,6 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; -/* - * this value is set in page as a private data which indicate that - * the page is atomically written, and it is in inmem_pages list. 
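The zone checks above convert the device's write pointer (reported in 512-byte sectors) into an f2fs block address by shifting with log_sectors_per_block. A worked example, assuming 4KiB blocks and illustrative device numbers:

    #include <stdio.h>

    /* Worked example of the sector<->block conversion used by the zone checks. */
    int main(void)
    {
            unsigned int log_sectors_per_block = 12 - 9;    /* 4KiB blocks, 512B sectors */
            unsigned long long start_blk = 0x1000;          /* device start, in blocks */
            unsigned long long zone_wp_sector = 524288;     /* from blk_zone->wp */

            unsigned long long wp_block =
                    start_blk + (zone_wp_sector >> log_sectors_per_block);

            printf("write pointer sits at block %llu\n", wp_block);  /* 0x1000 + 65536 */
            return 0;
    }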
- */ -#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) -#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) - -#define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) -#define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) - #define MAX_SKIP_GC_COUNT 16 struct inmem_pages { @@ -619,8 +607,10 @@ static inline int utilization(struct f2fs_sb_info *sbi) * threashold, * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash * storages. IPU will be triggered only if the # of dirty - * pages over min_fsync_blocks. - * F2FS_IPUT_DISABLE - disable IPU. (=default option) + * pages over min_fsync_blocks. (=default option) + * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. + * F2FS_IPU_NOCACHE - disable IPU bio cache. + * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 @@ -635,6 +625,7 @@ enum { F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, + F2FS_IPU_NOCACHE, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5111e1ffe58a..65a7a432dfee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -141,6 +141,9 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_compress_algorithm, + Opt_compress_log_size, + Opt_compress_extension, Opt_err, }; @@ -203,6 +206,9 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_compress_algorithm, "compress_algorithm=%s"}, + {Opt_compress_log_size, "compress_log_size=%u"}, + {Opt_compress_extension, "compress_extension=%s"}, {Opt_err, NULL}, }; @@ -391,8 +397,9 @@ static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; + unsigned char (*ext)[F2FS_EXTENSION_LEN]; char *p, *name; - int arg = 0; + int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; #ifdef CONFIG_QUOTA @@ -810,6 +817,66 @@ static int parse_options(struct super_block *sb, char *options) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_compress_algorithm: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature if off"); + return -EINVAL; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 3 && !strcmp(name, "lzo")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZO; + } else if (strlen(name) == 3 && + !strcmp(name, "lz4")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZ4; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_compress_log_size: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature is off"); + return -EINVAL; + } + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg < MIN_COMPRESS_LOG_SIZE || + arg > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(sbi, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_OPTION(sbi).compress_log_size = arg; + break; + case Opt_compress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature is off"); + return -EINVAL; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + 
ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + strcpy(ext[ext_cnt], name); + F2FS_OPTION(sbi).compress_ext_cnt++; + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1125,6 +1192,8 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); + f2fs_destroy_post_read_wq(sbi); + kvfree(sbi->ckpt); f2fs_unregister_sysfs(sbi); @@ -1169,9 +1238,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); } f2fs_trace_ios(NULL, 1); @@ -1213,12 +1282,10 @@ static int f2fs_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = 0; - if (dquot->dq_dqb.dqb_bsoftlimit) - limit = dquot->dq_dqb.dqb_bsoftlimit; - if (dquot->dq_dqb.dqb_bhardlimit && - (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) - limit = dquot->dq_dqb.dqb_bhardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + if (limit) + limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; @@ -1228,12 +1295,8 @@ static int f2fs_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = 0; - if (dquot->dq_dqb.dqb_isoftlimit) - limit = dquot->dq_dqb.dqb_isoftlimit; - if (dquot->dq_dqb.dqb_ihardlimit && - (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) - limit = dquot->dq_dqb.dqb_ihardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; @@ -1340,6 +1403,35 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +static inline void f2fs_show_compress_options(struct seq_file *seq, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *algtype = ""; + int i; + + if (!f2fs_sb_has_compression(sbi)) + return; + + switch (F2FS_OPTION(sbi).compress_algorithm) { + case COMPRESS_LZO: + algtype = "lzo"; + break; + case COMPRESS_LZ4: + algtype = "lz4"; + break; + } + seq_printf(seq, ",compress_algorithm=%s", algtype); + + seq_printf(seq, ",compress_log_size=%u", + F2FS_OPTION(sbi).compress_log_size); + + for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) { + seq_printf(seq, ",compress_extension=%s", + F2FS_OPTION(sbi).extensions[i]); + } +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -1462,6 +1554,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fsync_mode=%s", "strict"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) seq_printf(seq, ",fsync_mode=%s", "nobarrier"); + + f2fs_show_compress_options(seq, sbi->sb); return 0; } @@ -1476,6 +1570,9 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; + F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; + F2FS_OPTION(sbi).compress_ext_cnt = 0; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1524,7 +1621,7 
@@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) f2fs_update_time(sbi, DISABLE_TIME); while (!f2fs_time_over(sbi, DISABLE_TIME)) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; @@ -1546,7 +1643,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); err = f2fs_write_checkpoint(sbi, &cpc); @@ -1558,7 +1655,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); out_unlock: - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); restore_flag: sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ return err; @@ -1566,12 +1663,12 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); } @@ -2158,7 +2255,7 @@ static int f2fs_dquot_commit(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read(&sbi->quota_sem); + down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); @@ -2182,13 +2279,10 @@ static int f2fs_dquot_acquire(struct dquot *dquot) static int f2fs_dquot_release(struct dquot *dquot) { struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); - int ret; + int ret = dquot_release(dquot); - down_read(&sbi->quota_sem); - ret = dquot_release(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); return ret; } @@ -2196,29 +2290,22 @@ static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret; - - down_read(&sbi->quota_sem); - ret = dquot_mark_dquot_dirty(dquot); + int ret = dquot_mark_dquot_dirty(dquot); /* if we are using journalled quota */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); - up_read(&sbi->quota_sem); return ret; } static int f2fs_dquot_commit_info(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret; + int ret = dquot_commit_info(sb, type); - down_read(&sbi->quota_sem); - ret = dquot_commit_info(sb, type); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); return ret; } @@ -3311,7 +3398,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - mutex_init(&sbi->gc_mutex); + init_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); mutex_init(&sbi->resize_mutex); @@ -3400,6 +3487,12 @@ try_onemore: goto free_devices; } + err = f2fs_init_post_read_wq(sbi); + if (err) { + f2fs_err(sbi, "Failed to initialize post read workqueue"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -3544,6 +3637,17 @@ try_onemore: goto free_meta; } } + + /* + * If the f2fs is not readonly and fsync data recovery succeeds, + * check zoned block devices' write pointer consistency. 
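For the f2fs_statfs_project() change earlier in this file: min_not_zero() picks the smaller of the non-zero quota limits, and the byte value is then shifted into filesystem blocks. A worked example with illustrative values (min_not_zero redefined locally so the snippet stands alone):

    #include <stdio.h>

    #define min_not_zero(a, b) \
            ((a) && (b) ? ((a) < (b) ? (a) : (b)) : ((a) ? (a) : (b)))

    int main(void)
    {
            unsigned long long bsoftlimit = 0;              /* no soft limit set */
            unsigned long long bhardlimit = 1ULL << 30;     /* 1 GiB hard limit, bytes */
            unsigned int s_blocksize_bits = 12;             /* 4 KiB blocks */

            unsigned long long limit = min_not_zero(bsoftlimit, bhardlimit);
            if (limit)
                    limit >>= s_blocksize_bits;

            printf("project quota limit: %llu blocks\n", limit);  /* 262144 */
            return 0;
    }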
+ */ + if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_check_write_pointer(sbi); + if (err) + goto free_meta; + } + reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3621,6 +3725,7 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -3762,8 +3867,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_bio_entry_cache(); if (err) goto free_post_read; + err = f2fs_init_bioset(); + if (err) + goto free_bio_enrty_cache; return 0; - +free_bio_enrty_cache: + f2fs_destroy_bio_entry_cache(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: @@ -3789,6 +3898,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 70945ceb9c0c..91d649790b1b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -25,6 +25,9 @@ enum { DCC_INFO, /* struct discard_cmd_control */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_STAT_FS + STAT_INFO, /* struct f2fs_stat_info */ +#endif #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ @@ -42,6 +45,9 @@ struct f2fs_attr { int id; }; +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf); + static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) { if (struct_type == GC_THREAD) @@ -59,41 +65,25 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) struct_type == FAULT_INFO_TYPE) return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif +#ifdef CONFIG_F2FS_STAT_FS + else if (struct_type == STAT_INFO) + return (unsigned char *)F2FS_STAT(sbi); +#endif return NULL; } static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(dirty_segments(sbi))); + return sprintf(buf, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); } -static ssize_t unusable_show(struct f2fs_attr *a, +static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - block_t unusable; - - if (test_opt(sbi, DISABLE_CHECKPOINT)) - unusable = sbi->unusable_block_count; - else - unusable = f2fs_get_unusable_blocks(sbi); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)unusable); -} - -static ssize_t encoding_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ -#ifdef CONFIG_UNICODE - if (f2fs_sb_has_casefold(sbi)) - return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", - sbi->s_encoding->charset, - (sbi->s_encoding->version >> 16) & 0xff, - (sbi->s_encoding->version >> 8) & 0xff, - sbi->s_encoding->version & 0xff); -#endif - return snprintf(buf, PAGE_SIZE, "(none)"); + return sprintf(buf, "%llu\n", + (unsigned long long)(free_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, @@ -102,10 +92,10 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct super_block *sb = sbi->sb; if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); + return sprintf(buf, "0\n"); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + + return sprintf(buf, "%llu\n", + 
(unsigned long long)(sbi->kbytes_written + BD_PART_WRITTEN(sbi))); } @@ -116,7 +106,7 @@ static ssize_t features_show(struct f2fs_attr *a, int len = 0; if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); + return sprintf(buf, "0\n"); if (f2fs_sb_has_encrypt(sbi)) len += snprintf(buf, PAGE_SIZE - len, "%s", @@ -154,6 +144,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + if (f2fs_sb_has_compression(sbi)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "compression"); len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "pin_file"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); @@ -163,9 +156,66 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); + return sprintf(buf, "%u\n", sbi->current_reserved_blocks); +} + +static ssize_t unusable_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + block_t unusable; + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + unusable = sbi->unusable_block_count; + else + unusable = f2fs_get_unusable_blocks(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)unusable); +} + +static ssize_t encoding_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ +#ifdef CONFIG_UNICODE + if (f2fs_sb_has_casefold(sbi)) + return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", + sbi->s_encoding->charset, + (sbi->s_encoding->version >> 16) & 0xff, + (sbi->s_encoding->version >> 8) & 0xff, + sbi->s_encoding->version & 0xff); +#endif + return sprintf(buf, "(none)"); } +#ifdef CONFIG_F2FS_STAT_FS +static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->tot_blks - + (si->bg_data_blks + si->bg_node_blks))); +} + +static ssize_t moved_blocks_background_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); +} + +static ssize_t avg_vblocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + si->dirty_count = dirty_segments(sbi); + f2fs_update_sit_info(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); +} +#endif + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -199,7 +249,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); + return sprintf(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -389,6 +439,7 @@ enum feat_id { FEAT_VERITY, FEAT_SB_CHECKSUM, FEAT_CASEFOLD, + FEAT_COMPRESSION, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -408,7 +459,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_VERITY: case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: - return snprintf(buf, PAGE_SIZE, "supported\n"); + case FEAT_COMPRESSION: + return sprintf(buf, "supported\n"); } return 0; } @@ -437,6 +489,14 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .id = _id, \ } +#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ 
+static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sbi_show, \ + .struct_type = _struct_type, \ + .offset = offsetof(struct _struct_name, _elname), \ +} + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); @@ -478,11 +538,21 @@ F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(dirty_segments); +F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); +#ifdef CONFIG_F2FS_STAT_FS +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_foreground_calls, call_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_background_calls, bg_gc); +F2FS_GENERAL_RO_ATTR(moved_blocks_background); +F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); +F2FS_GENERAL_RO_ATTR(avg_vblocks); +#endif #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -503,6 +573,7 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); #endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -543,12 +614,22 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(dirty_segments), + ATTR_LIST(free_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), +#ifdef CONFIG_F2FS_STAT_FS + ATTR_LIST(cp_foreground_calls), + ATTR_LIST(cp_background_calls), + ATTR_LIST(gc_foreground_calls), + ATTR_LIST(gc_background_calls), + ATTR_LIST(moved_blocks_foreground), + ATTR_LIST(moved_blocks_background), + ATTR_LIST(avg_vblocks), +#endif NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -573,6 +654,7 @@ static struct attribute *f2fs_feat_attrs[] = { #endif ATTR_LIST(sb_checksum), ATTR_LIST(casefold), + ATTR_LIST(compression), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -733,10 +815,12 @@ int __init f2fs_init_sysfs(void) ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, NULL, "features"); - if (ret) + if (ret) { + kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); - else + } else { f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + } return ret; } @@ -757,8 +841,11 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); return err; + } if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -786,4 +873,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); } diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index a401ef72bc82..d7d430a6f130 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -222,12 +222,55 @@ static int f2fs_get_verity_descriptor(struct inode 
*inode, void *buf, return size; } +/* + * Prefetch some pages from the file's Merkle tree. + * + * This is basically a stripped-down version of __do_page_cache_readahead() + * which works on pages past i_size. + */ +static void f2fs_merkle_tree_readahead(struct address_space *mapping, + pgoff_t start_index, unsigned long count) +{ + LIST_HEAD(pages); + unsigned int nr_pages = 0; + struct page *page; + pgoff_t index; + struct blk_plug plug; + + for (index = start_index; index < start_index + count; index++) { + page = xa_load(&mapping->i_pages, index); + if (!page || xa_is_value(page)) { + page = __page_cache_alloc(readahead_gfp_mask(mapping)); + if (!page) + break; + page->index = index; + list_add(&page->lru, &pages); + nr_pages++; + } + } + blk_start_plug(&plug); + f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); + blk_finish_plug(&plug); +} + static struct page *f2fs_read_merkle_tree_page(struct inode *inode, - pgoff_t index) + pgoff_t index, + unsigned long num_ra_pages) { + struct page *page; + index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; - return read_mapping_page(inode->i_mapping, index, NULL); + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); + if (!page || !PageUptodate(page)) { + if (page) + put_page(page); + else if (num_ra_pages > 1) + f2fs_merkle_tree_readahead(inode->i_mapping, index, + num_ra_pages); + page = read_mapping_page(inode->i_mapping, index, NULL); + } + return page; } static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5f04c5c810fb..594b05ae16c9 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -21,6 +21,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <asm/unaligned.h> +#include <linux/random.h> #include <linux/iversion.h> #include "fat.h" @@ -521,7 +522,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = get_seconds(); + inode->i_generation = prandom_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; diff --git a/fs/file.c b/fs/file.c index 3da91a112bab..a364e1a9b7e8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -642,7 +642,9 @@ out_unlock: EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ /* - * variant of __close_fd that gets a ref on the file for later fput + * variant of __close_fd that gets a ref on the file for later fput. + * The caller must ensure that filp_close() called on the file, and then + * an fput(). 
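With this change __close_fd_get_file() only unhooks the descriptor and takes a reference; the close itself moves to the caller. A minimal caller-side sketch of that contract (the caller function name and error handling are hypothetical):

    #include <linux/fdtable.h>
    #include <linux/file.h>
    #include <linux/fs.h>
    #include <linux/sched.h>

    /* Sketch of the new contract: caller does filp_close(), then fput(). */
    static int close_fd_deferred(unsigned int fd)
    {
            struct file *file = NULL;
            int ret;

            ret = __close_fd_get_file(fd, &file);
            if (ret || !file)
                    return ret;

            /* ... use the file while it is still pinned by our reference ... */

            ret = filp_close(file, current->files);  /* the actual close */
            fput(file);                              /* drop the extra reference */
            return ret;
    }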
*/ int __close_fd_get_file(unsigned int fd, struct file **res) { @@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res) spin_unlock(&files->file_lock); get_file(file); *res = file; - return filp_close(file, files); + return 0; out_unlock: spin_unlock(&files->file_lock); @@ -706,9 +708,9 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +static struct file *__fget_files(struct files_struct *files, unsigned int fd, + fmode_t mask, unsigned int refs) { - struct files_struct *files = current->files; struct file *file; rcu_read_lock(); @@ -729,6 +731,12 @@ loop: return file; } +static inline struct file *__fget(unsigned int fd, fmode_t mask, + unsigned int refs) +{ + return __fget_files(current->files, fd, mask, refs); +} + struct file *fget_many(unsigned int fd, unsigned int refs) { return __fget(fd, FMODE_PATH, refs); @@ -746,6 +754,18 @@ struct file *fget_raw(unsigned int fd) } EXPORT_SYMBOL(fget_raw); +struct file *fget_task(struct task_struct *task, unsigned int fd) +{ + struct file *file = NULL; + + task_lock(task); + if (task->files) + file = __fget_files(task->files, fd, 0, 1); + task_unlock(task); + + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 335607b8c5c0..76ac9c7d32ec 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2063,7 +2063,7 @@ void wb_workfn(struct work_struct *work) struct bdi_writeback, dwork); long pages_written; - set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); + set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 9616af3768e1..08e91efbce53 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -111,7 +111,7 @@ extern void fscache_enqueue_object(struct fscache_object *); * object-list.c */ #ifdef CONFIG_FSCACHE_OBJECT_LIST -extern const struct file_operations fscache_objlist_fops; +extern const struct proc_ops fscache_objlist_proc_ops; extern void fscache_objlist_add(struct fscache_object *); extern void fscache_objlist_remove(struct fscache_object *); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 72ebfe578f40..e106a1a1600d 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -7,6 +7,7 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> +#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/key.h> @@ -405,9 +406,9 @@ static int fscache_objlist_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -const struct file_operations fscache_objlist_fops = { - .open = fscache_objlist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = fscache_objlist_release, +const struct proc_ops fscache_objlist_proc_ops = { + .proc_open = fscache_objlist_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = fscache_objlist_release, }; diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 5523446e2952..90a7bc22f7e1 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -35,7 +35,7 @@ int __init fscache_proc_init(void) #ifdef CONFIG_FSCACHE_OBJECT_LIST if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, - &fscache_objlist_fops)) + &fscache_objlist_proc_ops)) goto error_objects; #endif diff --git a/fs/fuse/file.c 
b/fs/fuse/file.c index a63d779eac10..ce715380143c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -882,6 +882,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) struct fuse_args_pages *ap = &ia->ap; loff_t pos = page_offset(ap->pages[0]); size_t count = ap->num_pages << PAGE_SHIFT; + ssize_t res; int err; ap->args.out_pages = true; @@ -896,7 +897,8 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) if (!err) return; } else { - err = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fc, &ap->args); + err = res < 0 ? res : 0; } fuse_readpages_end(fc, &ap->args, err); } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9c6df721321a..ba83b49ce18c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -183,14 +183,12 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int ret; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; if (PageChecked(page) || current->journal_info) goto out_ignore; - ret = __gfs2_jdata_writepage(page, wbc); - return ret; + return __gfs2_jdata_writepage(page, wbc); out_ignore: redirty_page_for_writepage(wbc, page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index eb9c0578978f..c8b62577e2f2 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -73,9 +73,6 @@ #include "bmap.h" #include "util.h" -#define IS_LEAF 1 /* Hashed (leaf) directory */ -#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ - #define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b7123de7c180..d0eceaff3cea 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -826,7 +826,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); if (glops->go_flags & GLOF_LVB) { - gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); + gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4ede1f18de85..061d22e1ceb6 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -95,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) /* A shortened, inline version of gfs2_trans_begin() * tr->alloced is not set since the transaction structure is * on the stack */ - tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes); tr.tr_ip = _RET_IP_; if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) return; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 5f89c515f5bb..9fd88ed18807 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -387,8 +387,6 @@ struct gfs2_glock { struct rhash_head gl_node; }; -#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ - enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, @@ -505,6 +503,7 @@ struct gfs2_trans { unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; struct list_head tr_list; struct list_head tr_databuf; @@ -703,6 +702,7 @@ struct gfs2_sbd { u32 sd_fsb2bb_shift; u32 sd_diptrs; /* Number of pointers in a dinode */ u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_ldptrs; /* Number of pointers in a log descriptor block */ u32 sd_jbsize; /* Size of a journaled data 
block */ u32 sd_hash_bsize; /* sizeof(exhash block) */ u32 sd_hash_bsize_shift; @@ -803,7 +803,7 @@ struct gfs2_sbd { struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - int sd_log_commited_revoke; + int sd_log_committed_revoke; atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index dafef10b91f1..2716d56ed0a0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -136,7 +136,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) @@ -175,21 +174,22 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_glock_put(io_gl); io_gl = NULL; + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); + inode->i_atime.tv_nsec = 0; + if (type == DT_UNKNOWN) { /* Inode glock must be locked already */ error = gfs2_inode_refresh(GFS2_I(inode)); if (error) goto fail_refresh; } else { + ip->i_no_formal_ino = no_formal_ino; inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ - inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); - inode->i_atime.tv_nsec = 0; - unlock_new_inode(inode); } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index eb3f2e7b8085..00a2e721a374 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -37,7 +37,6 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp); * gfs2_struct2blk - compute stuff * @sdp: the filesystem * @nstruct: the number of structures - * @ssize: the size of the structures * * Compute the number of log descriptor blocks needed to hold a certain number * of structures of a certain size. 
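A worked example of the gfs2_struct2blk() computation in the hunk below: one descriptor block holds the first sd_ldptrs revokes, and every continuation block holds sd_inptrs more. The capacities here are illustrative stand-ins; the real values depend on the block size and are set up in gfs2_read_sb():

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned int first = 500;       /* stand-in for sdp->sd_ldptrs */
            unsigned int second = 508;      /* stand-in for sdp->sd_inptrs */
            unsigned int nstruct = 1500;    /* revokes to log */
            unsigned int blks = 1;

            if (nstruct > first)
                    blks += DIV_ROUND_UP(nstruct - first, second);

            printf("%u descriptor blocks needed\n", blks);  /* 3 */
            return 0;
    }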
@@ -45,18 +44,16 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp); * Returns: the number of blocks needed (minimum is always 1) */ -unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, - unsigned int ssize) +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) { unsigned int blks; unsigned int first, second; blks = 1; - first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + first = sdp->sd_ldptrs; if (nstruct > first) { - second = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / ssize; + second = sdp->sd_inptrs; blks += DIV_ROUND_UP(nstruct - first, second); } @@ -472,9 +469,8 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); } - if (sdp->sd_log_commited_revoke > 0) - reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, - sizeof(u64)); + if (sdp->sd_log_committed_revoke > 0) + reserved += gfs2_struct2blk(sdp, sdp->sd_log_committed_revoke); /* One for the overall header */ if (reserved) reserved++; @@ -829,7 +825,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (unlikely(state == SFS_FROZEN)) gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, - sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke); gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); @@ -848,7 +844,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_commited_revoke = 0; + sdp->sd_log_committed_revoke = 0; spin_lock(&sdp->sd_ail_lock); if (tr && !list_empty(&tr->tr_ail1_list)) { @@ -899,6 +895,7 @@ static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) old->tr_num_buf_rm += new->tr_num_buf_rm; old->tr_num_databuf_rm += new->tr_num_databuf_rm; old->tr_num_revoke += new->tr_num_revoke; + old->tr_num_revoke_rm += new->tr_num_revoke_rm; list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); list_splice_tail_init(&new->tr_buf, &old->tr_buf); @@ -920,7 +917,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) set_bit(TR_ATTACHED, &tr->tr_flags); } - sdp->sd_log_commited_revoke += tr->tr_num_revoke; + sdp->sd_log_committed_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; reserved = calc_reserved(sdp); maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; gfs2_assert_withdraw(sdp, maxres >= reserved); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 2ff163a8dce1..c0a65e5a126b 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -60,9 +60,9 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) spin_unlock(&sdp->sd_ordered_lock); } } + extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); -extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, - unsigned int ssize); +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 55fed7daf2b1..d9431724b788 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -259,7 +259,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, struct super_block *sb = sdp->sd_vfs; struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 
9); + bio->bi_iter.bi_sector = blkno << (sb->s_blocksize_bits - 9); bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -472,6 +472,20 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, put_page(page); /* Once more for find_or_create_page */ } +static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) +{ + struct bio *new; + + new = bio_alloc(GFP_NOIO, nr_iovecs); + bio_copy_dev(new, prev); + new->bi_iter.bi_sector = bio_end_sector(prev); + new->bi_opf = prev->bi_opf; + new->bi_write_hint = prev->bi_write_hint; + bio_chain(new, prev); + submit_bio(prev); + return new; +} + /** * gfs2_find_jhead - find the head of a log * @jd: The journal descriptor @@ -488,15 +502,15 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct address_space *mapping = jd->jd_inode->i_mapping; unsigned int block = 0, blocks_submitted = 0, blocks_read = 0; - unsigned int bsize = sdp->sd_sb.sb_bsize; + unsigned int bsize = sdp->sd_sb.sb_bsize, off; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int shift = PAGE_SHIFT - bsize_shift; - unsigned int readhead_blocks = BIO_MAX_PAGES << shift; + unsigned int readahead_blocks = BIO_MAX_PAGES << shift; struct gfs2_journal_extent *je; int sz, ret = 0; struct bio *bio = NULL; struct page *page = NULL; - bool done = false; + bool bio_chained = false, done = false; errseq_t since; memset(head, 0, sizeof(*head)); @@ -505,9 +519,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, since = filemap_sample_wb_err(mapping); list_for_each_entry(je, &jd->extent_list, list) { - for (; block < je->lblock + je->blocks; block++) { - u64 dblock; + u64 dblock = je->dblock; + for (; block < je->lblock + je->blocks; block++, dblock++) { if (!page) { page = find_or_create_page(mapping, block >> shift, GFP_NOFS); @@ -516,35 +530,41 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, done = true; goto out; } + off = 0; } - if (bio) { - unsigned int off; - - off = (block << bsize_shift) & ~PAGE_MASK; + if (!bio || (bio_chained && !off)) { + /* start new bio */ + } else { sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) { /* block added */ - if (off + bsize == PAGE_SIZE) { - page = NULL; - goto page_added; - } - continue; + if (sz == bsize) + goto block_added; + if (off) { + unsigned int blocks = + (PAGE_SIZE - off) >> bsize_shift; + + bio = gfs2_chain_bio(bio, blocks); + bio_chained = true; + goto add_block_to_new_bio; } + } + + if (bio) { blocks_submitted = block + 1; submit_bio(bio); - bio = NULL; } - dblock = je->dblock + (block - je->lblock); bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; - sz = bio_add_page(bio, page, bsize, 0); - gfs2_assert_warn(sdp, sz == bsize); - if (bsize == PAGE_SIZE) + bio_chained = false; +add_block_to_new_bio: + sz = bio_add_page(bio, page, bsize, off); + BUG_ON(sz != bsize); +block_added: + off += bsize; + if (off == PAGE_SIZE) page = NULL; - -page_added: - if (blocks_submitted < blocks_read + readhead_blocks) { + if (blocks_submitted < blocks_read + readahead_blocks) { /* Keep at least one bio in flight */ continue; } @@ -846,7 +866,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) if (!sdp->sd_log_num_revoke) return; - length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); + length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke); page = 
gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); offset = sizeof(struct gfs2_log_descriptor); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e8b7b0ce8404..b3e904bcc02c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -298,6 +298,8 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sizeof(struct gfs2_dinode)) / sizeof(u64); sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_ldptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_log_descriptor)) / sizeof(u64); sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2466bb44a23c..e7bf91ec231c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -36,16 +36,6 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) -#if BITS_PER_LONG == 32 -#define LBITMASK (0x55555555UL) -#define LBITSKIP55 (0x55555555UL) -#define LBITSKIP00 (0x00000000UL) -#else -#define LBITMASK (0x5555555555555555UL) -#define LBITSKIP55 (0x5555555555555555UL) -#define LBITSKIP00 (0x0000000000000000UL) -#endif - /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 9d4227330de4..a685637a5b55 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -49,8 +49,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (blocks) tr->tr_reserved += 6 + blocks; if (revokes) - tr->tr_reserved += gfs2_struct2blk(sdp, revokes, - sizeof(u64)); + tr->tr_reserved += gfs2_struct2blk(sdp, revokes); INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); @@ -77,10 +76,10 @@ static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, test_bit(TR_TOUCHED, &tr->tr_flags)); - fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n", + fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, - tr->tr_num_revoke); + tr->tr_num_revoke, tr->tr_num_revoke_rm); } void gfs2_trans_end(struct gfs2_sbd *sdp) @@ -265,7 +264,7 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) if (bd->bd_gl) gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); - tr->tr_num_revoke--; + tr->tr_num_revoke_rm++; if (--n == 0) break; } diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 6d0783e2e276..f71c384064c8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -242,19 +242,35 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb); /* * There are two time systems. Both are based on seconds since * a particular time/date. - * Unix: unsigned lil-endian since 00:00 GMT, Jan. 1, 1970 + * Unix: signed little-endian since 00:00 GMT, Jan. 1, 1970 * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 * + * HFS implementations are highly inconsistent, this one matches the + * traditional behavior of 64-bit Linux, giving the most useful + * time range between 1970 and 2106, by treating any on-disk timestamp + * under HFS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. 
*/ -#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) -#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) +#define HFS_UTC_OFFSET 2082844800U +static inline time64_t __hfs_m_to_utime(__be32 mt) +{ + time64_t ut = (u32)(be32_to_cpu(mt) - HFS_UTC_OFFSET); + + return ut + sys_tz.tz_minuteswest * 60; +} + +static inline __be32 __hfs_u_to_mtime(time64_t ut) +{ + ut -= sys_tz.tz_minuteswest * 60; + + return cpu_to_be32(lower_32_bits(ut) + HFS_UTC_OFFSET); +} #define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) -#define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } -#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) -#define hfs_mtime() __hfs_u_to_mtime(get_seconds()) +#define hfs_m_to_utime(time) (struct timespec64){ .tv_sec = __hfs_m_to_utime(time) } +#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) +#define hfs_mtime() __hfs_u_to_mtime(ktime_get_real_seconds()) static inline const char *hfs_mdb_name(struct super_block *sb) { diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index da243c84e93b..2f224b98ee94 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -351,7 +351,7 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; inode->i_ctime = inode->i_atime = inode->i_mtime = - timespec_to_timespec64(hfs_m_to_utime(rec->file.MdDat)); + hfs_m_to_utime(rec->file.MdDat); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -362,7 +362,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); inode->i_ctime = inode->i_atime = inode->i_mtime = - timespec_to_timespec64(hfs_m_to_utime(rec->dir.MdDat)); + hfs_m_to_utime(rec->dir.MdDat); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b8471bf05def..3b03fff68543 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -533,13 +533,31 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, int op, int op_flags); int hfsplus_read_wrapper(struct super_block *sb); -/* time macros */ -#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) -#define __hfsp_ut2mt(t) (cpu_to_be32(t + 2082844800U)) +/* + * time helpers: convert between 1904-base and 1970-base timestamps + * + * HFS+ implementations are highly inconsistent, this one matches the + * traditional behavior of 64-bit Linux, giving the most useful + * time range between 1970 and 2106, by treating any on-disk timestamp + * under HFSPLUS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. 
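The hfs and hfsplus hunks here replace the old 1904-epoch conversion macros with inline helpers and deliberately let the 32-bit subtraction wrap, trading pre-1970 dates for coverage up to 2106. A minimal userspace sketch of that arithmetic (plain C; mac_to_unix/unix_to_mac are illustrative stand-ins for __hfs_m_to_utime/__hfs_u_to_mtime, and the sys_tz adjustment that hfs applies is omitted):

#include <stdint.h>
#include <stdio.h>

/* 66 years from 1904 to 1970, 17 of them leap: (66*365 + 17) * 86400 */
#define UTC_OFFSET 2082844800U

/* Stand-in for __hfs_m_to_utime()/__hfsp_mt2ut(): the subtraction is done
 * in 32 bits, so on-disk values below the offset wrap around to large
 * positive seconds (roughly years 2040..2106) instead of going negative. */
static int64_t mac_to_unix(uint32_t mac_secs)
{
	return (uint32_t)(mac_secs - UTC_OFFSET);
}

/* Stand-in for __hfs_u_to_mtime()/__hfsp_ut2mt(): lower 32 bits + offset */
static uint32_t unix_to_mac(int64_t unix_secs)
{
	return (uint32_t)unix_secs + UTC_OFFSET;
}

int main(void)
{
	printf("%lld\n", (long long)mac_to_unix(UTC_OFFSET));     /* 0: Jan 1 1970 */
	printf("%lld\n", (long long)mac_to_unix(0xffffffffU));    /* 2212122495: ~Feb 2040 */
	printf("%lld\n", (long long)mac_to_unix(0));              /* 2212122496: wrapped, ~2040 */
	printf("%lld\n", (long long)mac_to_unix(UTC_OFFSET - 1)); /* 4294967295: Feb 2106 */
	printf("%u\n", unix_to_mac(0));                           /* back to the 1904-based offset */
	return 0;
}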
+ */ +#define HFSPLUS_UTC_OFFSET 2082844800U + +static inline time64_t __hfsp_mt2ut(__be32 mt) +{ + time64_t ut = (u32)(be32_to_cpu(mt) - HFSPLUS_UTC_OFFSET); + + return ut; +} + +static inline __be32 __hfsp_ut2mt(time64_t ut) +{ + return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET); +} /* compatibility */ -#define hfsp_mt2ut(t) (struct timespec){ .tv_sec = __hfsp_mt2ut(t) } +#define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) -#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) +#define hfsp_now2mt() __hfsp_ut2mt(ktime_get_real_seconds()) #endif diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d131c8ea7eb6..94bd83b36644 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -504,9 +504,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_get_perms(inode, &folder->permissions, 1); set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); - inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(folder->access_date)); - inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(folder->content_mod_date)); - inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(folder->attribute_mod_date)); + inode->i_atime = hfsp_mt2ut(folder->access_date); + inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); + inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -542,9 +542,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } - inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(file->access_date)); - inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(file->content_mod_date)); - inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(file->attribute_mod_date)); + inode->i_atime = hfsp_mt2ut(file->access_date); + inode->i_mtime = hfsp_mt2ut(file->content_mod_date); + inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index f4295aa19350..69cb796f6270 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -37,16 +37,20 @@ * is on, and remove the appropriate bits from attr->ia_mode (attr is a * "struct iattr *"). 
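The hostfs changes that follow stop passing struct timespec across the kernel/host boundary and carry times in a fixed-width struct hostfs_timespec instead, so the layout no longer depends on the kernel's or libc's time_t. A small stand-alone sketch of the conversion pattern used in read_name() and hostfs_setattr() (struct timespec64 here is a userspace stand-in for the kernel type, and from_host/to_host are illustrative names):

#include <stdint.h>

/* same shape as the struct hostfs_timespec added below: always 64/64 bit */
struct hostfs_timespec {
	long long tv_sec;
	long long tv_nsec;
};

struct timespec64 {		/* userspace stand-in for the kernel type */
	int64_t tv_sec;
	long    tv_nsec;
};

static struct timespec64 from_host(struct hostfs_timespec t)
{
	/* compound literal, as in the read_name() hunk */
	return (struct timespec64){ t.tv_sec, t.tv_nsec };
}

static struct hostfs_timespec to_host(struct timespec64 t)
{
	/* as in the hostfs_setattr() hunk */
	return (struct hostfs_timespec){ t.tv_sec, t.tv_nsec };
}

int main(void)
{
	struct hostfs_timespec host = { 1700000000LL, 500 };
	struct timespec64 k = from_host(host);
	struct hostfs_timespec back = to_host(k);

	return (back.tv_sec == host.tv_sec && back.tv_nsec == host.tv_nsec) ? 0 : 1;
}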
-BlaisorBlade */ +struct hostfs_timespec { + long long tv_sec; + long long tv_nsec; +}; struct hostfs_iattr { - unsigned int ia_valid; - unsigned short ia_mode; - uid_t ia_uid; - gid_t ia_gid; - loff_t ia_size; - struct timespec ia_atime; - struct timespec ia_mtime; - struct timespec ia_ctime; + unsigned int ia_valid; + unsigned short ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct hostfs_timespec ia_atime; + struct hostfs_timespec ia_mtime; + struct hostfs_timespec ia_ctime; }; struct hostfs_stat { @@ -56,7 +60,7 @@ struct hostfs_stat { unsigned int uid; unsigned int gid; unsigned long long size; - struct timespec atime, mtime, ctime; + struct hostfs_timespec atime, mtime, ctime; unsigned int blksize; unsigned long long blocks; unsigned int maj; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5a7eb0c79839..e6b8c49076bb 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -549,9 +549,9 @@ static int read_name(struct inode *ino, char *name) set_nlink(ino, st.nlink); i_uid_write(ino, st.uid); i_gid_write(ino, st.gid); - ino->i_atime = timespec_to_timespec64(st.atime); - ino->i_mtime = timespec_to_timespec64(st.mtime); - ino->i_ctime = timespec_to_timespec64(st.ctime); + ino->i_atime = (struct timespec64){ st.atime.tv_sec, st.atime.tv_nsec }; + ino->i_mtime = (struct timespec64){ st.mtime.tv_sec, st.mtime.tv_nsec }; + ino->i_ctime = (struct timespec64){ st.ctime.tv_sec, st.ctime.tv_nsec }; ino->i_size = st.size; ino->i_blocks = st.blocks; return 0; @@ -820,15 +820,18 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_ATIME) { attrs.ia_valid |= HOSTFS_ATTR_ATIME; - attrs.ia_atime = timespec64_to_timespec(attr->ia_atime); + attrs.ia_atime = (struct hostfs_timespec) + { attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec }; } if (attr->ia_valid & ATTR_MTIME) { attrs.ia_valid |= HOSTFS_ATTR_MTIME; - attrs.ia_mtime = timespec64_to_timespec(attr->ia_mtime); + attrs.ia_mtime = (struct hostfs_timespec) + { attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec }; } if (attr->ia_valid & ATTR_CTIME) { attrs.ia_valid |= HOSTFS_ATTR_CTIME; - attrs.ia_ctime = timespec64_to_timespec(attr->ia_ctime); + attrs.ia_ctime = (struct hostfs_timespec) + { attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec }; } if (attr->ia_valid & ATTR_ATIME_SET) { attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d5c2a3158610..a66e425884d1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1498,8 +1498,10 @@ static int __init init_hugetlbfs_fs(void) /* other hstates are optional */ i = 0; for_each_hstate(h) { - if (i == default_hstate_idx) + if (i == default_hstate_idx) { + i++; continue; + } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) diff --git a/fs/inode.c b/fs/inode.c index aff2b5831168..c7418b0b4168 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -12,6 +12,7 @@ #include <linux/security.h> #include <linux/cdev.h> #include <linux/memblock.h> +#include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> @@ -676,6 +677,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); +again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); @@ -698,6 +700,12 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, 
&dispose); + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } spin_unlock(&sb->s_inode_list_lock); @@ -2218,7 +2226,7 @@ int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags, !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - return 0; + return fscrypt_prepare_setflags(inode, oldflags, flags); } EXPORT_SYMBOL(vfs_ioc_setflags_prepare); diff --git a/fs/internal.h b/fs/internal.h index 4a7da1df573d..f3f280b952a3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -38,7 +38,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) /* * buffer.c */ -extern void guard_bio_eod(int rw, struct bio *bio); +extern void guard_bio_eod(struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, get_block_t *get_block, struct iomap *iomap); @@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); +extern struct open_how build_open_how(int flags, umode_t mode); +extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); long do_faccessat(int dfd, const char __user *filename, int mode); @@ -180,11 +182,11 @@ extern void mnt_pin_kill(struct mount *m); */ extern const struct dentry_operations ns_dentry_operations; -/* - * fs/ioctl.c - */ -extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, - unsigned long arg); - /* direct-io.c: */ int sb_init_dio_done_wq(struct super_block *sb); + +/* + * fs/stat.c: + */ +unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags); +int cp_statx(const struct kstat *stat, struct statx __user *buffer); diff --git a/fs/io-wq.c b/fs/io-wq.c index 74b40506c5d9..cb60a42b9fdf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -49,7 +49,6 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - wait_queue_head_t wait; struct io_wqe *wqe; struct io_wq_work *cur_work; @@ -57,7 +56,8 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; - const struct cred *creds; + const struct cred *cur_creds; + const struct cred *saved_creds; struct files_struct *restore_files; }; @@ -93,7 +93,6 @@ struct io_wqe { struct io_wqe_acct acct[2]; struct hlist_nulls_head free_list; - struct hlist_nulls_head busy_list; struct list_head all_list; struct io_wq *wq; @@ -111,10 +110,10 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - const struct cred *creds; - struct mm_struct *mm; refcount_t refs; struct completion done; + + refcount_t use_refs; }; static bool io_worker_get(struct io_worker *worker) @@ -137,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { bool dropped_lock = false; - if (worker->creds) { - revert_creds(worker->creds); - worker->creds = NULL; + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; } if (current->files != worker->restore_files) { @@ -258,7 +257,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) worker = hlist_nulls_entry(n, struct io_worker, nulls_node); if (io_worker_get(worker)) { - wake_up(&worker->wait); + wake_up_process(worker->task); io_worker_release(worker); return true; } @@ -328,7 +327,6 @@ static void __io_worker_busy(struct 
io_wqe *wqe, struct io_worker *worker, if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list); } /* @@ -366,7 +364,6 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_del_init_rcu(&worker->nulls_node); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } @@ -400,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) return NULL; } +static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) +{ + if (worker->mm) { + unuse_mm(worker->mm); + mmput(worker->mm); + worker->mm = NULL; + } + if (!work->mm) { + set_fs(KERNEL_DS); + return; + } + if (mmget_not_zero(work->mm)) { + use_mm(work->mm); + if (!worker->mm) + set_fs(USER_DS); + worker->mm = work->mm; + /* hang on to this mm */ + work->mm = NULL; + return; + } + + /* failed grabbing mm, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; +} + +static void io_wq_switch_creds(struct io_worker *worker, + struct io_wq_work *work) +{ + const struct cred *old_creds = override_creds(work->creds); + + worker->cur_creds = work->creds; + if (worker->saved_creds) + put_cred(old_creds); /* creds set by previous switch */ + else + worker->saved_creds = old_creds; +} + static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { @@ -433,6 +467,8 @@ next: if (signal_pending(current)) flush_signals(current); + cond_resched(); + spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); @@ -440,20 +476,19 @@ next: if (work->flags & IO_WQ_WORK_CB) work->func(&work); - if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && - current->files != work->files) { + if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; task_unlock(current); } - if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && - wq->mm && mmget_not_zero(wq->mm)) { - use_mm(wq->mm); - set_fs(USER_DS); - worker->mm = wq->mm; - } - if (!worker->creds) - worker->creds = override_creds(wq->creds); + if (work->mm != worker->mm) + io_wq_switch_mm(worker, work); + if (worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); + /* + * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, + * the worker function will do the right thing. 
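io_wq_switch_creds() above keeps two pointers per worker: cur_creds, the credentials active right now, and saved_creds, the ones to restore when the worker stops processing work; only the first override of a stint is saved, later ones are dropped straight away. A toy userspace model of that bookkeeping (all names here are illustrative, not kernel code; override() mimics override_creds() returning the previously active credentials, worker_unuse() mirrors the revert in __io_worker_unuse()):

#include <stdlib.h>

struct cred { int refs; };

static struct cred *get_cred(struct cred *c) { c->refs++; return c; }
static void put_cred(struct cred *c)         { if (--c->refs == 0) free(c); }

static struct cred *active;			/* plays the role of current->cred */

static struct cred *override(struct cred *new)	/* like override_creds() */
{
	struct cred *old = active;

	active = get_cred(new);
	return old;				/* caller now owns this reference */
}

struct worker { struct cred *cur, *saved; };

static void switch_creds(struct worker *w, struct cred *work_creds)
{
	struct cred *old = override(work_creds);

	w->cur = work_creds;
	if (w->saved)
		put_cred(old);	/* set by an earlier switch in this stint: drop it */
	else
		w->saved = old;	/* first switch: keep the original for the revert */
}

static void worker_unuse(struct worker *w)	/* the revert when going idle */
{
	if (w->saved) {
		put_cred(active);		/* drop the overriding creds */
		active = w->saved;
		w->cur = w->saved = NULL;
	}
}

int main(void)
{
	struct cred a = { 1 }, b = { 1 }, c = { 1 };
	struct worker w = { NULL, NULL };

	active = &a;
	switch_creds(&w, &b);	/* saves &a */
	switch_creds(&w, &c);	/* drops the reference taken for &b */
	worker_unuse(&w);	/* back to &a, all refcounts balanced */
	return active == &a ? 0 : 1;
}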
+ */ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; if (worker->mm) @@ -492,28 +527,46 @@ next: } while (1); } +static inline void io_worker_spin_for_work(struct io_wqe *wqe) +{ + int i = 0; + + while (++i < 1000) { + if (io_wqe_run_queue(wqe)) + break; + if (need_resched()) + break; + cpu_relax(); + } +} + static int io_wqe_worker(void *data) { struct io_worker *worker = data; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - DEFINE_WAIT(wait); + bool did_work; io_worker_start(wqe, worker); + did_work = false; while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE); - + set_current_state(TASK_INTERRUPTIBLE); +loop: + if (did_work) + io_worker_spin_for_work(wqe); spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); - continue; + did_work = true; + goto loop; } + did_work = false; /* drops the lock on success, retry */ if (__io_worker_idle(wqe, worker)) { __release(&wqe->lock); - continue; + goto loop; } spin_unlock_irq(&wqe->lock); if (signal_pending(current)) @@ -526,8 +579,6 @@ static int io_wqe_worker(void *data) break; } - finish_wait(&worker->wait, &wait); - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) @@ -589,7 +640,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) refcount_set(&worker->ref, 1); worker->nulls_node.pprev = NULL; - init_waitqueue_head(&worker->wait); worker->wqe = wqe; spin_lock_init(&worker->lock); @@ -703,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + int work_flags; unsigned long flags; /* @@ -717,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } + work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); wq_list_add_tail(&work->list, &wqe->work_list); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); - if (!atomic_read(&acct->nr_running)) + if ((work_flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running)) io_wqe_wake_worker(wqe, acct); } @@ -784,10 +837,6 @@ void io_wq_cancel_all(struct io_wq *wq) set_bit(IO_WQ_BIT_CANCEL, &wq->state); - /* - * Browse both lists, as there's a gap between handing work off - * to a worker and the worker putting itself on the busy_list - */ rcu_read_lock(); for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; @@ -815,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) */ spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && data->cancel(worker->cur_work, data->caller_data)) { send_sig(SIGINT, worker->task, 1); ret = true; @@ -889,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return false; spin_lock_irqsave(&worker->lock, flags); - if (worker->cur_work == work) { + if (worker->cur_work == work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { send_sig(SIGINT, worker->task, 1); ret = true; } @@ -934,7 +985,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, /* * Now check if a free (going busy) or busy worker has the work * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempte to signal cancellation. 
The + * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. */ rcu_read_lock(); @@ -1013,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; - wq->creds = data->creds; for_each_node(node) { struct io_wqe *wqe; @@ -1035,15 +1085,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); INIT_LIST_HEAD(&wqe->all_list); } init_completion(&wq->done); - /* caller must have already done mmgrab() on this mm */ - wq->mm = data->mm; - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { wake_up_process(wq->manager); @@ -1052,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = -ENOMEM; goto err; } + refcount_set(&wq->use_refs, 1); reinit_completion(&wq->done); return wq; } @@ -1066,13 +1113,21 @@ err: return ERR_PTR(ret); } +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) +{ + if (data->get_work != wq->get_work || data->put_work != wq->put_work) + return false; + + return refcount_inc_not_zero(&wq->use_refs); +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { wake_up_process(worker->task); return false; } -void io_wq_destroy(struct io_wq *wq) +static void __io_wq_destroy(struct io_wq *wq) { int node; @@ -1092,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq) kfree(wq->wqes); kfree(wq); } + +void io_wq_destroy(struct io_wq *wq) +{ + if (refcount_dec_and_test(&wq->use_refs)) + __io_wq_destroy(wq); +} diff --git a/fs/io-wq.h b/fs/io-wq.h index 7c333a28e2a7..50b3378febf2 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -7,11 +7,11 @@ enum { IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_NEEDS_USER = 8, - IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_INTERNAL = 64, IO_WQ_WORK_CB = 128, + IO_WQ_WORK_NO_CANCEL = 256, + IO_WQ_WORK_CONCURRENT = 512, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -35,7 +35,8 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { if (!list->first) { - list->first = list->last = node; + list->last = node; + WRITE_ONCE(list->first, node); } else { list->last->next = node; list->last = node; @@ -47,7 +48,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, struct io_wq_work_node *prev) { if (node == list->first) - list->first = node->next; + WRITE_ONCE(list->first, node->next); if (node == list->last) list->last = prev; if (prev) @@ -58,7 +59,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) -#define wq_list_empty(list) ((list)->first == NULL) +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) #define INIT_WQ_LIST(list) do { \ (list)->first = NULL; \ (list)->last = NULL; \ @@ -71,6 +72,8 @@ struct io_wq_work { }; void (*func)(struct io_wq_work **); struct files_struct *files; + struct mm_struct *mm; + const struct cred *creds; unsigned flags; }; @@ -80,21 +83,22 @@ struct io_wq_work { (work)->func = _func; \ (work)->flags = 0; \ (work)->files = NULL; \ + (work)->mm = NULL; \ + (work)->creds = NULL; \ } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); typedef void 
(put_work_fn)(struct io_wq_work *); struct io_wq_data { - struct mm_struct *mm; struct user_struct *user; - const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); @@ -119,6 +123,10 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk) static inline void io_wq_worker_running(struct task_struct *tsk) { } -#endif /* CONFIG_IO_WQ */ +#endif -#endif /* INTERNAL_IO_WQ_H */ +static inline bool io_wq_current_is_worker(void) +{ + return in_task() && (current->flags & PF_IO_WORKER); +} +#endif diff --git a/fs/io_uring.c b/fs/io_uring.c index 405be10da73d..1806afddfea5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -46,6 +46,7 @@ #include <linux/compat.h> #include <linux/refcount.h> #include <linux/uio.h> +#include <linux/bits.h> #include <linux/sched/signal.h> #include <linux/fs.h> @@ -70,6 +71,10 @@ #include <linux/sizes.h> #include <linux/hugetlb.h> #include <linux/highmem.h> +#include <linux/namei.h> +#include <linux/fsnotify.h> +#include <linux/fadvise.h> +#include <linux/eventpoll.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -177,6 +182,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -184,10 +204,11 @@ struct io_ring_ctx { struct { unsigned int flags; - bool compat; - bool account_mem; - bool cq_overflow_flushed; - bool drain_next; + int compat: 1; + int account_mem: 1; + int cq_overflow_flushed: 1; + int drain_next: 1; + int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -207,13 +228,14 @@ struct io_ring_ctx { unsigned sq_thread_idle; unsigned cached_sq_dropped; atomic_t cached_cq_overflow; - struct io_uring_sqe *sq_sqes; + unsigned long sq_check_overflow; struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; wait_queue_head_t inflight_wait; + struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -229,8 +251,10 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; + int ring_fd; + struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -250,11 +274,14 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr personality_idr; + struct { unsigned cached_cq_tail; unsigned cq_entries; unsigned cq_mask; atomic_t cq_timeouts; + unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; @@ -267,7 +294,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - bool poll_multi_file; + struct llist_head poll_llist; + /* * ->poll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. 
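The io-wq.h hunk a little above has wq_list_add_tail() publish the head pointer with WRITE_ONCE() and wq_list_empty() read it with READ_ONCE(), so a check for pending work done without wqe->lock sees either NULL or a fully published pointer rather than a torn or compiler-cached value. A minimal userspace rendering of the pattern (the volatile-cast macros only approximate the kernel's READ_ONCE/WRITE_ONCE, and in the kernel the list is still mutated under the lock; sketch only, names illustrative):

#include <stddef.h>
#include <stdio.h>

#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

struct node { struct node *next; };

struct list {
	struct node *first;
	struct node *last;
};

/* mirrors the reworked wq_list_add_tail(): set ->last first, then publish
 * ->first with one untorn store */
static void list_add_tail(struct node *n, struct list *l)
{
	if (!l->first) {
		l->last = n;
		WRITE_ONCE(l->first, n);
	} else {
		l->last->next = n;
		l->last = n;
	}
}

static int list_empty(struct list *l)	/* like wq_list_empty() */
{
	return READ_ONCE(l->first) == NULL;
}

int main(void)
{
	struct list l = { NULL, NULL };
	struct node a = { NULL };

	printf("%d\n", list_empty(&l));	/* 1 */
	list_add_tail(&a, &l);
	printf("%d\n", list_empty(&l));	/* 0 */
	return 0;
}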
@@ -277,6 +305,7 @@ struct io_ring_ctx { struct list_head poll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; + bool poll_multi_file; spinlock_t inflight_lock; struct list_head inflight_list; @@ -289,11 +318,20 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - struct wait_queue_head *head; + union { + struct wait_queue_head *head; + u64 addr; + }; __poll_t events; bool done; bool canceled; - struct wait_queue_entry *wait; + struct wait_queue_entry wait; +}; + +struct io_close { + struct file *file; + struct file *put_file; + int fd; }; struct io_timeout_data { @@ -304,6 +342,96 @@ struct io_timeout_data { u32 seq_offset; }; +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; +}; + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; + int mode; +}; + +struct io_cancel { + struct file *file; + u64 addr; +}; + +struct io_timeout { + struct file *file; + u64 addr; + int flags; + unsigned count; +}; + +struct io_rw { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct kiocb kiocb; + u64 addr; + u64 len; +}; + +struct io_connect { + struct file *file; + struct sockaddr __user *addr; + int addr_len; +}; + +struct io_sr_msg { + struct file *file; + union { + struct user_msghdr __user *msg; + void __user *buf; + }; + int msg_flags; + size_t len; +}; + +struct io_open { + struct file *file; + int dfd; + union { + unsigned mask; + }; + struct filename *filename; + struct statx __user *buffer; + struct open_how how; +}; + +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -322,16 +450,79 @@ struct io_async_rw { ssize_t size; }; +struct io_async_open { + struct filename *filename; +}; + struct io_async_ctx { - struct io_uring_sqe sqe; union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; + struct io_async_open open; }; }; +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + + REQ_F_LINK_NEXT_BIT, + REQ_F_FAIL_LINK_BIT, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_IOPOLL_COMPLETED_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_TIMEOUT_BIT, + REQ_F_ISREG_BIT, + REQ_F_MUST_PUNT_BIT, + REQ_F_TIMEOUT_NOSEQ_BIT, + REQ_F_COMP_LOCKED_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + + /* already grabbed next link */ + REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* fail rest of links */ + REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + /* on inflight list */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to 
workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* polled IO has completed */ + REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT), + /* has linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* timeout request */ + REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* must be punted even for NONBLOCK */ + REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), + /* no timeout sequence */ + REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), + /* completion under lock */ + REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -341,17 +532,31 @@ struct io_async_ctx { struct io_kiocb { union { struct file *file; - struct kiocb rw; + struct io_rw rw; struct io_poll_iocb poll; + struct io_accept accept; + struct io_sync sync; + struct io_cancel cancel; + struct io_timeout timeout; + struct io_connect connect; + struct io_sr_msg sr_msg; + struct io_open open; + struct io_close close; + struct io_files_update files_update; + struct io_fadvise fadvise; + struct io_madvise madvise; + struct io_epoll epoll; }; - const struct io_uring_sqe *sqe; struct io_async_ctx *io; - struct file *ring_file; - int ring_fd; + /* + * llist_node is only used for poll deferred completions + */ + struct llist_node llist_node; bool has_user; bool in_async; bool needs_fixed_file; + u8 opcode; struct io_ring_ctx *ctx; union { @@ -361,22 +566,6 @@ struct io_kiocb { struct list_head link_list; unsigned int flags; refcount_t refs; -#define REQ_F_NOWAIT 1 /* must not punt to workers */ -#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ -#define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ -#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_IO_DRAINED 32 /* drain done */ -#define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ -#define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ -#define REQ_F_TIMEOUT 1024 /* timeout request */ -#define REQ_F_ISREG 2048 /* regular file */ -#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ -#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ -#define REQ_F_INFLIGHT 16384 /* on inflight list */ -#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ u64 user_data; u32 result; u32 sequence; @@ -409,14 +598,162 @@ struct io_submit_state { unsigned int ios_left; }; +struct io_op_def { + /* needs req->io allocated for deferral/async */ + unsigned async_ctx : 1; + /* needs current->mm setup, does mm access */ + unsigned needs_mm : 1; + /* needs req->file assigned */ + unsigned needs_file : 1; + /* needs req->file assigned IFF fd is >= 0 */ + unsigned fd_non_neg : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* needs file table */ + unsigned file_table : 1; +}; + +static const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = {}, + [IORING_OP_READV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITEV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + 
[IORING_OP_FSYNC] = { + .needs_file = 1, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_REMOVE] = {}, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + }, + [IORING_OP_SENDMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECVMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_TIMEOUT] = { + .async_ctx = 1, + .needs_mm = 1, + }, + [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_ACCEPT] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + .file_table = 1, + }, + [IORING_OP_ASYNC_CANCEL] = {}, + [IORING_OP_LINK_TIMEOUT] = { + .async_ctx = 1, + .needs_mm = 1, + }, + [IORING_OP_CONNECT] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + }, + [IORING_OP_OPENAT] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_CLOSE] = { + .needs_file = 1, + .file_table = 1, + }, + [IORING_OP_FILES_UPDATE] = { + .needs_mm = 1, + .file_table = 1, + }, + [IORING_OP_STATX] = { + .needs_mm = 1, + .needs_file = 1, + .fd_non_neg = 1, + }, + [IORING_OP_READ] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + }, + [IORING_OP_MADVISE] = { + .needs_mm = 1, + }, + [IORING_OP_SEND] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECV] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_OPENAT2] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .file_table = 1, + }, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); -static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); -static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); +static int io_grab_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -483,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -512,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req) static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + if (unlikely(req->flags & REQ_F_IO_DRAIN)) return __req_need_defer(req); return false; @@ -552,56 +891,54 @@ static void 
__io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { - /* order cqe stores with ring update */ - smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); + /* order cqe stores with ring update */ + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } -static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +static inline void io_req_work_grab_env(struct io_kiocb *req, + const struct io_op_def *def) { - u8 opcode = READ_ONCE(sqe->opcode); + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); +} - return !(opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED); +static inline void io_req_work_drop_env(struct io_kiocb *req) +{ + if (req->work.mm) { + mmdrop(req->work.mm); + req->work.mm = NULL; + } + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } } static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { + const struct io_op_def *def = &io_op_defs[req->opcode]; bool do_hashed = false; - if (req->sqe) { - switch (req->sqe->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file) do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) - req->work.flags |= IO_WQ_WORK_UNBOUND; - break; - } - if (io_sqe_needs_user(req->sqe)) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + } else { + if (def->unbound_nonreg_file) + req->work.flags |= IO_WQ_WORK_UNBOUND; } + io_req_work_grab_env(req, def); + *link = io_prep_linked_timeout(req); return do_hashed; } @@ -659,10 +996,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) { - req->flags |= REQ_F_IO_DRAINED; + while ((req = io_get_deferred_req(ctx)) != NULL) io_queue_async_work(req); - } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) @@ -683,13 +1018,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +{ + if (!ctx->eventfd_async) + return true; + return io_wq_current_is_worker() || in_interrupt(); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) + if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -714,7 +1056,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* if force is set, the ring is going away. 
always drop after that */ if (force) - ctx->cq_overflow_flushed = true; + ctx->cq_overflow_flushed = 1; cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { @@ -736,6 +1078,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); + if (cqe) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -769,6 +1115,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(0, &ctx->sq_check_overflow); + set_bit(0, &ctx->cq_check_overflow); + } refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -811,9 +1161,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!percpu_ref_tryget(&ctx->refs)) - return NULL; - if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) @@ -846,7 +1193,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, got_it: req->io = NULL; - req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -863,24 +1209,35 @@ fallback: return NULL; } -static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +static void __io_req_do_free(struct io_kiocb *req) +{ + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); +} + +static void __io_req_aux_free(struct io_kiocb *req) { - if (*nr) { - kmem_cache_free_bulk(req_cachep, *nr, reqs); - percpu_ref_put_many(&ctx->refs, *nr); - *nr = 0; + struct io_ring_ctx *ctx = req->ctx; + + kfree(req->io); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); } + + io_req_work_drop_env(req); } static void __io_free_req(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + __io_req_aux_free(req); - if (req->io) - kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); @@ -889,11 +1246,63 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - percpu_ref_put(&ctx->refs); - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); + + percpu_ref_put(&req->ctx->refs); + __io_req_do_free(req); +} + +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; + int need_iter; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +{ + int fixed_refs = rb->to_free; + + if (!rb->to_free) + return; + if (rb->need_iter) { + int i, inflight = 0; + unsigned long flags; + + fixed_refs = 0; + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_FIXED_FILE) { + req->file = NULL; + fixed_refs++; + } + if (req->flags & REQ_F_INFLIGHT) + inflight++; + __io_req_aux_free(req); + } + if (!inflight) + goto do_free; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_INFLIGHT) { + 
list_del(&req->inflight_entry); + if (!--inflight) + break; + } + } + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + } +do_free: + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + if (fixed_refs) + percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = rb->need_iter = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -969,7 +1378,7 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); @@ -1066,19 +1475,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; - /* - * noflush == true is from the waitqueue handler, just ensure we wake - * up the task, and the next invocation will flush the entries. We - * cannot safely to it from here. - */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) - return -1U; + if (test_bit(0, &ctx->cq_check_overflow)) { + /* + * noflush == true is from the waitqueue handler, just ensure + * we wake up the task, and the next invocation will flush the + * entries. We cannot safely to it from here. + */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false); + } /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); + return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -1089,17 +1500,30 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } +static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +{ + if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req)) + return false; + + if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + rb->need_iter++; + + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); + return true; +} + /* * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { - void *reqs[IO_IOPOLL_BATCH]; + struct req_batch rb; struct io_kiocb *req; - int to_free; - to_free = 0; + rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -1107,26 +1531,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_cqring_fill_event(req, req->result); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) { - /* If we're not using fixed files, we have to pair the - * completion part with the file put. Use regular - * completions for those, only batch free for fixed - * file and non-linked commands. 
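The req_batch machinery above replaces the old open-coded reqs[]/to_free pair: completed requests are parked in a small array and released in one go with kmem_cache_free_bulk() plus a single percpu_ref_put_many() for the ctx and fixed-file refs. A stripped-down userspace sketch of just the batching pattern (batch_add/batch_flush are illustrative names; free() stands in for the bulk slab free):

#include <stdlib.h>

#define BATCH 64

struct batch {
	void *ptrs[BATCH];
	int   to_free;
};

/* flush: one "bulk" release for everything collected so far; in the kernel
 * this is kmem_cache_free_bulk() plus percpu_ref_put_many() */
static void batch_flush(struct batch *b)
{
	int i;

	if (!b->to_free)
		return;
	for (i = 0; i < b->to_free; i++)
		free(b->ptrs[i]);
	b->to_free = 0;
}

/* add: defer the free, spilling automatically when the array fills up,
 * mirroring io_req_multi_free() */
static void batch_add(struct batch *b, void *p)
{
	b->ptrs[b->to_free++] = p;
	if (b->to_free == BATCH)
		batch_flush(b);
}

int main(void)
{
	struct batch b = { .to_free = 0 };
	int i;

	for (i = 0; i < 1000; i++)
		batch_add(&b, malloc(32));
	batch_flush(&b);	/* release the tail, as io_iopoll_complete() does */
	return 0;
}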
- */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && - !req->io) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - io_free_req(req); - } - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) + io_free_req(req); } io_commit_cqring(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1145,7 +1556,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, ret = 0; list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; /* * Move completed entries to our local list. If we find a @@ -1175,7 +1586,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } /* - * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a * non-spinning poll check - we'll still enter the driver poll loop, but only * as a non-spinning completion check. */ @@ -1292,21 +1703,27 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_complete_rw_common(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); io_cqring_add_event(req, res); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); io_complete_rw_common(kiocb, res); io_put_req(req); @@ -1314,7 +1731,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_kiocb *nxt = NULL; io_complete_rw_common(kiocb, res); @@ -1325,13 +1742,13 @@ static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); req->result = res; if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; @@ -1359,7 +1776,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, list); - if (list_req->rw.ki_filp != req->rw.ki_filp) + if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -1422,7 +1839,7 @@ static bool io_file_supports_async(struct file *file) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || 
S_ISCHR(mode)) + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; @@ -1430,11 +1847,11 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; unsigned ioprio; int ret; @@ -1445,6 +1862,10 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); + if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1483,6 +1904,12 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) return -EINVAL; kiocb->ki_complete = io_complete_rw; } + + req->rw.addr = READ_ONCE(sqe->addr); + req->rw.len = READ_ONCE(sqe->len); + /* we own ->private, reuse it for the buffer index */ + req->rw.kiocb.private = (void *) (unsigned long) + READ_ONCE(sqe->buf_index); return 0; } @@ -1510,17 +1937,21 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = kiocb->ki_pos; if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else io_rw_done(kiocb, ret); } -static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, +static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { - size_t len = READ_ONCE(sqe->len); + struct io_ring_ctx *ctx = req->ctx; + size_t len = req->rw.len; struct io_mapped_ubuf *imu; unsigned index, buf_index; size_t offset; @@ -1530,13 +1961,13 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, if (unlikely(!ctx->user_bufs)) return -EFAULT; - buf_index = READ_ONCE(sqe->buf_index); + buf_index = (unsigned long) req->rw.kiocb.private; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = &ctx->user_bufs[index]; - buf_addr = READ_ONCE(sqe->addr); + buf_addr = req->rw.addr; /* overflow */ if (buf_addr + len < buf_addr) @@ -1593,23 +2024,25 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = req->sqe; - void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - size_t sqe_len = READ_ONCE(sqe->len); + void __user *buf = u64_to_user_ptr(req->rw.addr); + size_t sqe_len = req->rw.len; u8 opcode; - /* - * We're reading ->opcode for the second time, but the first read - * doesn't care whether it's _FIXED or not, so it doesn't matter - * whether ->opcode changes concurrently. The first read does care - * about whether it is a READ or a WRITE, so we don't trust this read - * for that purpose and instead let the caller pass in the read/write - * flag. 
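A little above, io_prep_rw() starts treating an offset of -1 on a non-stream file as "use and update the file position" (REQ_F_CUR_POS), giving io_uring a read(2)-style counterpart to its usual pread(2)-style submissions, with kiocb_done() writing the new position back to f_pos. Assuming liburing and a kernel with this series, a caller could then issue a sequential read like this (sketch with error handling trimmed; the path is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("/etc/hostname", O_RDONLY);

	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	/* offset -1: read at, and advance, the file's current position */
	io_uring_prep_readv(sqe, fd, &iov, 1, -1);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("read %d bytes\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}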
- */ - opcode = READ_ONCE(sqe->opcode); + opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return io_import_fixed(req->ctx, rw, sqe, iter); + return io_import_fixed(req, rw, iter); + } + + /* buffer index only valid with fixed read/write */ + if (req->rw.kiocb.private) + return -EINVAL; + + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + ssize_t ret; + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); + *iovec = NULL; + return ret; } if (req->io) { @@ -1692,7 +2125,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, +static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { @@ -1706,57 +2139,86 @@ static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, } } -static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, +static int io_alloc_async_ctx(struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].async_ctx) + return 0; + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + return req->io == NULL; +} + +static void io_rw_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; + + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->rw.iov; + io_wq_submit_work(workptr); + kfree(iov); +} + +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - if (req->io) { - io_req_map_io(req, io_size, iovec, fast_iov, iter); - memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); - req->sqe = &req->io->sqe; + if (!io_op_defs[req->opcode].async_ctx) return 0; - } + if (!req->io && io_alloc_async_ctx(req)) + return -ENOMEM; - return -ENOMEM; + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + req->work.func = io_rw_async; + return 0; } -static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - ret = io_prep_rw(req, force_nonblock); + ret = io_prep_rw(req, sqe, force_nonblock); if (ret) return ret; if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - return io_import_iovec(READ, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t io_size, ret; - if (!req->io) { - ret = io_read_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; - file = req->file; + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + 
req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + + req->result = 0; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1765,39 +2227,27 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(file)) { + if (force_nonblock && !io_file_supports_async(req->file)) { req->flags |= REQ_F_MUST_PUNT; goto copy_iov; } iov_count = iov_iter_count(&iter); - ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; - if (file->f_op->read_iter) - ret2 = call_read_iter(file, kiocb, &iter); + if (req->file->f_op->read_iter) + ret2 = call_read_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(READ, file, kiocb, &iter); + ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); - /* - * In case of a short read, punt to async. This can happen - * if we have data partially cached. Alternatively we can - * return the short read, in which case the application will - * need to issue another SQE and wait for it. That SQE will - * need async punt anyway, so it's more efficient to do it - * here. - */ - if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && - (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < io_size) - ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { copy_iov: - ret = io_setup_async_io(req, io_size, iovec, + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; @@ -1805,46 +2255,58 @@ copy_iov: } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } -static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - ret = io_prep_rw(req, force_nonblock); + ret = io_prep_rw(req, sqe, force_nonblock); if (ret) return ret; if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - return io_import_iovec(WRITE, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t ret, io_size; - if (!req->io) { - ret = io_write_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; + + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - file = kiocb->ki_filp; + req->result = 0; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1858,11 +2320,13 @@ static int io_write(struct io_kiocb *req, struct io_kiocb 
**nxt, goto copy_iov; } - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) goto copy_iov; iov_count = iov_iter_count(&iter); - ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1874,22 +2338,22 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * we return to userspace. */ if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(file)->i_sb, + __sb_start_write(file_inode(req->file)->i_sb, SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(file)->i_sb, + __sb_writers_release(file_inode(req->file)->i_sb, SB_FREEZE_WRITE); } kiocb->ki_flags |= IOCB_WRITE; - if (file->f_op->write_iter) - ret2 = call_write_iter(file, kiocb, &iter); + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { copy_iov: - ret = io_setup_async_io(req, io_size, iovec, + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; @@ -1897,7 +2361,8 @@ copy_iov: } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } @@ -1928,45 +2393,507 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; + req->sync.flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); return 0; } -static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static bool io_req_cancelled(struct io_kiocb *req) { - loff_t sqe_off = READ_ONCE(sqe->off); - loff_t sqe_len = READ_ONCE(sqe->len); - loff_t end = sqe_off + sqe_len; - unsigned fsync_flags; + if (req->work.flags & IO_WQ_WORK_CANCEL) { + req_set_fail_links(req); + io_cqring_add_event(req, -ECANCELED); + io_put_req(req); + return true; + } + + return false; +} + +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_kiocb *link = work->data; + + io_queue_linked_timeout(link); + work->func = io_wq_submit_work; +} + +static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +{ + struct io_kiocb *link; + + io_prep_async_work(nxt, &link); + *workptr = &nxt->work; + if (link) { + nxt->work.flags |= IO_WQ_WORK_CB; + nxt->work.func = io_link_work_cb; + nxt->work.data = link; + } +} + +static void io_fsync_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + loff_t end = req->sync.off + req->sync.len; + struct io_kiocb *nxt = NULL; int ret; - fsync_flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) + if (io_req_cancelled(req)) + return; + + ret = vfs_fsync_range(req->file, req->sync.off, + end > 0 ? 
end : LLONG_MAX, + req->sync.flags & IORING_FSYNC_DATASYNC); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fsync always requires a blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fsync_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fsync_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; +} + +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fallocate_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); + req->sync.mode = READ_ONCE(sqe->len); + return 0; +} + +static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fallocate always requiring blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fallocate_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fallocate_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + + return 0; +} + +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + int ret; + + if (sqe->ioprio || sqe->buf_index) return -EINVAL; - ret = io_prep_fsync(req, sqe); + req->open.dfd = READ_ONCE(sqe->fd); + req->open.how.mode = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.how.flags = READ_ONCE(sqe->open_flags); + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct open_how __user *how; + const char __user *fname; + size_t len; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, + len); if (ret) return ret; - /* fsync always requires a blocking context */ + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct open_flags op; + struct file *file; + int ret; + if (force_nonblock) return -EAGAIN; - ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, - end > 0 ? 
end : LLONG_MAX, - fsync_flags & IORING_FSYNC_DATASYNC); + ret = build_open_flags(&req->open.how, &op); + if (ret) + goto err; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + ret = get_unused_fd_flags(req->open.how.flags); + if (ret < 0) + goto err; + + file = do_filp_open(req->open.dfd, req->open.filename, &op); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + } else { + fsnotify_open(file); + fd_install(ret, file); + } +err: + putname(req->open.filename); + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; } +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); + return io_openat2(req, nxt, force_nonblock); +} + +static int io_epoll_ctl_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); + req->epoll.fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(req->epoll.op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_EPOLL) + struct io_epoll *ie = &req->epoll; + int ret; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); + req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = &req->madvise; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_madvise(ma->addr, ma->len, ma->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); + req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_fadvise *fa = &req->fadvise; + int ret; + + /* DONTNEED may block, others _should_ not */ + if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) + return -EAGAIN; + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int 
io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + unsigned lookup_flags; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mask = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->open.how.flags = READ_ONCE(sqe->statx_flags); + + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) + return -EINVAL; + + req->open.filename = getname_flags(fname, lookup_flags, NULL); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_open *ctx = &req->open; + unsigned lookup_flags; + struct path path; + struct kstat stat; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) + return -EINVAL; + +retry: + /* filename_lookup() drops it, keep a reference */ + ctx->filename->refcnt++; + + ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, + NULL); + if (ret) + goto err; + + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + if (!ret) + ret = cp_statx(&stat, ctx->buffer); +err: + putname(ctx->filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If we queue this for async, it must not be cancellable. That would + * leave the 'file' in an undeterminate state. + */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EINVAL; + + req->close.fd = READ_ONCE(sqe->fd); + if (req->file->f_op == &io_uring_fops || + req->close.fd == req->ctx->ring_fd) + return -EBADF; + + return 0; +} + +static void io_close_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + /* Invoked with files, we need to do the close */ + if (req->work.files) { + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) { + req_set_fail_links(req); + } + io_cqring_add_event(req, ret); + } + + fput(req->close.put_file); + + /* we bypassed the re-issue, drop the submission reference */ + io_put_req(req); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + int ret; + + req->close.put_file = NULL; + ret = __close_fd_get_file(req->close.fd, &req->close.put_file); + if (ret < 0) + return ret; + + /* if the file has a flush method, be safe and punt to async */ + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) + goto eagain; + + /* + * No ->flush(), safely close from here and just punt the + * fput() to async context. 
+ */ + ret = filp_close(req->close.put_file, current->files); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + + if (io_wq_current_is_worker()) { + struct io_wq_work *old_work, *work; + + old_work = work = &req->work; + io_close_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; + } + +eagain: + req->work.func = io_close_finish; + return -EAGAIN; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - int ret = 0; if (!req->file) return -EBADF; @@ -1976,59 +2903,89 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - return ret; + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->sync.flags = READ_ONCE(sqe->sync_range_flags); + return 0; } -static int io_sync_file_range(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, - bool force_nonblock) +static void io_sync_file_range_finish(struct io_wq_work **workptr) { - loff_t sqe_off; - loff_t sqe_len; - unsigned flags; + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; int ret; - ret = io_prep_sfr(req, sqe); - if (ret) - return ret; + if (io_req_cancelled(req)) + return; + + ret = sync_file_range(req->file, req->sync.off, req->sync.len, + req->sync.flags); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; /* sync_file_range always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_sync_file_range_finish; return -EAGAIN; + } - sqe_off = READ_ONCE(sqe->off); - sqe_len = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->sync_range_flags); + work = old_work = &req->work; + io_sync_file_range_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; +} - ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); +#if defined(CONFIG_NET) +static void io_sendrecv_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); - return 0; + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->msg.iov; + io_wq_submit_work(workptr); + kfree(iov); } +#endif -static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct io_sr_msg *sr = &req->sr_msg; + struct io_async_ctx *io = req->io; + + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); + + if (!io || req->opcode == IORING_OP_SEND) + return 0; - flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); - return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); + io->msg.iov = 
io->msg.fast_iov; + return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.iov); #else - return 0; + return -EOPNOTSUPP; #endif } -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2037,50 +2994,55 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct io_async_ctx io, *copy; + struct io_async_ctx io; struct sockaddr_storage addr; - struct msghdr *kmsg; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - if (req->io) { - kmsg = &req->io->msg.msg; - kmsg->msg_name = &addr; + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg.msg; - kmsg->msg_name = &addr; + struct io_sr_msg *sr = &req->sr_msg; + + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + io.msg.iov = io.msg.fast_iov; - ret = io_sendmsg_prep(req, &io); + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.iov); if (ret) - goto out; + return ret; } - ret = __sys_sendmsg_sock(sock, kmsg, flags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (force_nonblock && ret == -EAGAIN) { - copy = kmalloc(sizeof(*copy), GFP_KERNEL); - if (!copy) { - ret = -ENOMEM; - goto out; - } - memcpy(&copy->msg, &io.msg, sizeof(copy->msg)); - req->io = copy; - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &req->io->sqe; - return ret; + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } -out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; #else @@ -2088,26 +3050,82 @@ out: #endif } -static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct socket *sock; + int ret; - flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); - return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, - &io->msg.iov); -#else + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = 
req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_recvmsg_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_NET) + struct io_sr_msg *sr = &req->sr_msg; + struct io_async_ctx *io = req->io; + + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (!io || req->opcode == IORING_OP_RECV) + return 0; + + io->msg.iov = io->msg.fast_iov; + return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.uaddr, &io->msg.iov); +#else + return -EOPNOTSUPP; #endif } -static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2116,53 +3134,57 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; - struct io_async_ctx io, *copy; + struct io_async_ctx io; struct sockaddr_storage addr; - struct msghdr *kmsg; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); if (req->io) { - kmsg = &req->io->msg.msg; - kmsg->msg_name = &addr; + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg.msg; - kmsg->msg_name = &addr; + struct io_sr_msg *sr = &req->sr_msg; + + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + io.msg.iov = io.msg.fast_iov; - ret = io_recvmsg_prep(req, &io); + ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.uaddr, + &io.msg.iov); if (ret) - goto out; + return ret; } - ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, + kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { - copy = kmalloc(sizeof(*copy), GFP_KERNEL); - if (!copy) { - ret = -ENOMEM; - goto out; - } - memcpy(copy, &io, sizeof(*copy)); - req->io = copy; - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &req->io->sqe; - return ret; + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } -out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; #else @@ -2170,101 +3192,193 @@ 
out: #endif } -static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) - struct sockaddr __user *addr; - int __user *addr_len; - unsigned file_flags; - int flags, ret; + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(READ, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + + +static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_NET) + struct io_accept *accept = &req->accept; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); - addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); - flags = READ_ONCE(sqe->accept_flags); - file_flags = force_nonblock ? O_NONBLOCK : 0; + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + accept->flags = READ_ONCE(sqe->accept_flags); + return 0; +#else + return -EOPNOTSUPP; +#endif +} - ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); - if (ret == -EAGAIN && force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; +#if defined(CONFIG_NET) +static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_accept *accept = &req->accept; + unsigned file_flags; + int ret; + + file_flags = force_nonblock ? 
O_NONBLOCK : 0; + ret = __sys_accept4_file(req->file, file_flags, accept->addr, + accept->addr_len, accept->flags); + if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - } if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; -#else - return -EOPNOTSUPP; -#endif } -static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +static void io_accept_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_accept(req, &nxt, false); + if (nxt) + io_wq_assign_next(workptr, nxt); +} +#endif + +static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct sockaddr __user *addr; - int addr_len; + int ret; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); - addr_len = READ_ONCE(sqe->addr2); - return move_addr_to_kernel(addr, addr_len, &io->connect.address); -#else + ret = __io_accept(req, nxt, force_nonblock); + if (ret == -EAGAIN && force_nonblock) { + req->work.func = io_accept_finish; + io_put_req(req); + return -EAGAIN; + } return 0; +#else + return -EOPNOTSUPP; #endif } -static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - struct io_async_ctx __io, *io; - unsigned file_flags; - int addr_len, ret; + struct io_connect *conn = &req->connect; + struct io_async_ctx *io = req->io; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - addr_len = READ_ONCE(sqe->addr2); - file_flags = force_nonblock ? O_NONBLOCK : 0; + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + conn->addr_len = READ_ONCE(sqe->addr2); + + if (!io) + return 0; + + return move_addr_to_kernel(conn->addr, conn->addr_len, + &io->connect.address); +#else + return -EOPNOTSUPP; +#endif +} + +static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct io_async_ctx __io, *io; + unsigned file_flags; + int ret; if (req->io) { io = req->io; } else { - ret = io_connect_prep(req, &__io); + ret = move_addr_to_kernel(req->connect.addr, + req->connect.addr_len, + &__io.connect.address); if (ret) goto out; io = &__io; } - ret = __sys_connect_file(req->file, &io->connect.address, addr_len, - file_flags); + file_flags = force_nonblock ? 
O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, &io->connect.address, + req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { ret = -ENOMEM; goto out; } - memcpy(&io->connect, &__io.connect, sizeof(io->connect)); - req->io = io; - memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &io->sqe; + memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; out: - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2279,8 +3393,8 @@ static void io_poll_remove_one(struct io_kiocb *req) spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait->entry)) { - list_del_init(&poll->wait->entry); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); @@ -2320,28 +3434,37 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) return -ENOENT; } +static int io_poll_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + req->poll.addr = READ_ONCE(sqe->addr); + return 0; +} + /* * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. */ -static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + u64 addr; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) - return -EINVAL; - + addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } @@ -2351,7 +3474,6 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - kfree(req->poll.wait); if (error) io_cqring_fill_event(req, error); else @@ -2389,7 +3511,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) */ spin_lock_irq(&ctx->completion_lock); if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, poll->wait); + add_wait_queue(poll->head, &poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } @@ -2399,11 +3521,44 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, &nxt); if (nxt) - *workptr = &nxt->work; + io_wq_assign_next(workptr, nxt); +} + +static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +{ + struct io_kiocb *req, *tmp; + struct req_batch rb; + + rb.to_free = rb.need_iter = 0; + spin_lock_irq(&ctx->completion_lock); + llist_for_each_entry_safe(req, tmp, nodes, llist_node) { 
+ hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + io_free_req_many(ctx, &rb); +} + +static void io_poll_flush(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct llist_node *nodes; + + nodes = llist_del_all(&req->ctx->poll_llist); + if (nodes) + __io_poll_flush(req->ctx, nodes); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -2413,13 +3568,12 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); /* * Run completion inline if we can. We're using trylock here because @@ -2427,17 +3581,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * If we have a link timeout we're going to need the completion_lock * for finalizing the request, mark us as having grabbed that already. */ - if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (mask) { + unsigned long flags; - io_cqring_ev_posted(ctx); - } else { - io_queue_async_work(req); + if (llist_empty(&ctx->poll_llist) && + spin_trylock_irqsave(&ctx->completion_lock, flags)) { + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + req = NULL; + } else { + req->result = mask; + req->llist_node.next = NULL; + /* if the list wasn't empty, we're done */ + if (!llist_add(&req->llist_node, &ctx->poll_llist)) + req = NULL; + else + req->work.func = io_poll_flush; + } } + if (req) + io_queue_async_work(req); return 1; } @@ -2460,7 +3628,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, pt->req->poll.wait); + add_wait_queue(head, &pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) @@ -2472,14 +3640,9 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; - struct io_poll_table ipt; - bool cancel = false; - __poll_t mask; u16 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -2489,14 +3652,20 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); - if (!poll->wait) - return -ENOMEM; - - req->io = NULL; - INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | 
EPOLLHUP; + return 0; +} + +static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + bool cancel = false; + __poll_t mask; + + INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; @@ -2509,9 +3678,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait->entry); - init_waitqueue_func_entry(poll->wait, io_poll_wake); - poll->wait->private = poll; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + poll->wait.private = poll; INIT_LIST_HEAD(&req->list); @@ -2520,14 +3689,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait->entry))) { + if (unlikely(list_empty(&poll->wait.entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ @@ -2567,7 +3736,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) /* * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the the cq_tail + * will consume a slot in the cq_ring and the cq_tail * pointer will be increased, otherwise other timeout reqs may * return in advance without waiting for enough wait_nr. 
*/ @@ -2582,8 +3751,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); return HRTIMER_NORESTART; } @@ -2608,48 +3776,52 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -1) return -EALREADY; - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; } +static int io_timeout_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + + req->timeout.addr = READ_ONCE(sqe->addr); + req->timeout.flags = READ_ONCE(sqe->timeout_flags); + if (req->timeout.flags) + return -EINVAL; + + return 0; +} + /* * Remove or update an existing timeout command */ -static int io_timeout_remove(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_timeout_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; int ret; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags) - return -EINVAL; - spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_timeout_cancel(ctx, req->timeout.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } -static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, +static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { - const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; unsigned flags; @@ -2663,7 +3835,12 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = &io->timeout; + req->timeout.count = READ_ONCE(sqe->off); + + if (!req->io && io_alloc_async_ctx(req)) + return -ENOMEM; + + data = &req->io->timeout; data->req = req; req->flags |= REQ_F_TIMEOUT; @@ -2676,32 +3853,17 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - req->io = io; return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout(struct io_kiocb *req) { unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; - struct io_async_ctx *io; struct list_head *entry; unsigned span = 0; - io = req->io; - if (!io) { - int ret; - - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) - return -ENOMEM; - ret = io_timeout_prep(req, io, false); - if (ret) { - kfree(io); - return ret; - } - } data = &req->io->timeout; /* @@ -2709,7 +3871,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. 
*/ - count = READ_ONCE(sqe->off); + count = req->timeout.count; if (!count) { req->flags |= REQ_F_TIMEOUT_NOSEQ; spin_lock_irq(&ctx->completion_lock); @@ -2822,89 +3984,185 @@ done: spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); } -static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_async_cancel_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); + req->cancel.addr = READ_ONCE(sqe->addr); return 0; } -static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct iov_iter iter; - ssize_t ret; + struct io_ring_ctx *ctx = req->ctx; - memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); - req->sqe = &io->sqe; + io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); + return 0; +} - switch (io->sqe.opcode) { +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) + return -EAGAIN; + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + +static int io_req_defer_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + ssize_t ret = 0; + + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + + switch (req->opcode) { + case IORING_OP_NOP: + break; case IORING_OP_READV: case IORING_OP_READ_FIXED: - ret = io_read_prep(req, &iovec, &iter, true); + case IORING_OP_READ: + ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - ret = io_write_prep(req, &iovec, &iter, true); + case IORING_OP_WRITE: + ret = io_write_prep(req, sqe, true); + break; + case IORING_OP_POLL_ADD: + ret = io_poll_add_prep(req, sqe); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove_prep(req, sqe); + break; + case IORING_OP_FSYNC: + ret = io_prep_fsync(req, sqe); + break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: - ret = io_sendmsg_prep(req, io); + case IORING_OP_SEND: + ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: - ret = io_recvmsg_prep(req, io); + case IORING_OP_RECV: + ret = 
io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: - ret = io_connect_prep(req, io); + ret = io_connect_prep(req, sqe); break; case IORING_OP_TIMEOUT: - return io_timeout_prep(req, io, false); + ret = io_timeout_prep(req, sqe, false); + break; + case IORING_OP_TIMEOUT_REMOVE: + ret = io_timeout_remove_prep(req, sqe); + break; + case IORING_OP_ASYNC_CANCEL: + ret = io_async_cancel_prep(req, sqe); + break; case IORING_OP_LINK_TIMEOUT: - return io_timeout_prep(req, io, true); + ret = io_timeout_prep(req, sqe, true); + break; + case IORING_OP_ACCEPT: + ret = io_accept_prep(req, sqe); + break; + case IORING_OP_FALLOCATE: + ret = io_fallocate_prep(req, sqe); + break; + case IORING_OP_OPENAT: + ret = io_openat_prep(req, sqe); + break; + case IORING_OP_CLOSE: + ret = io_close_prep(req, sqe); + break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; + case IORING_OP_STATX: + ret = io_statx_prep(req, sqe); + break; + case IORING_OP_FADVISE: + ret = io_fadvise_prep(req, sqe); + break; + case IORING_OP_MADVISE: + ret = io_madvise_prep(req, sqe); + break; + case IORING_OP_OPENAT2: + ret = io_openat2_prep(req, sqe); + break; + case IORING_OP_EPOLL_CTL: + ret = io_epoll_ctl_prep(req, sqe); + break; default: - req->io = io; - return 0; + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + ret = -EINVAL; + break; } - if (ret < 0) - return ret; - - req->io = io; - io_req_map_io(req, ret, iovec, inline_vecs, &iter); - return 0; + return ret; } -static int io_req_defer(struct io_kiocb *req) +static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - struct io_async_ctx *io; int ret; /* Still need defer if there is pending req in defer list. */ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) + if (!req->io && io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, io); - if (ret < 0) { - kfree(io); + ret = io_req_defer_prep(req, sqe); + if (ret < 0) return ret; - } spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -2918,66 +4176,203 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } -__attribute__((nonnull)) -static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { - int ret, opcode; struct io_ring_ctx *ctx = req->ctx; + int ret; - opcode = READ_ONCE(req->sqe->opcode); - switch (opcode) { + switch (req->opcode) { case IORING_OP_NOP: ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; - ret = io_read(req, nxt, force_nonblock); - break; - case IORING_OP_WRITEV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; - ret = io_write(req, nxt, force_nonblock); - break; case IORING_OP_READ_FIXED: + case IORING_OP_READ: + if (sqe) { + ret = io_read_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_read(req, nxt, force_nonblock); break; + case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (sqe) { + ret = io_write_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_prep_fsync(req, sqe); + if (ret < 0) + break; + } + ret = 
io_fsync(req, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, req->sqe, nxt); + if (sqe) { + ret = io_poll_add_prep(req, sqe); + if (ret) + break; + } + ret = io_poll_add(req, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, req->sqe); + if (sqe) { + ret = io_poll_remove_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_prep_sfr(req, sqe); + if (ret < 0) + break; + } + ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); + case IORING_OP_SEND: + if (sqe) { + ret = io_sendmsg_prep(req, sqe); + if (ret < 0) + break; + } + if (req->opcode == IORING_OP_SENDMSG) + ret = io_sendmsg(req, nxt, force_nonblock); + else + ret = io_send(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); + case IORING_OP_RECV: + if (sqe) { + ret = io_recvmsg_prep(req, sqe); + if (ret) + break; + } + if (req->opcode == IORING_OP_RECVMSG) + ret = io_recvmsg(req, nxt, force_nonblock); + else + ret = io_recv(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, req->sqe); + if (sqe) { + ret = io_timeout_prep(req, sqe, false); + if (ret) + break; + } + ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, req->sqe); + if (sqe) { + ret = io_timeout_remove_prep(req, sqe); + if (ret) + break; + } + ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - ret = io_accept(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_accept_prep(req, sqe); + if (ret) + break; + } + ret = io_accept(req, nxt, force_nonblock); break; case IORING_OP_CONNECT: - ret = io_connect(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_connect_prep(req, sqe); + if (ret) + break; + } + ret = io_connect(req, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, req->sqe, nxt); + if (sqe) { + ret = io_async_cancel_prep(req, sqe); + if (ret) + break; + } + ret = io_async_cancel(req, nxt); + break; + case IORING_OP_FALLOCATE: + if (sqe) { + ret = io_fallocate_prep(req, sqe); + if (ret) + break; + } + ret = io_fallocate(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT: + if (sqe) { + ret = io_openat_prep(req, sqe); + if (ret) + break; + } + ret = io_openat(req, nxt, force_nonblock); + break; + case IORING_OP_CLOSE: + if (sqe) { + ret = io_close_prep(req, sqe); + if (ret) + break; + } + ret = io_close(req, nxt, force_nonblock); + break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; + case IORING_OP_STATX: + if (sqe) { + ret = io_statx_prep(req, sqe); + if (ret) + break; + } + ret = io_statx(req, nxt, force_nonblock); + break; + case IORING_OP_FADVISE: + if (sqe) { + ret = io_fadvise_prep(req, sqe); + if (ret) + break; + } + ret = io_fadvise(req, nxt, force_nonblock); + break; + case IORING_OP_MADVISE: + if (sqe) { + ret = io_madvise_prep(req, sqe); + if (ret) + break; + } + ret = io_madvise(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT2: + if (sqe) { + ret = io_openat2_prep(req, sqe); + if (ret) + break; + } + ret = io_openat2(req, nxt, force_nonblock); + break; + case IORING_OP_EPOLL_CTL: + if (sqe) { + ret = io_epoll_ctl_prep(req, sqe); + if (ret) + 
break; + } + ret = io_epoll_ctl(req, nxt, force_nonblock); break; default: ret = -EINVAL; @@ -2988,29 +4383,24 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, return ret; if (ctx->flags & IORING_SETUP_IOPOLL) { + const bool in_async = io_wq_current_is_worker(); + if (req->result == -EAGAIN) return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ - if (req->in_async) + if (in_async) mutex_lock(&ctx->uring_lock); + io_iopoll_req_issued(req); - if (req->in_async) + + if (in_async) mutex_unlock(&ctx->uring_lock); } return 0; } -static void io_link_work_cb(struct io_wq_work **workptr) -{ - struct io_wq_work *work = *workptr; - struct io_kiocb *link = work->data; - - io_queue_linked_timeout(link); - work->func = io_wq_submit_work; -} - static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; @@ -3018,17 +4408,17 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - /* Ensure we clear previously set non-block flag */ - req->rw.ki_flags &= ~IOCB_NOWAIT; - - if (work->flags & IO_WQ_WORK_CANCEL) + /* if NO_CANCEL is set, we must still run the work */ + if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == + IO_WQ_WORK_CANCEL) { ret = -ECANCELED; + } if (!ret) { req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; req->in_async = true; do { - ret = io_issue_sqe(req, &nxt, false); + ret = io_issue_sqe(req, NULL, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -3044,41 +4434,23 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req); if (ret) { - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); } /* if a dependent link is ready, pass it back */ - if (!ret && nxt) { - struct io_kiocb *link; - - io_prep_async_work(nxt, &link); - *workptr = &nxt->work; - if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; - nxt->work.func = io_link_work_cb; - nxt->work.data = link; - } - } + if (!ret && nxt) + io_wq_assign_next(workptr, nxt); } -static bool io_op_needs_file(const struct io_uring_sqe *sqe) +static int io_req_needs_file(struct io_kiocb *req, int fd) { - int op = READ_ONCE(sqe->opcode); - - switch (op) { - case IORING_OP_NOP: - case IORING_OP_POLL_REMOVE: - case IORING_OP_TIMEOUT: - case IORING_OP_TIMEOUT_REMOVE: - case IORING_OP_ASYNC_CANCEL: - case IORING_OP_LINK_TIMEOUT: - return false; - default: - return true; - } + if (!io_op_defs[req->opcode].needs_file) + return 0; + if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + return 0; + return 1; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -3086,27 +4458,25 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd; - flags = READ_ONCE(req->sqe->flags); - fd = READ_ONCE(req->sqe->fd); + flags = READ_ONCE(sqe->flags); + fd = READ_ONCE(sqe->fd); - if (flags & 
IOSQE_IO_DRAIN) - req->flags |= REQ_F_IO_DRAIN; - - if (!io_op_needs_file(req->sqe)) + if (!io_req_needs_file(req, fd)) return 0; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3114,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -3131,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (req->work.files) + return 0; + if (!ctx->ring_file) + return -EBADF; + rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); /* @@ -3139,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has. */ - if (fcheck(req->ring_fd) == req->ring_file) { + if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -3179,8 +4555,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - if (prev->flags & REQ_F_LINK) - prev->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, NULL, -ETIME); io_put_req(prev); @@ -3222,22 +4597,23 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, link_list); - if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; } -static void __io_queue_sqe(struct io_kiocb *req) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; int ret; - ret = io_issue_sqe(req, &nxt, true); - if (nxt) - io_queue_async_work(nxt); +again: + linked_timeout = io_prep_linked_timeout(req); + + ret = io_issue_sqe(req, sqe, &nxt, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -3245,7 +4621,8 @@ static void __io_queue_sqe(struct io_kiocb *req) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { +punt: + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (ret) goto err; @@ -3256,7 +4633,7 @@ static void __io_queue_sqe(struct io_kiocb *req) * submit reference when the iocb is actually submitted. 
*/ io_queue_async_work(req); - return; + goto done_req; } err: @@ -3273,32 +4650,45 @@ err: /* and drop final reference, if we failed */ if (ret) { io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); } +done_req: + if (nxt) { + req = nxt; + nxt = NULL; + + if (req->flags & REQ_F_FORCE_ASYNC) + goto punt; + goto again; + } } -static void io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; - if (unlikely(req->ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = false; - } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - - ret = io_req_defer(req); + ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { +fail_req: io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_double_put_req(req); } - } else - __io_queue_sqe(req); + } else if (req->flags & REQ_F_FORCE_ASYNC) { + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; + /* + * Never try inline submit of IOSQE_ASYNC is set, go straight + * to async execution. + */ + req->work.flags |= IO_WQ_WORK_CONCURRENT; + io_queue_async_work(req); + } else { + __io_queue_sqe(req, sqe); + } } static inline void io_queue_link_head(struct io_kiocb *req) @@ -3307,31 +4697,51 @@ static inline void io_queue_link_head(struct io_kiocb *req) io_cqring_add_event(req, -ECANCELED); io_double_put_req(req); } else - io_queue_sqe(req); + io_queue_sqe(req, NULL); } +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) - -static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, - struct io_kiocb **link) +static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_submit_state *state, struct io_kiocb **link) { + const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; - int ret; + unsigned int sqe_flags; + int ret, id; - req->user_data = req->sqe->user_data; + sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ - if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } - ret = io_req_set_file(state, req); + id = READ_ONCE(sqe->personality); + if (id) { + const struct cred *personality_creds; + + personality_creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!personality_creds)) { + ret = -EINVAL; + goto err_req; + } + old_creds = override_creds(personality_creds); + } + + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| + IOSQE_ASYNC); + + ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); io_double_put_req(req); + if (old_creds) + revert_creds(old_creds); return false; } @@ -3343,35 +4753,57 @@ err_req: * conditions are true (normal request), then just queue it. 
*/ if (*link) { - struct io_kiocb *prev = *link; - struct io_async_ctx *io; - - if (req->sqe->flags & IOSQE_IO_DRAIN) - (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + struct io_kiocb *head = *link; - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) { + /* + * Taking sequential execution of a link, draining both sides + * of the link also fullfils IOSQE_IO_DRAIN semantics for all + * requests in the link. So, it drains the head and the + * next after the link request. The last one is done via + * drain_next flag to persist the effect across calls. + */ + if (sqe_flags & IOSQE_IO_DRAIN) { + head->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = 1; + } + if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; } - ret = io_req_defer_prep(req, io); + ret = io_req_defer_prep(req, sqe); if (ret) { - kfree(io); - prev->flags |= REQ_F_FAIL_LINK; + /* fail even hard links since we don't submit */ + head->flags |= REQ_F_FAIL_LINK; goto err_req; } - trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->link_list, &prev->link_list); - } else if (req->sqe->flags & IOSQE_IO_LINK) { - req->flags |= REQ_F_LINK; + trace_io_uring_link(ctx, req, head); + list_add_tail(&req->link_list, &head->link_list); - INIT_LIST_HEAD(&req->link_list); - *link = req; + /* last request of a link, enqueue the link */ + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) { + io_queue_link_head(head); + *link = NULL; + } } else { - io_queue_sqe(req); + if (unlikely(ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = 0; + } + if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + req->flags |= REQ_F_LINK; + INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; + *link = req; + } else { + io_queue_sqe(req, sqe); + } } + if (old_creds) + revert_creds(old_creds); return true; } @@ -3403,27 +4835,25 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); - } + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* - * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe **sqe_ptr) { - struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; unsigned head; @@ -3435,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. 
*/ - head = ctx->cached_sq_head; - /* make sure SQ entry isn't read before tail */ - if (unlikely(head == smp_load_acquire(&rings->sq.tail))) - return false; - - head = READ_ONCE(sq_array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) { /* * All io need record the previous position, if LINK vs DARIN, @@ -3448,7 +4873,9 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) * link list. */ req->sequence = ctx->cached_sq_head; - req->sqe = &ctx->sq_sqes[head]; + *sqe_ptr = &ctx->sq_sqes[head]; + req->opcode = READ_ONCE((*sqe_ptr)->opcode); + req->user_data = READ_ONCE((*sqe_ptr)->user_data); ctx->cached_sq_head++; return true; } @@ -3456,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) /* drop invalid entries */ ctx->cached_sq_head++; ctx->cached_sq_dropped++; - WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); + WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); return false; } @@ -3470,18 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool mm_fault = false; /* if we have a backlog and couldn't flush it all, return BUSY */ - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) - return -EBUSY; + if (test_bit(0, &ctx->sq_check_overflow)) { + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) + return -EBUSY; + } + + /* make sure SQ entry isn't read before tail */ + nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); + + if (!percpu_ref_tryget_many(&ctx->refs, nr)) + return -EAGAIN; if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); statep = &state; } + ctx->ring_fd = ring_fd; + ctx->ring_file = ring_file; + for (i = 0; i < nr; i++) { + const struct io_uring_sqe *sqe; struct io_kiocb *req; - unsigned int sqe_flags; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -3489,12 +4927,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, req)) { - __io_free_req(req); + if (!io_get_sqring(ctx, req, &sqe)) { + __io_req_do_free(req); break; } - if (io_sqe_needs_user(req->sqe) && !*mm) { + /* will complete beyond this point, count as submitted */ + submitted++; + + if (unlikely(req->opcode >= IORING_OP_LAST)) { + io_cqring_add_event(req, -EINVAL); + io_double_put_req(req); + break; + } + + if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3502,28 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - submitted++; - sqe_flags = req->sqe->flags; - - req->ring_file = ring_file; - req->ring_fd = ring_fd; req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->sqe->user_data, - true, async); - if (!io_submit_sqe(req, statep, &link)) + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, async); + if (!io_submit_sqe(req, sqe, statep, &link)) break; - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!(sqe_flags & IOSQE_IO_LINK) && link) { - io_queue_link_head(link); - link = NULL; - } } + if (unlikely(submitted != nr)) { + int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; + + percpu_ref_put_many(&ctx->refs, nr - ref_used); + } if (link) io_queue_link_head(link); if (statep) @@ -3646,8 +5085,9 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - to_submit = min(to_submit, ctx->sq_entries); + mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); + mutex_unlock(&ctx->uring_lock); if (ret > 0) inflight += ret; } @@ -3676,7 +5116,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) struct io_ring_ctx *ctx = iowq->ctx; /* - * Wake up if we have enough events, or if a timeout occured since we + * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ @@ -3775,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + percpu_ref_get(&data->refs); + /* wait for existing switches */ + flush_work(&data->ref_work); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + /* flush potential new switch */ + flush_work(&data->ref_work); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -3818,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -3875,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -3937,7 +5388,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -3952,36 +5403,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sock 
= ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; unsigned nr_tables; + struct file *file; int fd, ret = 0; unsigned i; - if (ctx->file_table) + if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), GFP_KERNEL); - if (!ctx->file_table) + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; } @@ -3998,13 +5572,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 
continue; } - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); + file = fget(fd); ret = -EBADF; - if (!table->files[index]) + if (!file) break; + /* * Don't allow io_uring instances to be registered. If UNIX * isn't enabled, then this causes a reference cycle and this @@ -4012,26 +5587,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); + if (file->f_op == &io_uring_fops) { + fput(file); break; } ret = 0; + table->files[index] = file; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - file = io_file_from_index(ctx, i); if (file) fput(file); } for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); + kfree(ctx->file_data->table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return ret; } @@ -4043,69 +5618,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) -{ -#if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(io_file_from_index(ctx, index)); -#endif -} - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4149,27 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. 
+ */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = (__s32 __user *) up.fds; + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4179,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4214,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? 
done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -4234,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work) refcount_inc(&req->refs); } +static int io_init_wq_offload(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_wq_data data; + struct fd f; + struct io_ring_ctx *ctx_attach; + unsigned int concurrency; + int ret = 0; + + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + + if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; + } + return ret; + } + + f = fdget(p->wq_fd); + if (!f.file) + return -EBADF; + + if (f.file->f_op != &io_uring_fops) { + ret = -EINVAL; + goto out_fput; + } + + ctx_attach = f.file->private_data; + /* @io_wq is protected by holding the fd */ + if (!io_wq_get(ctx_attach->io_wq, &data)) { + ret = -EINVAL; + goto out_fput; + } + + ctx->io_wq = ctx_attach->io_wq; +out_fput: + fdput(f); + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_wq_data data; - unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -4282,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.mm = ctx->sqo_mm; - data.user = ctx->user; - data.creds = ctx->creds; - data.get_work = io_get_work; - data.put_work = io_put_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; + ret = io_init_wq_offload(ctx, p); + if (ret) goto err; - } return 0; err: @@ -4400,7 +6005,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_user_page(imu->bvec[j].bv_page); + unpin_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ -4428,7 +6033,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) return -EFAULT; - dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); dst->iov_len = ciov.iov_len; return 0; } @@ -4521,7 +6126,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages(ubuf, nr_pages, + pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, vmas); if (pret == nr_pages) { @@ -4545,7 +6150,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * release any pages we did get */ if (pret > 0) - put_user_pages(pages, pret); + unpin_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); @@ -4673,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) return 
fasync_helper(fd, file, on, &ctx->cq_fasync); } +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + const struct cred *cred; + + cred = idr_remove(&ctx->personality_idr, id); + if (cred) + put_cred(cred); + return 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -4689,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -4742,10 +6359,6 @@ static int io_uring_flush(struct file *file, void *data) struct io_ring_ctx *ctx = file->private_data; io_uring_cancel_files(ctx, data); - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { - io_cqring_overflow_flush(ctx, true); - io_wq_cancel_all(ctx->io_wq); - } return 0; } @@ -4859,13 +6472,15 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } else if (to_submit) { struct mm_struct *cur_mm; - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ cur_mm = ctx->sqo_mm; submitted = io_submit_sqes(ctx, to_submit, f.file, fd, &cur_mm, false); mutex_unlock(&ctx->uring_lock); + + if (submitted != to_submit) + goto out; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; @@ -4879,6 +6494,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } +out: percpu_ref_put(&ctx->refs); out_fput: fdput(f); @@ -4971,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -4990,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) bool account_mem; int ret; - if (!entries || entries > IORING_MAX_ENTRIES) + if (!entries) return -EINVAL; + if (entries > IORING_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = IORING_MAX_ENTRIES; + } /* * Use twice as many entries for the CQ ring. It's possible for the @@ -5008,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. 
*/ - if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + if (p->cq_entries < p->sq_entries) return -EINVAL; + if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + p->cq_entries = IORING_MAX_CQ_ENTRIES; + } p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; @@ -5074,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -5101,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; ret = io_uring_create(entries, &p); @@ -5120,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds = get_current_cred(); + int id; + + id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (id < 0) + put_cred(creds); + return id; +} + +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *old_creds; + + old_creds = idr_remove(&ctx->personality_idr, id); + if (old_creds) { + put_cred(old_creds); + return 0; + } + + return -EINVAL; +} + +static bool io_register_op_must_quiesce(int op) +{ + switch (op) { + case IORING_UNREGISTER_FILES: + case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_PROBE: + case IORING_REGISTER_PERSONALITY: + case IORING_UNREGISTER_PERSONALITY: + return false; + default: + return true; + } +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -5135,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. 
- */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + ret = wait_for_completion_interruptible(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + if (ret) { + percpu_ref_resurrect(&ctx->refs); + ret = -EINTR; + goto out; + } + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5171,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg); + if (ret) + break; + if (opcode == IORING_REGISTER_EVENTFD_ASYNC) + ctx->eventfd_async = 1; + else + ctx->eventfd_async = 0; break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; @@ -5182,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; default: ret = -EINVAL; break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + /* bring the ctx back to life */ + percpu_ref_reinit(&ctx->refs); +out: + reinit_completion(&ctx->completions[0]); + } return ret; } @@ -5222,6 +6963,7 @@ out_fput: static int __init io_uring_init(void) { + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; diff --git a/fs/ioctl.c b/fs/ioctl.c index 2f5e4e5b97e1..7c9a5df5a597 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -467,7 +467,7 @@ EXPORT_SYMBOL(generic_block_fiemap); * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. 
*/ -int ioctl_preallocate(struct file *filp, int mode, void __user *argp) +static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; @@ -495,8 +495,8 @@ int ioctl_preallocate(struct file *filp, int mode, void __user *argp) /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ -int compat_ioctl_preallocate(struct file *file, int mode, - struct space_resv_32 __user *argp) +static int compat_ioctl_preallocate(struct file *file, int mode, + struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; @@ -521,11 +521,9 @@ int compat_ioctl_preallocate(struct file *file, int mode, } #endif -static int file_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { struct inode *inode = file_inode(filp); - int __user *p = (int __user *)arg; switch (cmd) { case FIBMAP: @@ -542,7 +540,7 @@ static int file_ioctl(struct file *filp, unsigned int cmd, return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } - return vfs_ioctl(filp, cmd, arg); + return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) @@ -661,53 +659,48 @@ out: } /* - * When you add any new common ioctls to the switches above and below - * please update compat_sys_ioctl() too. - * * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. + * + * When you add any new common ioctls to the switches above and below, + * please ensure they have compatible arguments in compat mode. */ -int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, - unsigned long arg) +static int do_vfs_ioctl(struct file *filp, unsigned int fd, + unsigned int cmd, unsigned long arg) { - int error = 0; void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); - break; + return 0; case FIONCLEX: set_close_on_exec(fd, 0); - break; + return 0; case FIONBIO: - error = ioctl_fionbio(filp, argp); - break; + return ioctl_fionbio(filp, argp); case FIOASYNC: - error = ioctl_fioasync(fd, filp, argp); - break; + return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); - error = copy_to_user(argp, &res, sizeof(res)) ? - -EFAULT : 0; - } else - error = -ENOTTY; - break; + return copy_to_user(argp, &res, sizeof(res)) ? 
+ -EFAULT : 0; + } + + return -ENOTTY; case FIFREEZE: - error = ioctl_fsfreeze(filp); - break; + return ioctl_fsfreeze(filp); case FITHAW: - error = ioctl_fsthaw(filp); - break; + return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); @@ -716,6 +709,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; + return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: @@ -729,24 +723,30 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, default: if (S_ISREG(inode->i_mode)) - error = file_ioctl(filp, cmd, arg); - else - error = vfs_ioctl(filp, cmd, arg); + return file_ioctl(filp, cmd, argp); break; } - return error; + + return -ENOIOCTLCMD; } int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { - int error; struct fd f = fdget(fd); + int error; if (!f.file) return -EBADF; + error = security_file_ioctl(f.file, cmd, arg); - if (!error) - error = do_vfs_ioctl(f.file, fd, cmd, arg); + if (error) + goto out; + + error = do_vfs_ioctl(f.file, fd, cmd, arg); + if (error == -ENOIOCTLCMD) + error = vfs_ioctl(f.file, cmd, arg); + +out: fdput(f); return error; } @@ -788,4 +788,65 @@ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); + +COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + struct fd f = fdget(fd); + int error; + + if (!f.file) + return -EBADF; + + /* RED-PEN how should LSM module know it's handling 32bit? */ + error = security_file_ioctl(f.file, cmd, arg); + if (error) + goto out; + + switch (cmd) { + /* FICLONE takes an int argument, so don't use compat_ptr() */ + case FICLONE: + error = ioctl_file_clone(f.file, arg, 0, 0, 0); + break; + +#if defined(CONFIG_X86_64) + /* these get messy on amd64 due to alignment differences */ + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); + break; + case FS_IOC_UNRESVSP_32: + case FS_IOC_UNRESVSP64_32: + error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, + compat_ptr(arg)); + break; + case FS_IOC_ZERO_RANGE_32: + error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, + compat_ptr(arg)); + break; +#endif + + /* + * everything else in do_vfs_ioctl() takes either a compatible + * pointer argument or no argument -- call it with a modified + * argument. 
+ */ + default: + error = do_vfs_ioctl(f.file, fd, cmd, + (unsigned long)compat_ptr(arg)); + if (error != -ENOIOCTLCMD) + break; + + if (f.file->f_op->compat_ioctl) + error = f.file->f_op->compat_ioctl(f.file, cmd, arg); + if (error == -ENOIOCTLCMD) + error = -ENOTTY; + break; + } + + out: + fdput(f); + + return error; +} #endif diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 828444e14d09..7c84c4c027c4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1077,24 +1077,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); unsigned long length; - loff_t offset, size; + loff_t offset; ssize_t ret; lock_page(page); - size = i_size_read(inode); - offset = page_offset(page); - if (page->mapping != inode->i_mapping || offset > size) { - /* We overload EFAULT to mean page got truncated */ - ret = -EFAULT; + ret = page_mkwrite_check_truncate(page, inode); + if (ret < 0) goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (offset > size - PAGE_SIZE) - length = offset_in_page(size); - else - length = PAGE_SIZE; + length = ret; + offset = page_offset(page); while (length > 0) { ret = iomap_apply(inode, offset, length, IOMAP_WRITE | IOMAP_FAULT, ops, page, diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 8fff6677a5da..96bf33986d03 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -164,7 +164,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) "journal space in %s\n", __func__, journal->j_devname); WARN_ON(1); - jbd2_journal_abort(journal, 0); + jbd2_journal_abort(journal, -EIO); } write_lock(&journal->j_state_lock); } else { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7f0b362b3842..2494095e0340 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -782,7 +782,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } blk_finish_plug(&plug); @@ -875,7 +875,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } if (cbh) err = journal_wait_on_commit_record(journal, cbh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5e408ee24a1a..eb8ca446d1ab 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -96,7 +96,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); #ifdef CONFIG_JBD2_DEBUG @@ -805,7 +804,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, "at offset %lu on %s\n", __func__, blocknr, journal->j_devname); err = -EIO; - __journal_abort_soft(journal, err); + jbd2_journal_abort(journal, err); } } else { *retp = blocknr; /* +journal->j_blk_offset */ @@ -982,6 +981,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) { + (*pos)++; return NULL; } @@ -1074,12 +1074,11 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations jbd2_seq_info_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_info_open, - .read = 
seq_read, - .llseek = seq_lseek, - .release = jbd2_seq_info_release, +static const struct proc_ops jbd2_info_proc_ops = { + .proc_open = jbd2_seq_info_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = jbd2_seq_info_release, }; static struct proc_dir_entry *proc_jbd2_stats; @@ -1089,7 +1088,7 @@ static void jbd2_stats_proc_init(journal_t *journal) journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { proc_create_data("info", S_IRUGO, journal->j_proc_entry, - &jbd2_seq_info_fops, journal); + &jbd2_info_proc_ops, journal); } } @@ -1710,6 +1709,11 @@ int jbd2_journal_load(journal_t *journal) journal->j_devname); return -EFSCORRUPTED; } + /* + * clear JBD2_ABORT flag initialized in journal_init_common + * here to update log tail information with the newest seq. + */ + journal->j_flags &= ~JBD2_ABORT; /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory @@ -1717,7 +1721,6 @@ int jbd2_journal_load(journal_t *journal) if (journal_reset(journal)) goto recovery_error; - journal->j_flags &= ~JBD2_ABORT; journal->j_flags |= JBD2_LOADED; return 0; @@ -2098,67 +2101,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) return err; } -/* - * Journal abort has very specific semantics, which we describe - * for journal abort. - * - * Two internal functions, which provide abort to the jbd layer - * itself are here. - */ - -/* - * Quick version for internal journal use (doesn't lock the journal). - * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, - * and don't attempt to make any other journal updates. - */ -void __jbd2_journal_abort_hard(journal_t *journal) -{ - transaction_t *transaction; - - if (journal->j_flags & JBD2_ABORT) - return; - - printk(KERN_ERR "Aborting journal on device %s.\n", - journal->j_devname); - - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_ABORT; - transaction = journal->j_running_transaction; - if (transaction) - __jbd2_log_start_commit(journal, transaction->t_tid); - write_unlock(&journal->j_state_lock); -} - -/* Soft abort: record the abort error status in the journal superblock, - * but don't do any other IO. */ -static void __journal_abort_soft (journal_t *journal, int errno) -{ - int old_errno; - - write_lock(&journal->j_state_lock); - old_errno = journal->j_errno; - if (!journal->j_errno || errno == -ESHUTDOWN) - journal->j_errno = errno; - - if (journal->j_flags & JBD2_ABORT) { - write_unlock(&journal->j_state_lock); - if (!old_errno && old_errno != -ESHUTDOWN && - errno == -ESHUTDOWN) - jbd2_journal_update_sb_errno(journal); - return; - } - write_unlock(&journal->j_state_lock); - - __jbd2_journal_abort_hard(journal); - - if (errno) { - jbd2_journal_update_sb_errno(journal); - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_REC_ERR; - write_unlock(&journal->j_state_lock); - } -} - /** * void jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. @@ -2198,16 +2140,51 @@ static void __journal_abort_soft (journal_t *journal, int errno) * failure to disk. ext3_error, for example, now uses this * functionality. * - * Errors which originate from within the journaling layer will NOT - * supply an errno; a null errno implies that absolutely no further - * writes are done to the journal (unless there are any already in - * progress). 
- * */ void jbd2_journal_abort(journal_t *journal, int errno) { - __journal_abort_soft(journal, errno); + transaction_t *transaction; + + /* + * ESHUTDOWN always takes precedence because a file system check + * caused by any other journal abort error is not required after + * a shutdown triggered. + */ + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + int old_errno = journal->j_errno; + + write_unlock(&journal->j_state_lock); + if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) { + journal->j_errno = errno; + jbd2_journal_update_sb_errno(journal); + } + return; + } + + /* + * Mark the abort as occurred and start current running transaction + * to release all journaled buffer. + */ + pr_err("Aborting journal on device %s.\n", journal->j_devname); + + journal->j_flags |= JBD2_ABORT; + journal->j_errno = errno; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); + + /* + * Record errno to the journal super block, so that fsck and jbd2 + * layer could realise that a filesystem check is needed. + */ + jbd2_journal_update_sb_errno(journal); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); } /** @@ -2556,7 +2533,6 @@ static void __journal_remove_journal_head(struct buffer_head *bh) { struct journal_head *jh = bh2jh(bh); - J_ASSERT_JH(jh, jh->b_jcount >= 0); J_ASSERT_JH(jh, jh->b_transaction == NULL); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 27b9f9dee434..e77a5a0b4e46 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -525,7 +525,7 @@ EXPORT_SYMBOL(jbd2__journal_start); * modified buffers in the log. We block until the log can guarantee * that much space. Additionally, if rsv_blocks > 0, we also create another * handle with rsv_blocks reserved blocks in the journal. This handle is - * is stored in h_rsv_handle. It is not attached to any particular transaction + * stored in h_rsv_handle. It is not attached to any particular transaction * and thus doesn't block transaction commit. If the caller uses this reserved * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() * on the parent handle will dispose the reserved one. Reserved handle has to @@ -1595,7 +1595,7 @@ out: * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. 
*/ -int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) +int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 888cdd685a1e..44b62b3c322e 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -43,12 +43,12 @@ static ssize_t jfs_loglevel_proc_write(struct file *file, return count; } -static const struct file_operations jfs_loglevel_proc_fops = { - .open = jfs_loglevel_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = jfs_loglevel_proc_write, +static const struct proc_ops jfs_loglevel_proc_ops = { + .proc_open = jfs_loglevel_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = jfs_loglevel_proc_write, }; #endif @@ -68,7 +68,7 @@ void jfs_proc_init(void) #endif #ifdef CONFIG_JFS_DEBUG proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show); - proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops); + proc_create("loglevel", 0, base, &jfs_loglevel_proc_ops); #endif } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 9d96e6871e1a..9aec80b9d7c6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1266,7 +1266,7 @@ void kernfs_activate(struct kernfs_node *kn) pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (!pos || (pos->flags & KERNFS_ACTIVATED)) + if (pos->flags & KERNFS_ACTIVATED) continue; WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c index ca9228a56d65..a01f08c8c2f3 100644 --- a/fs/lockd/procfs.c +++ b/fs/lockd/procfs.c @@ -60,11 +60,11 @@ nlm_end_grace_read(struct file *file, char __user *buf, size_t size, return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp)); } -static const struct file_operations lockd_end_grace_operations = { - .write = nlm_end_grace_write, - .read = nlm_end_grace_read, - .llseek = default_llseek, - .release = simple_transaction_release, +static const struct proc_ops lockd_end_grace_proc_ops = { + .proc_write = nlm_end_grace_write, + .proc_read = nlm_end_grace_read, + .proc_lseek = default_llseek, + .proc_release = simple_transaction_release, }; int __init @@ -76,7 +76,7 @@ lockd_create_procfs(void) if (!entry) return -ENOMEM; entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry, - &lockd_end_grace_operations); + &lockd_end_grace_proc_ops); if (!entry) { remove_proc_entry("fs/lockd", NULL); return -ENOMEM; diff --git a/fs/locks.c b/fs/locks.c index 6970f55daf54..44b6da032842 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2853,7 +2853,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } if (inode) { /* userspace relies on this representation of dev_t */ - seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, + seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { diff --git a/fs/mpage.c b/fs/mpage.c index a63620cdb73a..ccba3c4c4479 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -62,7 +62,7 @@ static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) { bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, op, op_flags); - guard_bio_eod(op, bio); + guard_bio_eod(bio); submit_bio(bio); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index d6c91d1e88cb..db6565c99825 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -491,7 +491,7 @@ struct nameidata { struct path root; struct inode *inode; /* 
path.dentry.d_inode */ unsigned int flags; - unsigned seq, m_seq; + unsigned seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; @@ -641,6 +641,14 @@ static bool legitimize_links(struct nameidata *nd) static bool legitimize_root(struct nameidata *nd) { + /* + * For scoped-lookups (where nd->root has been zeroed), we need to + * restart the whole lookup from scratch -- because set_root() is wrong + * for these lookups (nd->dfd is the root, not the filesystem root). + */ + if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) + return false; + /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) return true; nd->flags |= LOOKUP_ROOT_GRABBED; @@ -776,12 +784,37 @@ static int complete_walk(struct nameidata *nd) int status; if (nd->flags & LOOKUP_RCU) { - if (!(nd->flags & LOOKUP_ROOT)) + /* + * We don't want to zero nd->root for scoped-lookups or + * externally-managed nd->root. + */ + if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; if (unlikely(unlazy_walk(nd))) return -ECHILD; } + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't + * ever step outside the root during lookup" and should already + * be guaranteed by the rest of namei, we want to avoid a namei + * BUG resulting in userspace being given a path that was not + * scoped within the root at some point during the lookup. + * + * So, do a final sanity-check to make sure that in the + * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) + * we won't silently return an fd completely outside of the + * requested root to userspace. + * + * Userspace could move the path outside the root after this + * check, but as discussed elsewhere this is not a concern (the + * resolved file was inside the root at some point). + */ + if (!path_is_under(&nd->path, &nd->root)) + return -EXDEV; + } + if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; @@ -798,10 +831,18 @@ static int complete_walk(struct nameidata *nd) return status; } -static void set_root(struct nameidata *nd) +static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; + /* + * Jumping to the real root in a scoped-lookup is a BUG in namei, but we + * still have to ensure it doesn't happen because it will cause a breakout + * from the dirfd. + */ + if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) + return -ENOTRECOVERABLE; + if (nd->flags & LOOKUP_RCU) { unsigned seq; @@ -814,6 +855,7 @@ static void set_root(struct nameidata *nd) get_fs_root(fs, &nd->root); nd->flags |= LOOKUP_ROOT_GRABBED; } + return 0; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -837,6 +879,18 @@ static inline void path_to_nameidata(const struct path *path, static int nd_jump_root(struct nameidata *nd) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -EXDEV; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { + /* Absolute path arguments to path_init() are allowed. */ + if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) + return -EXDEV; + } + if (!nd->root.mnt) { + int error = set_root(nd); + if (error) + return error; + } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; @@ -859,14 +913,32 @@ static int nd_jump_root(struct nameidata *nd) * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. 
*/ -void nd_jump_link(struct path *path) +int nd_jump_link(struct path *path) { + int error = -ELOOP; struct nameidata *nd = current->nameidata; - path_put(&nd->path); + if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) + goto err; + + error = -EXDEV; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { + if (nd->path.mnt != path->mnt) + goto err; + } + /* Not currently safe for scoped-lookups. */ + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) + goto err; + + path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->flags |= LOOKUP_JUMPED; + return 0; + +err: + path_put(path); + return error; } static inline void put_link(struct nameidata *nd) @@ -1001,7 +1073,8 @@ static int may_linkat(struct path *link) * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. - * @dir: the sticky parent directory + * @dir_mode: mode bits of directory + * @dir_uid: owner of directory * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: @@ -1017,18 +1090,18 @@ static int may_linkat(struct path *link) * * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(struct dentry * const dir, +static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, struct inode * const inode) { if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || - likely(!(dir->d_inode->i_mode & S_ISVTX)) || - uid_eq(inode->i_uid, dir->d_inode->i_uid) || + likely(!(dir_mode & S_ISVTX)) || + uid_eq(inode->i_uid, dir_uid) || uid_eq(current_fsuid(), inode->i_uid)) return 0; - if (likely(dir->d_inode->i_mode & 0002) || - (dir->d_inode->i_mode & 0020 && + if (likely(dir_mode & 0002) || + (dir_mode & 0020 && ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { const char *operation = S_ISFIFO(inode->i_mode) ? 
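The nd_jump_link() change above turns the helper into a fallible call: it now refuses the jump with -ELOOP or -EXDEV when LOOKUP_NO_MAGICLINKS, LOOKUP_NO_XDEV or a scoped lookup forbids it, and it drops the caller's path reference on failure. A minimal sketch of the adjusted ->get_link() caller pattern, assuming a hypothetical example_resolve_target() helper that looks up the target and takes a reference on it (procfs magic links follow the same shape); this is illustrative only and not part of the patch:

	/* illustrative sketch only, not part of this patch */
	static const char *example_get_link(struct dentry *dentry,
					    struct inode *inode,
					    struct delayed_call *done)
	{
		struct path path;
		int error;

		if (!dentry)		/* RCU walk: ask for ref-walk instead */
			return ERR_PTR(-ECHILD);

		/* hypothetical helper: fill in and reference the target path */
		error = example_resolve_target(inode, &path);
		if (error)
			return ERR_PTR(error);

		/*
		 * nd_jump_link() may now fail (and puts the path reference
		 * itself when it does), so propagate the error instead of
		 * assuming the jump succeeded.
		 */
		error = nd_jump_link(&path);
		if (error)
			return ERR_PTR(error);
		return NULL;
	}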
@@ -1049,6 +1122,9 @@ const char *get_link(struct nameidata *nd) int error; const char *res; + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + return ERR_PTR(-ELOOP); + if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); @@ -1083,10 +1159,9 @@ const char *get_link(struct nameidata *nd) return res; } if (*res == '/') { - if (!nd->root.mnt) - set_root(nd); - if (unlikely(nd_jump_root(nd))) - return ERR_PTR(-ECHILD); + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); while (unlikely(*++res == '/')) ; } @@ -1232,6 +1307,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); + flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } @@ -1267,10 +1343,14 @@ static int follow_managed(struct path *path, struct nameidata *nd) break; } - if (need_mntput && path->mnt == mnt) - mntput(path->mnt); - if (need_mntput) - nd->flags |= LOOKUP_JUMPED; + if (need_mntput) { + if (path->mnt == mnt) + mntput(path->mnt); + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + ret = -EXDEV; + else + nd->flags |= LOOKUP_JUMPED; + } if (ret == -EISDIR || !ret) ret = 1; if (ret > 0 && unlikely(d_flags_negative(flags))) @@ -1331,6 +1411,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return false; path->mnt = &mounted->mnt; path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; @@ -1351,8 +1433,11 @@ static int follow_dotdot_rcu(struct nameidata *nd) struct inode *inode = nd->inode; while (1) { - if (path_equal(&nd->path, &nd->root)) + if (path_equal(&nd->path, &nd->root)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -ECHILD; break; + } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; @@ -1365,7 +1450,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->path.dentry = parent; nd->seq = seq; if (unlikely(!path_connected(&nd->path))) - return -ENOENT; + return -ECHILD; break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1377,6 +1462,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; if (&mparent->mnt == nd->path.mnt) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -ECHILD; /* we know that mountpoint was pinned */ nd->path.dentry = mountpoint; nd->path.mnt = &mparent->mnt; @@ -1391,6 +1478,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; if (!mounted) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -ECHILD; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; inode = nd->path.dentry->d_inode; @@ -1478,9 +1567,12 @@ static int path_parent_directory(struct path *path) static int follow_dotdot(struct nameidata *nd) { - while(1) { - if (path_equal(&nd->path, &nd->root)) + while (1) { + if (path_equal(&nd->path, &nd->root)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -EXDEV; break; + } if (nd->path.dentry != nd->path.mnt->mnt_root) { int ret = path_parent_directory(&nd->path); if (ret) @@ -1489,6 +1581,8 @@ static int follow_dotdot(struct nameidata *nd) } if (!follow_up(&nd->path)) break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -EXDEV; } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; @@ -1649,17 +1743,15 @@ again: if (IS_ERR(dentry)) return dentry; if 
(unlikely(!d_in_lookup(dentry))) { - if (!(flags & LOOKUP_NO_REVAL)) { - int error = d_revalidate(dentry, flags); - if (unlikely(error <= 0)) { - if (!error) { - d_invalidate(dentry); - dput(dentry); - goto again; - } + int error = d_revalidate(dentry, flags); + if (unlikely(error <= 0)) { + if (!error) { + d_invalidate(dentry); dput(dentry); - dentry = ERR_PTR(error); + goto again; } + dput(dentry); + dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); @@ -1699,12 +1791,33 @@ static inline int may_lookup(struct nameidata *nd) static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { - if (!nd->root.mnt) - set_root(nd); - if (nd->flags & LOOKUP_RCU) { - return follow_dotdot_rcu(nd); - } else - return follow_dotdot(nd); + int error = 0; + + if (!nd->root.mnt) { + error = set_root(nd); + if (error) + return error; + } + if (nd->flags & LOOKUP_RCU) + error = follow_dotdot_rcu(nd); + else + error = follow_dotdot(nd); + if (error) + return error; + + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * If there was a racing rename or mount along our + * path, then we can't be sure that ".." hasn't jumped + * above nd->root (and so userspace should retry or use + * some fallback). + */ + smp_rmb(); + if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + return -EAGAIN; + if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + return -EAGAIN; + } } return 0; } @@ -2158,6 +2271,7 @@ OK: /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { + int error; const char *s = nd->name->name; if (!*s) @@ -2168,6 +2282,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; + + nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); + nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); + smp_rmb(); + if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; @@ -2176,9 +2295,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; - nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } @@ -2189,13 +2307,16 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->path.mnt = NULL; nd->path.dentry = NULL; - nd->m_seq = read_seqbegin(&mount_lock); - if (*s == '/') { - set_root(nd); - if (likely(!nd_jump_root(nd))) - return s; - return ERR_PTR(-ECHILD); - } else if (nd->dfd == AT_FDCWD) { + /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ + if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); + return s; + } + + /* Relative pathname -- get the starting-point it is relative to. 
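The handle_dots() change above makes ".." in a scoped lookup fail with -EAGAIN whenever a concurrent rename or mount means the walk can no longer be proven to have stayed under the dirfd root; as the comment says, userspace is expected to retry or fall back. A hedged sketch of that retry loop, assuming the openat2(2) syscall added later in this series is reachable via SYS_openat2 and that <linux/openat2.h> provides struct open_how:

    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/openat2.h>

    /* Retry a scoped lookup that may legitimately fail with -EAGAIN. */
    static int openat2_retry(int dirfd, const char *path, struct open_how *how)
    {
            int fd;

            do {
                    fd = syscall(SYS_openat2, dirfd, path, how, sizeof(*how));
            } while (fd < 0 && errno == EAGAIN);

            return fd;
    }

A real caller would normally bound the number of retries, since heavy mount/rename activity can keep the race window open.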
*/ + if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; @@ -2210,7 +2331,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } - return s; } else { /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(nd->dfd); @@ -2235,8 +2355,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->inode = nd->path.dentry->d_inode; } fdput(f); - return s; } + + /* For scoped-lookups we need to set the root to the dirfd as well. */ + if (flags & LOOKUP_IS_SCOPED) { + nd->root = nd->path; + if (flags & LOOKUP_RCU) { + nd->root_seq = nd->seq; + } else { + path_get(&nd->root); + nd->flags |= LOOKUP_ROOT_GRABBED; + } + } + return s; } static const char *trailing_symlink(struct nameidata *nd) @@ -2618,72 +2749,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, EXPORT_SYMBOL(user_path_at_empty); /** - * mountpoint_last - look up last component for umount - * @nd: pathwalk nameidata - currently pointing at parent directory of "last" - * - * This is a special lookup_last function just for umount. In this case, we - * need to resolve the path without doing any revalidation. - * - * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since - * mountpoints are always pinned in the dcache, their ancestors are too. Thus, - * in almost all cases, this lookup will be served out of the dcache. The only - * cases where it won't are if nd->last refers to a symlink or the path is - * bogus and it doesn't exist. - * - * Returns: - * -error: if there was an error during lookup. This includes -ENOENT if the - * lookup found a negative dentry. - * - * 0: if we successfully resolved nd->last and found it to not to be a - * symlink that needs to be followed. - * - * 1: if we successfully resolved nd->last and found it to be a symlink - * that needs to be followed. - */ -static int -mountpoint_last(struct nameidata *nd) -{ - int error = 0; - struct dentry *dir = nd->path.dentry; - struct path path; - - /* If we're in rcuwalk, drop out of it to handle last component */ - if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd)) - return -ECHILD; - } - - nd->flags &= ~LOOKUP_PARENT; - - if (unlikely(nd->last_type != LAST_NORM)) { - error = handle_dots(nd, nd->last_type); - if (error) - return error; - path.dentry = dget(nd->path.dentry); - } else { - path.dentry = d_lookup(dir, &nd->last); - if (!path.dentry) { - /* - * No cached dentry. Mounted dentries are pinned in the - * cache, so that means that this dentry is probably - * a symlink or the path doesn't actually point - * to a mounted dentry. 
- */ - path.dentry = lookup_slow(&nd->last, dir, - nd->flags | LOOKUP_NO_REVAL); - if (IS_ERR(path.dentry)) - return PTR_ERR(path.dentry); - } - } - if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) { - dput(path.dentry); - return -ENOENT; - } - path.mnt = nd->path.mnt; - return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0); -} - -/** * path_mountpoint - look up a path to be umounted * @nd: lookup context * @flags: lookup flags @@ -2699,14 +2764,17 @@ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) int err; while (!(err = link_path_walk(s, nd)) && - (err = mountpoint_last(nd)) > 0) { + (err = lookup_last(nd)) > 0) { s = trailing_symlink(nd); } + if (!err && (nd->flags & LOOKUP_RCU)) + err = unlazy_walk(nd); + if (!err) + err = handle_lookup_down(nd); if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; - follow_mount(path); } terminate_walk(nd); return err; @@ -3265,6 +3333,8 @@ static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; + kuid_t dir_uid = nd->inode->i_uid; + umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; @@ -3395,7 +3465,7 @@ finish_open: error = -EISDIR; if (d_is_dir(nd->path.dentry)) goto out; - error = may_create_in_sticky(dir, + error = may_create_in_sticky(dir_mode, dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) goto out; diff --git a/fs/namespace.c b/fs/namespace.c index 2fd0c8bcb8c1..5e1bf611a9eb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1728,7 +1728,7 @@ static bool is_mnt_ns_file(struct dentry *dentry) dentry->d_fsdata == &mntns_operations; } -struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } @@ -3325,8 +3325,8 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) } EXPORT_SYMBOL(mount_subtree); -int ksys_mount(const char __user *dev_name, const char __user *dir_name, - const char __user *type, unsigned long flags, void __user *data) +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; @@ -3359,12 +3359,6 @@ out_type: return ret; } -SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, - char __user *, type, unsigned long, flags, void __user *, data) -{ - return ksys_mount(dev_name, dir_name, type, flags, data); -} - /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. 
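With the path_init() changes above, an absolute pathname no longer jumps to the process root when LOOKUP_IN_ROOT is set: it is resolved relative to nd->dfd, and for all scoped lookups nd->root is pinned to that dirfd. The userspace-visible form of this is openat2() with RESOLVE_IN_ROOT, which behaves like a race-checked chroot for the duration of a single lookup. A minimal sketch, assuming <linux/openat2.h> and SYS_openat2 are available:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/openat2.h>

    /*
     * Open a path while treating root_fd as "/": "..", absolute
     * pathnames and absolute symlinks all stay inside root_fd.
     */
    static int open_in_root(int root_fd, const char *path)
    {
            struct open_how how;

            memset(&how, 0, sizeof(how));
            how.flags = O_RDONLY;
            how.resolve = RESOLVE_IN_ROOT;

            return syscall(SYS_openat2, root_fd, path, &how, sizeof(how));
    }

    /* e.g. int fd = open_in_root(container_root_fd, "/etc/hostname"); */

Unlike chroot(2), this needs no privileges and does not change the calling process's root; the containment applies to this one resolution only.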
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 15f271401dcc..573b1da9342c 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -84,8 +84,10 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OBSOLETE; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 3800ab6f08fa..7def925d3af5 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -238,8 +238,10 @@ void nfs_fscache_init_inode(struct inode *inode) return; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); @@ -263,8 +265,10 @@ void nfs_fscache_clear_inode(struct inode *inode) dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; fscache_relinquish_cookie(cookie, &auxdata, false); nfsi->fscache = NULL; } @@ -305,8 +309,10 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) return; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); - auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); + auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; if (inode_is_open_for_write(inode)) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index ad041cfbf9ec..6754c8607230 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -62,9 +62,11 @@ struct nfs_fscache_key { * cache object. 
*/ struct nfs_fscache_inode_auxdata { - struct timespec mtime; - struct timespec ctime; - u64 change_attr; + s64 mtime_sec; + s64 mtime_nsec; + s64 ctime_sec; + s64 ctime_nsec; + u64 change_attr; }; /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 936c57779ff4..728d88b6a698 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4097,7 +4097,7 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } - dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: atime=%lld\n", __func__, time->tv_sec); return status; } @@ -4115,7 +4115,7 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s status = NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } - dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: ctime=%lld\n", __func__, time->tv_sec); return status; } @@ -4132,8 +4132,8 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, status = decode_attr_time(xdr, time); bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; } - dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, - (long)time->tv_nsec); + dprintk("%s: time_delta=%lld %ld\n", __func__, time->tv_sec, + time->tv_nsec); return status; } @@ -4197,7 +4197,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } - dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: mtime=%lld\n", __func__, time->tv_sec); return status; } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index f64a33d2a1d1..2a82dcce5fc1 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -206,7 +206,6 @@ TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT); TRACE_DEFINE_ENUM(LOOKUP_PARENT); TRACE_DEFINE_ENUM(LOOKUP_REVAL); TRACE_DEFINE_ENUM(LOOKUP_RCU); -TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL); TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); @@ -224,7 +223,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_PARENT, "PARENT" }, \ { LOOKUP_REVAL, "REVAL" }, \ { LOOKUP_RCU, "RCU" }, \ - { LOOKUP_NO_REVAL, "NO_REVAL" }, \ { LOOKUP_OPEN, "OPEN" }, \ { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 11b42c523f04..7eb919f1b13f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -157,11 +157,11 @@ static int exports_proc_open(struct inode *inode, struct file *file) return exports_net_open(current->nsproxy->net_ns, file); } -static const struct file_operations exports_proc_operations = { - .open = exports_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops exports_proc_ops = { + .proc_open = exports_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int exports_nfsd_open(struct inode *inode, struct file *file) @@ -1431,8 +1431,7 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = proc_create("exports", 0, entry, - &exports_proc_operations); + entry = proc_create("exports", 0, entry, &exports_proc_ops); if (!entry) { remove_proc_entry("fs/nfs", NULL); return -ENOMEM; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 9bce3b913189..b1bc582b0493 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -84,17 +84,17 @@ static int nfsd_proc_open(struct inode *inode, struct file *file) 
return single_open(file, nfsd_proc_show, NULL); } -static const struct file_operations nfsd_proc_fops = { - .open = nfsd_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops nfsd_proc_ops = { + .proc_open = nfsd_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; void nfsd_stat_init(void) { - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops); + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); } void diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3e77b728a22b..46f225580009 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -57,6 +57,9 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. + * However, we should have been called /after/ evict_inodes + * removed all zero refcount inodes, in any case. Test to + * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); @@ -77,6 +80,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) iput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/nsfs.c b/fs/nsfs.c index a0431642c6b5..b13bfd406820 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -3,6 +3,7 @@ #include <linux/pseudo_fs.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/ktime.h> @@ -11,6 +12,8 @@ #include <linux/nsfs.h> #include <linux/uaccess.h> +#include "internal.h" + static struct vfsmount *nsfs_mnt; static long ns_ioctl(struct file *filp, unsigned int ioctl, @@ -52,7 +55,7 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -static void *__ns_get_path(struct path *path, struct ns_common *ns) +static int __ns_get_path(struct path *path, struct ns_common *ns) { struct vfsmount *mnt = nsfs_mnt; struct dentry *dentry; @@ -71,13 +74,13 @@ static void *__ns_get_path(struct path *path, struct ns_common *ns) got_it: path->mnt = mntget(mnt); path->dentry = dentry; - return NULL; + return 0; slow: rcu_read_unlock(); inode = new_inode_pseudo(mnt->mnt_sb); if (!inode) { ns->ops->put(ns); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } inode->i_ino = ns->inum; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -89,7 +92,7 @@ slow: dentry = d_alloc_anon(mnt->mnt_sb); if (!dentry) { iput(inode); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } d_instantiate(dentry, inode); dentry->d_fsdata = (void *)ns->ops; @@ -98,23 +101,22 @@ slow: d_delete(dentry); /* make sure ->d_prune() does nothing */ dput(dentry); cpu_relax(); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } goto got_it; } -void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, +int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { - void *ret; + int ret; do { struct ns_common *ns = ns_get_cb(private_data); if (!ns) - return ERR_PTR(-ENOENT); - + return -ENOENT; ret = __ns_get_path(path, ns); - } while (ret == ERR_PTR(-EAGAIN)); + } while (ret == -EAGAIN); return ret; } @@ -131,7 +133,7 @@ static struct ns_common *ns_get_path_task(void *private_data) return args->ns_ops->get(args->task); } -void *ns_get_path(struct path *path, struct task_struct *task, +int ns_get_path(struct path *path, struct task_struct *task, const 
struct proc_ns_operations *ns_ops) { struct ns_get_path_task_args args = { @@ -147,7 +149,7 @@ int open_related_ns(struct ns_common *ns, { struct path path = {}; struct file *f; - void *err; + int err; int fd; fd = get_unused_fd_flags(O_CLOEXEC); @@ -164,11 +166,11 @@ int open_related_ns(struct ns_common *ns, } err = __ns_get_path(&path, relative); - } while (err == ERR_PTR(-EAGAIN)); + } while (err == -EAGAIN); - if (IS_ERR(err)) { + if (err) { put_unused_fd(fd); - return PTR_ERR(err); + return err; } f = dentry_open(&path, O_RDONLY, current_cred()); diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 5c424a099280..1ef24574f481 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -73,7 +73,7 @@ static void o2quo_fence_self(void) "system by restarting ***\n"); emergency_restart(); break; - }; + } } /* Indicate that a timeout occurred on a heartbeat region write. The diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index 38b224372776..5e700b45d32d 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -ccflags-y := -I $(srctree)/$(src)/.. - obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 4de89af96abf..6abaded3ff6b 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -23,15 +23,15 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index aaf24548b02a..0463dce65bb2 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -688,10 +688,6 @@ struct dlm_begin_reco __be32 pad2; }; - -#define BITS_PER_BYTE 8 -#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) - struct dlm_query_join_request { u8 node_idx; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 965f45dbe17b..6051edc33aef 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -23,9 +23,9 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -33,7 +33,7 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* NOTE: __dlmconvert_master is the only function in here that * needs a spinlock held on entry (res->spinlock) and it is the diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 4d0b452012b2..c5c6efba7b5e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -17,9 +17,9 @@ #include <linux/debugfs.h> #include <linux/export.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -27,7 +27,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM 
-#include "cluster/masklog.h" +#include "../cluster/masklog.h" static int stringify_lockname(const char *lockname, int locklen, char *buf, int len); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index ee6f459f9770..357cfc702ce3 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -20,9 +20,9 @@ #include <linux/debugfs.h> #include <linux/sched/signal.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -30,7 +30,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* * ocfs2 node maps are array of long int, which limits to send them freely diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index baff087f3863..83f0760e4fba 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -25,9 +25,9 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -35,7 +35,7 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static struct kmem_cache *dlm_lock_cache; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 74b768ca1cd8..900f7e466d11 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -25,9 +25,9 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -35,7 +35,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_mle_node_down(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, @@ -2554,8 +2554,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, if (!dlm_grab(dlm)) return -EINVAL; - BUG_ON(target == O2NM_MAX_NODES); - name = res->lockname.name; namelen = res->lockname.len; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 064ce5bbc3f6..4b566e88582f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -26,16 +26,16 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); @@ -1668,7 +1668,7 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master) { - int ret = -EINVAL; + int ret; struct dlm_master_requery req; int status = DLM_LOCK_RES_OWNER_UNKNOWN; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 61c51c268460..fd40c17cd022 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -25,16 +25,16 @@ #include <linux/delay.h> -#include 
"cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static int dlm_thread(void *data); static void dlm_flush_asts(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 3883633e82eb..dcb17ca8ae74 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -23,15 +23,15 @@ #include <linux/spinlock.h> #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" #define DLM_UNLOCK_FREE_LOCK 0x00000001 #define DLM_UNLOCK_CALL_AST 0x00000002 diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index a9874e441bd4..c7895f65be0e 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -ccflags-y := -I $(srctree)/$(src)/.. - obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 4f1668c81e1f..8e4f1ace467c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -33,11 +33,11 @@ #include <linux/uaccess.h> -#include "stackglue.h" +#include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static const struct super_operations dlmfs_ops; diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 525b14ddfba5..3df5be25bfb1 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -21,12 +21,12 @@ #include <linux/types.h> #include <linux/crc32.h> -#include "ocfs2_lockingver.h" -#include "stackglue.h" +#include "../ocfs2_lockingver.h" +#include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c4c51f3df60..cb9e6a73bea9 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -570,7 +570,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ break; - }; + } ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, generation, res->l_name); @@ -3282,6 +3282,7 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, &dlm_debug->d_filter_secs); + ocfs2_get_dlm_debug(dlm_debug); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9876db52913a..6cd5e4924e4d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2101,17 +2101,15 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, struct buffer_head **di_bh, int meta_level, - int overwrite_io, int write_sem, int wait) { int ret = 0; if (wait) - ret = ocfs2_inode_lock(inode, NULL, meta_level); + ret = 
ocfs2_inode_lock(inode, di_bh, meta_level); else - ret = ocfs2_try_inode_lock(inode, - overwrite_io ? NULL : di_bh, meta_level); + ret = ocfs2_try_inode_lock(inode, di_bh, meta_level); if (ret < 0) goto out; @@ -2136,6 +2134,7 @@ static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, out_unlock: brelse(*di_bh); + *di_bh = NULL; ocfs2_inode_unlock(inode, meta_level); out: return ret; @@ -2177,7 +2176,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, ret = ocfs2_inode_lock_for_extent_tree(inode, &di_bh, meta_level, - overwrite_io, write_sem, wait); if (ret < 0) { @@ -2233,13 +2231,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, &di_bh, meta_level, write_sem); + meta_level = 1; + write_sem = 1; ret = ocfs2_inode_lock_for_extent_tree(inode, &di_bh, meta_level, - overwrite_io, - 1, + write_sem, wait); - write_sem = 1; if (ret < 0) { if (ret != -EAGAIN) mlog_errno(ret); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1afe57f425a0..68ba354cf361 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1066,6 +1066,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + if (replayed) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3103ba7f97a2..bfe611ed1b1d 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -597,9 +597,11 @@ static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, { struct ocfs2_inode_info *oi = OCFS2_I(inode); - oi->i_sync_tid = handle->h_transaction->t_tid; - if (datasync) - oi->i_datasync_tid = handle->h_transaction->t_tid; + if (!is_handle_aborted(handle)) { + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; + } } #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8ea51cf27b97..da65251ef815 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -586,8 +586,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } - oi->i_sync_tid = handle->h_transaction->t_tid; - oi->i_datasync_tid = handle->h_transaction->t_tid; + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: if (status < 0) { diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4180c3ef0a68..939df99d2dec 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -696,7 +696,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); - if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + if (PTR_ERR(bg_bh) == -ENOSPC) bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); diff --git a/fs/open.c b/fs/open.c index b62f5c0923a8..0788b3715731 100644 --- a/fs/open.c +++ b/fs/open.c @@ -955,48 +955,83 @@ struct file *open_with_fake_path(const struct path *path, int flags, } EXPORT_SYMBOL(open_with_fake_path); -static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) +#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) + +inline struct open_how build_open_how(int flags, umode_t mode) +{ + struct open_how how = { + .flags = 
flags & VALID_OPEN_FLAGS, + .mode = mode & S_IALLUGO, + }; + + /* O_PATH beats everything else. */ + if (how.flags & O_PATH) + how.flags &= O_PATH_FLAGS; + /* Modes should only be set for create-like flags. */ + if (!WILL_CREATE(how.flags)) + how.mode = 0; + return how; +} + +inline int build_open_flags(const struct open_how *how, struct open_flags *op) { + int flags = how->flags; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + /* Must never be set by userspace */ + flags &= ~(FMODE_NONOTIFY | O_CLOEXEC); + /* - * Clear out all open flags we don't know about so that we don't report - * them in fcntl(F_GETFD) or similar interfaces. + * Older syscalls implicitly clear all of the invalid flags or argument + * values before calling build_open_flags(), but openat2(2) checks all + * of its arguments. */ - flags &= VALID_OPEN_FLAGS; + if (flags & ~VALID_OPEN_FLAGS) + return -EINVAL; + if (how->resolve & ~VALID_RESOLVE_FLAGS) + return -EINVAL; - if (flags & (O_CREAT | __O_TMPFILE)) - op->mode = (mode & S_IALLUGO) | S_IFREG; - else + /* Deal with the mode. */ + if (WILL_CREATE(flags)) { + if (how->mode & ~S_IALLUGO) + return -EINVAL; + op->mode = how->mode | S_IFREG; + } else { + if (how->mode != 0) + return -EINVAL; op->mode = 0; - - /* Must never be set by userspace */ - flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; + } /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. + * In order to ensure programs get explicit errors when trying to use + * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it + * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we + * have to require userspace to explicitly set it. */ - if (flags & __O_SYNC) - flags |= O_DSYNC; - if (flags & __O_TMPFILE) { if ((flags & O_TMPFILE_MASK) != O_TMPFILE) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; - } else if (flags & O_PATH) { - /* - * If we have O_PATH in the open flag. Then we - * cannot have anything other than the below set of flags - */ - flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + } + if (flags & O_PATH) { + /* O_PATH only permits certain other flags to be set. */ + if (flags & ~O_PATH_FLAGS) + return -EINVAL; acc_mode = 0; } + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. 
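build_open_flags() above is noticeably stricter than the legacy open()/openat() path: unknown open flags, unknown resolve flags, and a non-zero mode without O_CREAT/O_TMPFILE are rejected with -EINVAL instead of being silently masked. A small hedged illustration of the difference from userspace, again assuming SYS_openat2 and <linux/openat2.h>:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/openat2.h>

    /*
     * open(2) quietly ignores a mode given without O_CREAT; per the
     * checks above, openat2(2) is expected to fail with EINVAL instead.
     */
    static int strictness_demo(void)
    {
            struct open_how how;

            memset(&how, 0, sizeof(how));
            how.flags = O_RDONLY;
            how.mode = 0600;        /* mode without O_CREAT/O_TMPFILE */

            return syscall(SYS_openat2, AT_FDCWD, "/etc/hostname", &how,
                           sizeof(how));
    }

This is why the older entry points go through build_open_how() first: it sanitises flags and mode the way the old syscalls always did before handing them to the stricter helper.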
+ */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ @@ -1022,6 +1057,18 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + + if (how->resolve & RESOLVE_NO_XDEV) + lookup_flags |= LOOKUP_NO_XDEV; + if (how->resolve & RESOLVE_NO_MAGICLINKS) + lookup_flags |= LOOKUP_NO_MAGICLINKS; + if (how->resolve & RESOLVE_NO_SYMLINKS) + lookup_flags |= LOOKUP_NO_SYMLINKS; + if (how->resolve & RESOLVE_BENEATH) + lookup_flags |= LOOKUP_BENEATH; + if (how->resolve & RESOLVE_IN_ROOT) + lookup_flags |= LOOKUP_IN_ROOT; + op->lookup_flags = lookup_flags; return 0; } @@ -1040,8 +1087,11 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); - return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); + if (err) + return ERR_PTR(err); + return do_filp_open(AT_FDCWD, name, &op); } /** @@ -1072,17 +1122,19 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +static long do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(flags, mode, &op); + int fd = build_open_flags(how, &op); struct filename *tmp; if (fd) @@ -1092,7 +1144,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(flags); + fd = get_unused_fd_flags(how->flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { @@ -1107,12 +1159,16 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) return fd; } -SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { - if (force_o_largefile()) - flags |= O_LARGEFILE; + struct open_how how = build_open_how(flags, mode); + return do_sys_openat2(dfd, filename, &how); +} - return do_sys_open(AT_FDCWD, filename, flags, mode); + +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return ksys_open(filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, @@ -1120,10 +1176,32 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, { if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); } +SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, + struct open_how __user *, how, size_t, usize) +{ + int err; + struct open_how tmp; + + BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); + + if (unlikely(usize < OPEN_HOW_SIZE_VER0)) + return -EINVAL; + + err = 
copy_struct_from_user(&tmp, sizeof(tmp), how, usize); + if (err) + return err; + + /* O_LARGEFILE is only allowed for non-O_PATH. */ + if (!(tmp.flags & O_PATH) && force_o_largefile()) + tmp.flags |= O_LARGEFILE; + + return do_sys_openat2(dfd, filename, &tmp); +} + #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b801c6353100..9fc47c2e078d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -24,7 +24,7 @@ static int ovl_ccup_set(const char *buf, const struct kernel_param *param) { - pr_warn("overlayfs: \"check_copy_up\" module option is obsolete\n"); + pr_warn("\"check_copy_up\" module option is obsolete\n"); return 0; } @@ -123,6 +123,9 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) loff_t old_pos = 0; loff_t new_pos = 0; loff_t cloned; + loff_t data_pos = -1; + loff_t hole_len; + bool skip_hole = false; int error = 0; if (len == 0) @@ -144,7 +147,11 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) goto out; /* Couldn't clone, so now we try to copy the data */ - /* FIXME: copy up sparse files efficiently */ + /* Check if lower fs supports seek operation */ + if (old_file->f_mode & FMODE_LSEEK && + old_file->f_op->llseek) + skip_hole = true; + while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; long bytes; @@ -157,6 +164,36 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) break; } + /* + * Fill zero for hole will cost unnecessary disk space + * and meanwhile slow down the copy-up speed, so we do + * an optimization for hole during copy-up, it relies + * on SEEK_DATA implementation in lower fs so if lower + * fs does not support it, copy-up will behave as before. + * + * Detail logic of hole detection as below: + * When we detect next data position is larger than current + * position we will skip that hole, otherwise we copy + * data in the size of OVL_COPY_UP_CHUNK_SIZE. Actually, + * it may not recognize all kind of holes and sometimes + * only skips partial of hole area. However, it will be + * enough for most of the use cases. + */ + + if (skip_hole && data_pos < old_pos) { + data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA); + if (data_pos > old_pos) { + hole_len = data_pos - old_pos; + len -= hole_len; + old_pos = new_pos = data_pos; + continue; + } else if (data_pos == -ENXIO) { + break; + } else if (data_pos < 0) { + skip_hole = false; + } + } + bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); @@ -227,13 +264,17 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; - int fh_type, fh_len, dwords; - void *buf; + int fh_type, dwords; int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &real->d_sb->s_uuid; + int err; - buf = kmalloc(buflen, GFP_KERNEL); - if (!buf) + /* Make sure the real fid stays 32bit aligned */ + BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4); + BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255); + + fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL); + if (!fh) return ERR_PTR(-ENOMEM); /* @@ -242,27 +283,19 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * the price or reconnecting the dentry. 
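The ovl_copy_up_data() hunk above skips holes by probing the lower file with SEEK_DATA and only copying real data extents, falling back to the old dense copy when the lower filesystem lacks SEEK_DATA support. The same probe, written as a standalone userspace sketch to make the "jump to the next data extent" step concrete (illustrative, not overlayfs code):

    #include <errno.h>
    #include <sys/types.h>
    #include <unistd.h>

    /*
     * Return the offset of the next data extent at or after 'pos', the
     * file size if only a trailing hole remains (SEEK_DATA gives ENXIO),
     * or -1 if SEEK_DATA is unsupported and the caller must copy densely.
     */
    static off_t next_data(int fd, off_t pos)
    {
            off_t data = lseek(fd, pos, SEEK_DATA);

            if (data >= 0)
                    return data;    /* equals 'pos' when 'pos' is already in data */
            if (errno == ENXIO)
                    return lseek(fd, 0, SEEK_END);  /* rest of file is a hole */
            return -1;              /* e.g. EINVAL: filesystem has no SEEK_DATA */
    }

As the in-kernel comment notes, this does not have to find every hole to be useful: skipping even part of a large hole avoids allocating zero-filled blocks in the upper layer and speeds up copy-up.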
*/ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, buf, &dwords, 0); + fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); buflen = (dwords << 2); - fh = ERR_PTR(-EIO); + err = -EIO; if (WARN_ON(fh_type < 0) || WARN_ON(buflen > MAX_HANDLE_SZ) || WARN_ON(fh_type == FILEID_INVALID)) - goto out; - - BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); - fh_len = offsetof(struct ovl_fh, fid) + buflen; - fh = kmalloc(fh_len, GFP_KERNEL); - if (!fh) { - fh = ERR_PTR(-ENOMEM); - goto out; - } + goto out_err; - fh->version = OVL_FH_VERSION; - fh->magic = OVL_FH_MAGIC; - fh->type = fh_type; - fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->fb.version = OVL_FH_VERSION; + fh->fb.magic = OVL_FH_MAGIC; + fh->fb.type = fh_type; + fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN; /* * When we will want to decode an overlay dentry from this handle * and all layers are on the same fs, if we get a disconncted real @@ -270,14 +303,15 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * it to upperdentry or to lowerstack is by checking this flag. */ if (is_upper) - fh->flags |= OVL_FH_FLAG_PATH_UPPER; - fh->len = fh_len; - fh->uuid = *uuid; - memcpy(fh->fid, buf, buflen); + fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; + fh->fb.len = sizeof(fh->fb) + buflen; + fh->fb.uuid = *uuid; -out: - kfree(buf); return fh; + +out_err: + kfree(fh); + return ERR_PTR(err); } int ovl_set_origin(struct dentry *dentry, struct dentry *lower, @@ -300,8 +334,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, /* * Do not fail when upper doesn't support xattrs. */ - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, - fh ? fh->len : 0, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf, + fh ? fh->fb.len : 0, 0); kfree(fh); return err; @@ -317,7 +351,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0); + err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0); kfree(fh); return err; @@ -483,7 +517,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) } inode_lock(temp->d_inode); - if (c->metacopy) + if (S_ISREG(c->stat.mode)) err = ovl_set_size(temp, &c->stat); if (!err) err = ovl_set_attr(temp, &c->stat); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 702aa63f6774..8e57d5372b8f 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -35,7 +35,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) dput(wdentry); if (err) { - pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + pr_err("cleanup of '%pd2' failed (%i)\n", wdentry, err); } @@ -53,7 +53,7 @@ static struct dentry *ovl_lookup_temp(struct dentry *workdir) temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { - pr_err("overlayfs: workdir/%s already exists\n", name); + pr_err("workdir/%s already exists\n", name); dput(temp); temp = ERR_PTR(-EIO); } @@ -134,7 +134,7 @@ static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, d = lookup_one_len(dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { - pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n", + pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", dentry, err); return PTR_ERR(d); } @@ -267,7 +267,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); if (inode != oip.newinode) { - 
pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n", + pr_warn_ratelimited("newly created inode found in cache (%pd2)\n", dentry); } @@ -1009,7 +1009,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + pr_warn_ratelimited("failed to set redirect (%i)\n", err); /* Fall back to userspace copy-up */ err = -EXDEV; @@ -1170,7 +1170,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (newdentry == trap) goto out_dput; - if (WARN_ON(olddentry->d_inode == newdentry->d_inode)) + if (olddentry->d_inode == newdentry->d_inode) goto out_dput; err = 0; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 73c9775215b3..6f54d70cef27 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -30,7 +30,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry) } if (err) { - pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n", + pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); } @@ -211,10 +211,11 @@ static int ovl_check_encode_origin(struct dentry *dentry) return 1; } -static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) +static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; + int len; /* * Check if we should encode a lower or upper file handle and maybe @@ -231,43 +232,29 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) return PTR_ERR(fh); err = -EOVERFLOW; - if (fh->len > buflen) + len = OVL_FH_LEN(fh); + if (len > buflen) goto fail; - memcpy(buf, (char *)fh, fh->len); - err = fh->len; + memcpy(fid, fh, len); + err = len; out: kfree(fh); return err; fail: - pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", - dentry, err, buflen, fh ? (int)fh->len : 0, - fh ? fh->type : 0); + pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", + dentry, err, buflen, fh ? (int)fh->fb.len : 0, + fh ? 
fh->fb.type : 0); goto out; } -static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) -{ - int res, len = *max_len << 2; - - res = ovl_d_to_fh(dentry, (char *)fid, len); - if (res <= 0) - return FILEID_INVALID; - - len = res; - - /* Round up to dwords */ - *max_len = (len + 3) >> 2; - return OVL_FILEID; -} - static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct dentry *dentry; - int type; + int bytes = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) @@ -277,10 +264,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, if (WARN_ON(!dentry)) return FILEID_INVALID; - type = ovl_dentry_to_fh(dentry, fid, max_len); - + bytes = ovl_dentry_to_fid(dentry, fid, bytes); dput(dentry); - return type; + if (bytes <= 0) + return FILEID_INVALID; + + *max_len = bytes >> 2; + + return OVL_FILEID_V1; } /* @@ -367,7 +358,7 @@ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) */ static struct dentry *ovl_lookup_real_one(struct dentry *connected, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct inode *dir = d_inode(connected); struct dentry *this, *parent = NULL; @@ -415,7 +406,7 @@ out: return this; fail: - pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); this = ERR_PTR(err); goto out; @@ -423,17 +414,16 @@ fail: static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer); + const struct ovl_layer *layer); /* * Lookup an indexed or hashed overlay dentry by real inode. */ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct ovl_fs *ofs = sb->s_fs_info; - struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; struct dentry *index = NULL; struct dentry *this = NULL; struct inode *inode; @@ -475,7 +465,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, * recursive call walks back from indexed upper to the topmost * connected/hashed upper parent (or up to root). 
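The export hunks above encode an overlay dentry into an opaque file handle (ovl_encode_fh(), now emitting the aligned OVL_FILEID_V1 layout), and the decode side further down turns such a handle back into a dentry. This is the same mechanism that backs NFS export and the userspace handle syscalls; a hedged sketch of the round trip from userspace, assuming _GNU_SOURCE exposes struct file_handle, MAX_HANDLE_SZ, name_to_handle_at() and open_by_handle_at() (the latter needs CAP_DAC_READ_SEARCH):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>

    /* Encode a path into a file handle, then reopen the file from it. */
    static int reopen_via_handle(const char *path, int mount_fd)
    {
            struct file_handle *fh;
            int mount_id, fd;

            fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
            if (!fh)
                    return -1;
            fh->handle_bytes = MAX_HANDLE_SZ;

            /* Goes through the filesystem's ->encode_fh(). */
            if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) < 0) {
                    free(fh);
                    return -1;
            }

            /* Goes through ->fh_to_dentry() on the way back. */
            fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
            free(fh);
            return fd;
    }

For overlayfs the interesting part of this patch is that handles encoded with the older, unaligned OVL_FILEID_V0 layout are still accepted on decode, which is what the ovl_fid_to_fh() helper below takes care of.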
*/ - this = ovl_lookup_real(sb, upper, &upper_layer); + this = ovl_lookup_real(sb, upper, &ofs->layers[0]); dput(upper); } @@ -496,7 +486,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, */ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct dentry *next, *parent = NULL; struct dentry *ancestor = ERR_PTR(-EIO); @@ -549,7 +539,7 @@ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, */ static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct dentry *connected; int err = 0; @@ -640,7 +630,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb, return connected; fail: - pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); dput(connected); return ERR_PTR(err); @@ -655,8 +645,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb, struct dentry *index) { struct ovl_fs *ofs = sb->s_fs_info; - struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; - struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer; + const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; struct dentry *real = upper ?: (index ?: lowerpath->dentry); /* @@ -777,24 +766,45 @@ out_err: goto out; } +static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) +{ + struct ovl_fh *fh; + + /* If on-wire inner fid is aligned - nothing to do */ + if (fh_type == OVL_FILEID_V1) + return (struct ovl_fh *)fid; + + if (fh_type != OVL_FILEID_V0) + return ERR_PTR(-EINVAL); + + fh = kzalloc(buflen, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + /* Copy unaligned inner fh into aligned buffer */ + memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + return fh; +} + static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct dentry *dentry = NULL; - struct ovl_fh *fh = (struct ovl_fh *) fid; + struct ovl_fh *fh = NULL; int len = fh_len << 2; unsigned int flags = 0; int err; - err = -EINVAL; - if (fh_type != OVL_FILEID) + fh = ovl_fid_to_fh(fid, len, fh_type); + err = PTR_ERR(fh); + if (IS_ERR(fh)) goto out_err; err = ovl_check_fh_len(fh, len); if (err) goto out_err; - flags = fh->flags; + flags = fh->fb.flags; dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? 
ovl_upper_fh_to_d(sb, fh) : ovl_lower_fh_to_d(sb, fh); @@ -802,18 +812,24 @@ static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, if (IS_ERR(dentry) && err != -ESTALE) goto out_err; +out: + /* We may have needed to re-align OVL_FILEID_V0 */ + if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) + kfree(fh); + return dentry; out_err: - pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", - len, fh_type, flags, err); - return ERR_PTR(err); + pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", + fh_len, fh_type, flags, err); + dentry = ERR_PTR(err); + goto out; } static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); + pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); return ERR_PTR(-EACCES); } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index e235a635d9ec..a5317216de73 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,8 +9,19 @@ #include <linux/xattr.h> #include <linux/uio.h> #include <linux/uaccess.h> +#include <linux/splice.h> +#include <linux/mm.h> +#include <linux/fs.h> #include "overlayfs.h" +struct ovl_aio_req { + struct kiocb iocb; + struct kiocb *orig_iocb; + struct fd fd; +}; + +static struct kmem_cache *ovl_aio_request_cachep; + static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -146,7 +157,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); struct fd real; const struct cred *old_cred; - ssize_t ret; + loff_t ret; /* * The two special cases below do not need to involve real fs, @@ -171,7 +182,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) * limitations that are more strict than ->s_maxbytes for specific * files, so we use the real file to perform seeks. 
*/ - inode_lock(inode); + ovl_inode_lock(inode); real.file->f_pos = file->f_pos; old_cred = ovl_override_creds(inode->i_sb); @@ -179,7 +190,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) revert_creds(old_cred); file->f_pos = real.file->f_pos; - inode_unlock(inode); + ovl_inode_unlock(inode); fdput(real); @@ -225,6 +236,33 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) return flags; } +static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) +{ + struct kiocb *iocb = &aio_req->iocb; + struct kiocb *orig_iocb = aio_req->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(orig_iocb->ki_filp); + + file_end_write(iocb->ki_filp); + ovl_copyattr(ovl_inode_real(inode), inode); + } + + orig_iocb->ki_pos = iocb->ki_pos; + fdput(aio_req->fd); + kmem_cache_free(ovl_aio_request_cachep, aio_req); +} + +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) +{ + struct ovl_aio_req *aio_req = container_of(iocb, + struct ovl_aio_req, iocb); + struct kiocb *orig_iocb = aio_req->orig_iocb; + + ovl_aio_cleanup_handler(aio_req); + orig_iocb->ki_complete(orig_iocb, res, res2); +} + static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -240,10 +278,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + aio_req->fd = real; + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: revert_creds(old_cred); - ovl_file_accessed(file); fdput(real); @@ -274,15 +330,33 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); - file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); - file_end_write(real.file); + if (is_sync_kiocb(iocb)) { + file_start_write(real.file); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + file_start_write(real.file); + aio_req->fd = real; + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: revert_creds(old_cred); - - /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); - fdput(real); out_unlock: @@ -291,6 +365,48 @@ out_unlock: return ret; } +static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct fd real; + const struct cred *old_cred; + + 
ret = ovl_real_fdget(in, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(in)->i_sb); + ret = generic_file_splice_read(real.file, ppos, pipe, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(in); + fdput(real); + return ret; +} + +static ssize_t +ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + ret = ovl_real_fdget(out, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(out)->i_sb); + ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(out); + fdput(real); + return ret; +} + static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -647,7 +763,25 @@ const struct file_operations ovl_file_operations = { .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, + .splice_read = ovl_splice_read, + .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; + +int __init ovl_aio_request_cache_init(void) +{ + ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", + sizeof(struct ovl_aio_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ovl_aio_request_cachep) + return -ENOMEM; + + return 0; +} + +void ovl_aio_request_cache_destroy(void) +{ + kmem_cache_destroy(ovl_aio_request_cachep); +} diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bc14781886bf..79e8994e3bc1 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -75,10 +75,9 @@ out: return err; } -static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, - struct ovl_layer *lower_layer) +static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { - bool samefs = ovl_same_sb(dentry->d_sb); + bool samefs = ovl_same_fs(dentry->d_sb); unsigned int xinobits = ovl_xino_bits(dentry->d_sb); if (samefs) { @@ -100,12 +99,10 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, * persistent for a given layer configuration. */ if (stat->ino >> shift) { - pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", dentry, stat->ino, xinobits); } else { - if (lower_layer) - stat->ino |= ((u64)lower_layer->fsid) << shift; - + stat->ino |= ((u64)fsid) << shift; stat->dev = dentry->d_sb->s_dev; return 0; } @@ -124,15 +121,14 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; - } else if (lower_layer && lower_layer->fsid) { + } else { /* * For non-samefs setup, if we cannot map all layers st_ino * to a unified address space, we need to make sure that st_dev - * is unique per lower fs. Upper layer uses real st_dev and - * lower layers use the unique anonymous bdev assigned to the - * lower fs. + * is unique per underlying fs, so we use the unique anonymous + * bdev assigned to the underlying fs. 
*/ - stat->dev = lower_layer->fs->pseudo_dev; + stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; } return 0; @@ -146,8 +142,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, struct path realpath; const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); - bool samefs = ovl_same_sb(dentry->d_sb); - struct ovl_layer *lower_layer = NULL; + int fsid = 0; int err; bool metacopy_blocks = false; @@ -168,9 +163,9 @@ int ovl_getattr(const struct path *path, struct kstat *stat, * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ - if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) { + if (!is_dir || ovl_same_dev(dentry->d_sb)) { if (!OVL_TYPE_UPPER(type)) { - lower_layer = ovl_layer_lower(dentry); + fsid = ovl_layer_lower(dentry)->fsid; } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | STATX_BLOCKS | @@ -200,8 +195,8 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { + fsid = ovl_layer_lower(dentry)->fsid; stat->ino = lowerstat.ino; - lower_layer = ovl_layer_lower(dentry); } /* @@ -235,7 +230,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, } } - err = ovl_map_dev_ino(dentry, stat, lower_layer); + err = ovl_map_dev_ino(dentry, stat, fsid); if (err) goto out; @@ -521,6 +516,27 @@ static const struct address_space_operations ovl_aops = { * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) * [...] &type->i_mutex_dir_key (stack_depth=0) + * + * Locking order w.r.t ovl_want_write() is important for nested overlayfs. + * + * This chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * And this chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - lowerinode->i_rwsem (inode_lock[1]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is + * held, because it is in reverse order of the non-nested case using the same + * upper fs: + * - inode->i_rwsem (inode_lock[1]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[1]) */ #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH @@ -559,7 +575,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). 
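[Illustrative sketch, not part of the patch] ovl_map_dev_ino() above packs the layer's fsid into the topmost xinobits of st_ino whenever the overlay can present a single st_dev. A standalone sketch of that packing with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int xinobits = 2;            /* enough for fsid values 0..3 */
	unsigned int shift = 64 - xinobits;   /* high bits reserved for fsid */
	uint64_t ino = 0x1234;                /* real inode number on the layer */
	int fsid = 1;                         /* 0 is reserved for the upper fs */

	if (ino >> shift) {
		/* real ino already uses the reserved bits: warn, keep ino as-is */
		printf("ino too big for xino, would fall back\n");
	} else {
		uint64_t st_ino = ino | ((uint64_t)fsid << shift);
		printf("overlay st_ino = 0x%016llx\n",
		       (unsigned long long)st_ino);
	}
	return 0;
}

Because fsid 0 is reserved for the upper fs, upper inodes keep their real inode number unchanged under this mapping.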
*/ - if (ovl_same_sb(inode->i_sb) || xinobits) { + if (ovl_same_dev(inode->i_sb)) { inode->i_ino = ino; if (xinobits && fsid && !(ino >> (64 - xinobits))) inode->i_ino |= (unsigned long)fsid << (64 - xinobits); @@ -692,7 +708,7 @@ unsigned int ovl_get_nlink(struct dentry *lowerdentry, return nlink; fail: - pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n", + pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", upperdentry, err); return fallback; } @@ -963,7 +979,7 @@ out: return inode; out_err: - pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err); + pr_warn_ratelimited("failed to get inode (%i)\n", err); inode = ERR_PTR(err); goto out; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c269d6033525..ed9e129fae04 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -84,21 +84,21 @@ static int ovl_acceptable(void *ctx, struct dentry *dentry) * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { - if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len) + if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; - if (fh->magic != OVL_FH_MAGIC) + if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ - if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ - if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && - (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; @@ -119,15 +119,15 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) if (res == 0) return NULL; - fh = kzalloc(res, GFP_KERNEL); + fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); - res = vfs_getxattr(dentry, name, fh, res); + res = vfs_getxattr(dentry, name, fh->buf, res); if (res < 0) goto fail; - err = ovl_check_fh_len(fh, res); + err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; @@ -141,10 +141,10 @@ out: return NULL; fail: - pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res); + pr_warn_ratelimited("failed to get origin (%i)\n", res); goto out; invalid: - pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh); + pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh); goto out; } @@ -158,12 +158,12 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. */ - if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid)) return NULL; - bytes = (fh->len - offsetof(struct ovl_fh, fid)); - real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, - bytes >> 2, (int)fh->type, + bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); + real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, + bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* @@ -173,7 +173,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * index entries correctly. 
*/ if (real == ERR_PTR(-ESTALE) && - !(fh->flags & OVL_FH_FLAG_PATH_UPPER)) + !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } @@ -322,8 +322,16 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *origin = NULL; int i; - for (i = 0; i < ofs->numlower; i++) { - origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, + for (i = 1; i < ofs->numlayer; i++) { + /* + * If lower fs uuid is not unique among lower fs we cannot match + * fh->uuid to layer. + */ + if (ofs->layers[i].fsid && + ofs->layers[i].fs->bad_uuid) + continue; + + origin = ovl_decode_real_fh(fh, ofs->layers[i].mnt, connected); if (origin) break; @@ -346,13 +354,13 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, } **stackp = (struct ovl_path){ .dentry = origin, - .layer = &ofs->lower_layers[i] + .layer = &ofs->layers[i] }; return 0; invalid: - pr_warn_ratelimited("overlayfs: invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", + pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); @@ -400,7 +408,7 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name, if (IS_ERR(ofh)) return PTR_ERR(ofh); - if (fh->len != ofh->len || memcmp(fh, ofh, fh->len)) + if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); @@ -431,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(dentry, name, fh, fh->len, 0); + err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0); if (err) goto fail; @@ -441,7 +449,7 @@ out: fail: inode = d_inode(real); - pr_warn_ratelimited("overlayfs: failed to verify %s (%pd2, ino=%lu, err=%i)\n", + pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? 
inode->i_ino : 0, err); goto out; @@ -467,7 +475,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) return upper ?: ERR_PTR(-ESTALE); if (!d_is_dir(upper)) { - pr_warn_ratelimited("overlayfs: invalid index upper (%pd2, upper=%pd2).\n", + pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n", index, upper); dput(upper); return ERR_PTR(-EIO); @@ -505,20 +513,20 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) goto fail; err = -EINVAL; - if (index->d_name.len < sizeof(struct ovl_fh)*2) + if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_KERNEL); + fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; - if (hex2bin((u8 *)fh, index->d_name.name, len)) + if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; - err = ovl_check_fh_len(fh, len); + err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; @@ -581,12 +589,12 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n", + pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n", index, d_inode(index)->i_mode & S_IFMT, err); goto out; orphan: - pr_warn_ratelimited("overlayfs: orphan index entry (%pd2, ftype=%x, nlink=%u)\n", + pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(index)->i_nlink); err = -ENOENT; @@ -597,11 +605,11 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) { char *n, *s; - n = kcalloc(fh->len, 2, GFP_KERNEL); + n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; - s = bin2hex(n, fh, fh->len); + s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; @@ -688,7 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" + pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); @@ -715,13 +723,13 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, * unlinked, which means that finding a lower origin on lookup * whose index is a whiteout should be treated as an error. 
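[Illustrative sketch, not part of the patch] ovl_get_index_name_fh() above builds the index directory entry name by hex-encoding the origin file handle, and ovl_verify_index() reverses it with hex2bin(). A rough userspace illustration of the encoding; the handle bytes are invented and the local to_hex() merely stands in for the kernel's bin2hex():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hex-encode a binary file handle the way the index entry name is built */
static void to_hex(char *dst, const uint8_t *src, size_t len)
{
	static const char hex[] = "0123456789abcdef";
	for (size_t i = 0; i < len; i++) {
		dst[2 * i]     = hex[src[i] >> 4];
		dst[2 * i + 1] = hex[src[i] & 0xf];
	}
	dst[2 * len] = '\0';
}

int main(void)
{
	/* fake "file handle" bytes; the real one starts with the ovl_fb header */
	uint8_t fh[] = { 0x00, 0xfb, 0x08, 0x00, 0x01, 0xde, 0xad, 0xbe };
	char name[2 * sizeof(fh) + 1];

	to_hex(name, fh, sizeof(fh));
	printf("index entry name: %s (len %zu = 2 * handle len)\n",
	       name, strlen(name));
	return 0;
}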
*/ - pr_warn_ratelimited("overlayfs: bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", + pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); goto fail; } else if (is_dir && verify) { if (!upper) { - pr_warn_ratelimited("overlayfs: suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", + pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", origin, index); goto fail; } @@ -730,7 +738,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, err = ovl_verify_upper(index, upper, false); if (err) { if (err == -ESTALE) { - pr_warn_ratelimited("overlayfs: suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", + pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", upper, origin, index); } goto fail; @@ -877,7 +885,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; - stack = kcalloc(ofs->numlower, sizeof(struct ovl_path), + stack = kcalloc(ofs->numlayer - 1, sizeof(struct ovl_path), GFP_KERNEL); if (!stack) goto out_put_upper; @@ -959,7 +967,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, */ err = -EPERM; if (d.redirect && !ofs->config.redirect_follow) { - pr_warn_ratelimited("overlayfs: refusing to follow redirect for (%pd2)\n", + pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", dentry); goto out_put; } @@ -986,7 +994,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = -EPERM; if (!ofs->config.metacopy) { - pr_warn_ratelimited("overlay: refusing to follow metacopy origin for (%pd2)\n", + pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); goto out_put; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf030f0..3623d28aa4fa 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,6 +9,9 @@ #include <linux/fs.h> #include "ovl_entry.h" +#undef pr_fmt +#define pr_fmt(fmt) "overlayfs: " fmt + enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), @@ -71,20 +74,36 @@ enum ovl_entry_flag { #error Endianness not defined #endif -/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */ -#define OVL_FILEID 0xfb +/* The type used to be returned by overlay exportfs for misaligned fid */ +#define OVL_FILEID_V0 0xfb +/* The type returned by overlay exportfs for 32bit aligned fid */ +#define OVL_FILEID_V1 0xf8 -/* On-disk and in-memeory format for redirect by file handle */ -struct ovl_fh { +/* On-disk format for "origin" file handle */ +struct ovl_fb { u8 version; /* 0 */ u8 magic; /* 0xfb */ u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ - u8 fid[0]; /* file identifier */ + u32 fid[0]; /* file identifier should be 32bit aligned in-memory */ +} __packed; + +/* In-memory and on-wire format for overlay file handle */ +struct ovl_fh { + u8 padding[3]; /* make sure fb.fid is 32bit aligned */ + union { + struct ovl_fb fb; + u8 buf[0]; + }; } __packed; +#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) +#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) +#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ + offsetof(struct ovl_fb, fid)) + static inline int ovl_do_rmdir(struct inode *dir, 
struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); @@ -205,7 +224,6 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); -struct super_block *ovl_same_sb(struct super_block *sb); int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -221,7 +239,7 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); -struct ovl_layer *ovl_layer_lower(struct dentry *dentry); +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); @@ -283,11 +301,21 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } -static inline unsigned int ovl_xino_bits(struct super_block *sb) +/* All layers on same fs? */ +static inline bool ovl_same_fs(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + return OVL_FS(sb)->xino_mode == 0; +} - return ofs->xino_bits; +/* All overlay inodes have same st_dev? */ +static inline bool ovl_same_dev(struct super_block *sb) +{ + return OVL_FS(sb)->xino_mode >= 0; +} + +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0; } static inline int ovl_inode_lock(struct inode *inode) @@ -302,7 +330,13 @@ static inline void ovl_inode_unlock(struct inode *inode) /* namei.c */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); + +static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +{ + return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); +} + struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, @@ -416,6 +450,8 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ extern const struct file_operations ovl_file_operations; +int __init ovl_aio_request_cache_init(void); +void ovl_aio_request_cache_destroy(void); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index a8279280e88d..89015ea822e7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -22,6 +22,10 @@ struct ovl_config { struct ovl_sb { struct super_block *sb; dev_t pseudo_dev; + /* Unusable (conflicting) uuid */ + bool bad_uuid; + /* Used as a lower layer (but maybe also as upper) */ + bool is_lower; }; struct ovl_layer { @@ -36,18 +40,18 @@ struct ovl_layer { }; struct ovl_path { - struct ovl_layer *layer; + const struct ovl_layer *layer; struct dentry *dentry; }; /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - unsigned int numlower; - /* Number of unique lower sb that differ from upper sb */ - unsigned int numlowerfs; - struct ovl_layer *lower_layers; - struct ovl_sb *lower_fs; + unsigned int numlayer; + /* Number of unique fs among layers including upper fs */ + unsigned int numfs; + const struct ovl_layer 
*layers; + struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -69,10 +73,15 @@ struct ovl_fs { struct inode *workbasedir_trap; struct inode *workdir_trap; struct inode *indexdir_trap; - /* Inode numbers in all layers do not use the high xino_bits */ - unsigned int xino_bits; + /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ + int xino_mode; }; +static inline struct ovl_fs *OVL_FS(struct super_block *sb) +{ + return (struct ovl_fs *)sb->s_fs_info; +} + /* private information held for every overlayfs dentry */ struct ovl_entry { union { diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 47a91c9733a5..40ac9ce2465a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -441,7 +441,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, const char *name, int namelen) { if (ino >> (64 - xinobits)) { - pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", namelen, name, ino, xinobits); return ino; } @@ -469,7 +469,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) int xinobits = ovl_xino_bits(dir->d_sb); int err = 0; - if (!ovl_same_sb(dir->d_sb) && !xinobits) + if (!ovl_same_dev(dir->d_sb)) goto out; if (p->name[0] == '.') { @@ -504,7 +504,13 @@ get: if (err) goto fail; - WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + /* + * Directory inode is always on overlay st_dev. + * Non-dir with ovl_same_dev() could be on pseudo st_dev in case + * of xino bits overflow. + */ + WARN_ON_ONCE(S_ISDIR(stat.mode) && + dir->d_sb->s_dev != stat.dev); ino = stat.ino; } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, @@ -518,7 +524,7 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } @@ -685,7 +691,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; - struct ovl_layer *lower_layer = ovl_layer_lower(dir); + const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, @@ -738,7 +744,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) * entries. 
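[Illustrative sketch, not part of the patch] The new xino_mode field collapses the old xino_bits into three states (-1 xino disabled, 0 all layers on one fs, 1..32 high inode bits reserved for fsid), and ovl_same_fs()/ovl_same_dev()/ovl_xino_bits() in overlayfs.h are thin tests on it. A compact truth-table sketch mirroring those semantics outside the kernel:

#include <stdio.h>
#include <stdbool.h>

/* mirrors the semantics of the new helpers, outside the kernel */
static bool same_fs(int xino_mode)  { return xino_mode == 0; }
static bool same_dev(int xino_mode) { return xino_mode >= 0; }
static unsigned int xino_bits(int xino_mode)
{
	return same_dev(xino_mode) ? (unsigned int)xino_mode : 0;
}

int main(void)
{
	int modes[] = { -1, 0, 2, 32 };

	for (size_t i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
		int m = modes[i];
		printf("xino_mode=%3d  same_fs=%d  same_dev=%d  xino_bits=%u\n",
		       m, same_fs(m), same_dev(m), xino_bits(m));
	}
	return 0;
}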
*/ if (ovl_xino_bits(dentry->d_sb) || - (ovl_same_sb(dentry->d_sb) && + (ovl_same_fs(dentry->d_sb) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { return ovl_iterate_real(file, ctx); @@ -965,7 +971,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) dentry = lookup_one_len(p->name, upper, p->len); if (IS_ERR(dentry)) { - pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, (int) PTR_ERR(dentry)); continue; @@ -1147,6 +1153,6 @@ next: out: ovl_cache_free(&list); if (err) - pr_err("overlayfs: failed index dir cleanup (%i)\n", err); + pr_err("failed index dir cleanup (%i)\n", err); return err; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index afbcb116a7f1..319fe0d355b0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -224,14 +224,14 @@ static void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ofs->upper_mnt->mnt_root); mntput(ofs->upper_mnt); - for (i = 0; i < ofs->numlower; i++) { - iput(ofs->lower_layers[i].trap); - mntput(ofs->lower_layers[i].mnt); + for (i = 1; i < ofs->numlayer; i++) { + iput(ofs->layers[i].trap); + mntput(ofs->layers[i].mnt); } - for (i = 0; i < ofs->numlowerfs; i++) - free_anon_bdev(ofs->lower_fs[i].pseudo_dev); - kfree(ofs->lower_layers); - kfree(ofs->lower_fs); + kfree(ofs->layers); + for (i = 0; i < ofs->numfs; i++) + free_anon_bdev(ofs->fs[i].pseudo_dev); + kfree(ofs->fs); kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); @@ -358,7 +358,7 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); - if (ofs->config.xino != ovl_xino_def()) + if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb)) seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", @@ -462,7 +462,7 @@ static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) if (ovl_redirect_always_follow) config->redirect_follow = true; } else if (strcmp(mode, "nofollow") != 0) { - pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + pr_err("bad mount option \"redirect_dir=%s\"\n", mode); return -EINVAL; } @@ -560,14 +560,15 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: - pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); + pr_err("unrecognized mount option \"%s\" or missing value\n", + p); return -EINVAL; } } /* Workdir is useless in non-upper mount */ if (!config->upperdir && config->workdir) { - pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", config->workdir); kfree(config->workdir); config->workdir = NULL; @@ -587,7 +588,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) /* Resolve metacopy -> redirect_dir dependency */ if (config->metacopy && !config->redirect_dir) { if (metacopy_opt && redirect_opt) { - pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n", + pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", config->redirect_mode); return -EINVAL; } @@ -596,7 +597,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) * There was an explicit redirect_dir=... that resulted * in this conflict. 
*/ - pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n", + pr_info("disabling metacopy due to redirect_dir=%s\n", config->redirect_mode); config->metacopy = false; } else { @@ -692,7 +693,7 @@ out_unlock: out_dput: dput(work); out_err: - pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n", ofs->config.workdir, name, -err); work = NULL; goto out_unlock; @@ -716,21 +717,21 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) int err = -EINVAL; if (!*name) { - pr_err("overlayfs: empty lowerdir\n"); + pr_err("empty lowerdir\n"); goto out; } err = kern_path(name, LOOKUP_FOLLOW, path); if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + pr_err("failed to resolve '%s': %i\n", name, err); goto out; } err = -EINVAL; if (ovl_dentry_weird(path->dentry)) { - pr_err("overlayfs: filesystem on '%s' not supported\n", name); + pr_err("filesystem on '%s' not supported\n", name); goto out_put; } if (!d_is_dir(path->dentry)) { - pr_err("overlayfs: '%s' not a directory\n", name); + pr_err("'%s' not a directory\n", name); goto out_put; } return 0; @@ -752,7 +753,7 @@ static int ovl_mount_dir(const char *name, struct path *path) if (!err) if (ovl_dentry_remote(path->dentry)) { - pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + pr_err("filesystem on '%s' not supported as upperdir\n", tmp); path_put_init(path); err = -EINVAL; @@ -769,7 +770,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, int err = vfs_statfs(path, &statfs); if (err) - pr_err("overlayfs: statfs failed on '%s'\n", name); + pr_err("statfs failed on '%s'\n", name); else ofs->namelen = max(ofs->namelen, statfs.f_namelen); @@ -804,13 +805,13 @@ static int ovl_lower_dir(const char *name, struct path *path, (ofs->config.index && ofs->config.upperdir)) && !fh_type) { ofs->config.index = false; ofs->config.nfs_export = false; - pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", + pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } /* Check if lower fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) - ofs->xino_bits = 0; + ofs->xino_mode = -1; return 0; @@ -996,7 +997,7 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, err = PTR_ERR_OR_ZERO(trap); if (err) { if (err == -ELOOP) - pr_err("overlayfs: conflicting %s path\n", name); + pr_err("conflicting %s path\n", name); return err; } @@ -1013,11 +1014,11 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) { if (ofs->config.index) { - pr_err("overlayfs: %s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", + pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", name); return -EBUSY; } else { - pr_warn("overlayfs: %s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", + pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", name); return 0; } @@ -1035,7 +1036,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, /* Upper fs should not be r/o */ if 
(sb_rdonly(upperpath->mnt->mnt_sb)) { - pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + pr_err("upper fs is r/o, try multi-lower layers mount\n"); err = -EINVAL; goto out; } @@ -1052,7 +1053,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_mnt = clone_private_mount(upperpath); err = PTR_ERR(upper_mnt); if (IS_ERR(upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); + pr_err("failed to clone upperpath\n"); goto out; } @@ -1108,7 +1109,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, * kernel upgrade. So warn instead of erroring out. */ if (!err) - pr_warn("overlayfs: upper fs needs to support d_type.\n"); + pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); @@ -1116,7 +1117,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, if (ofs->tmpfile) dput(temp); else - pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + pr_warn("upper fs does not support tmpfile.\n"); /* * Check if upper/work fs supports trusted.overlay.* xattr @@ -1126,7 +1127,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->noxattr = true; ofs->config.index = false; ofs->config.metacopy = false; - pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); + pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); err = 0; } else { vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); @@ -1136,16 +1137,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { ofs->config.index = false; - pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); + pr_warn("upper fs does not support file handles, falling back to index=off.\n"); } /* Check if upper fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) - ofs->xino_bits = 0; + ofs->xino_mode = -1; /* NFS export of r/w mount depends on index */ if (ofs->config.nfs_export && !ofs->config.index) { - pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); + pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } out: @@ -1165,11 +1166,11 @@ static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, err = -EINVAL; if (upperpath->mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + pr_err("workdir and upperdir must reside under the same mount\n"); goto out; } if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + pr_err("workdir and upperdir must be separate subtrees\n"); goto out; } @@ -1210,7 +1211,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, true); if (err) { - pr_err("overlayfs: failed to verify upper root origin\n"); + pr_err("failed to verify upper root origin\n"); goto out; } @@ -1233,18 +1234,18 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN, upperpath->dentry, true, false); if (err) - pr_err("overlayfs: failed to verify index dir 'origin' xattr\n"); + pr_err("failed to verify index dir 'origin' 
xattr\n"); } err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true); if (err) - pr_err("overlayfs: failed to verify index dir 'upper' xattr\n"); + pr_err("failed to verify index dir 'upper' xattr\n"); /* Cleanup bad/stale/orphan index entries */ if (!err) err = ovl_indexdir_cleanup(ofs); } if (err || !ofs->indexdir) - pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); out: mnt_drop_write(mnt); @@ -1255,17 +1256,22 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) { unsigned int i; - if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt)) + if (!ofs->config.nfs_export && !ofs->upper_mnt) return true; - for (i = 0; i < ofs->numlowerfs; i++) { + for (i = 0; i < ofs->numfs; i++) { /* * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long * as all lower layers with null uuid are on the same fs. + * if we detect multiple lower fs with the same uuid, we + * disable lower file handle decoding on all of them. */ - if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) + if (ofs->fs[i].is_lower && + uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) { + ofs->fs[i].bad_uuid = true; return false; + } } return true; } @@ -1277,53 +1283,79 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) unsigned int i; dev_t dev; int err; + bool bad_uuid = false; - /* fsid 0 is reserved for upper fs even with non upper overlay */ - if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) - return 0; - - for (i = 0; i < ofs->numlowerfs; i++) { - if (ofs->lower_fs[i].sb == sb) - return i + 1; + for (i = 0; i < ofs->numfs; i++) { + if (ofs->fs[i].sb == sb) + return i; } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { - ofs->config.index = false; - ofs->config.nfs_export = false; - pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", - uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", - path->dentry); + bad_uuid = true; + if (ofs->config.index || ofs->config.nfs_export) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? 
"null" : + "conflicting", + path->dentry); + } } err = get_anon_bdev(&dev); if (err) { - pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + pr_err("failed to get anonymous bdev for lowerpath\n"); return err; } - ofs->lower_fs[ofs->numlowerfs].sb = sb; - ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; - ofs->numlowerfs++; + ofs->fs[ofs->numfs].sb = sb; + ofs->fs[ofs->numfs].pseudo_dev = dev; + ofs->fs[ofs->numfs].bad_uuid = bad_uuid; - return ofs->numlowerfs; + return ofs->numfs++; } -static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, - struct path *stack, unsigned int numlower) +static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, + struct path *stack, unsigned int numlower) { int err; unsigned int i; + struct ovl_layer *layers; err = -ENOMEM; - ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer), - GFP_KERNEL); - if (ofs->lower_layers == NULL) + layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); + if (!layers) goto out; + ofs->layers = layers; - ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb), - GFP_KERNEL); - if (ofs->lower_fs == NULL) + ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL); + if (ofs->fs == NULL) + goto out; + + /* idx/fsid 0 are reserved for upper fs even with lower only overlay */ + ofs->numfs++; + + layers[0].mnt = ofs->upper_mnt; + layers[0].idx = 0; + layers[0].fsid = 0; + ofs->numlayer = 1; + + /* + * All lower layers that share the same fs as upper layer, use the same + * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower + * only overlay to simplify ovl_fs_free(). + * is_lower will be set if upper fs is shared with a lower layer. + */ + err = get_anon_bdev(&ofs->fs[0].pseudo_dev); + if (err) { + pr_err("failed to get anonymous bdev for upper fs\n"); goto out; + } + + if (ofs->upper_mnt) { + ofs->fs[0].sb = ofs->upper_mnt->mnt_sb; + ofs->fs[0].is_lower = false; + } for (i = 0; i < numlower; i++) { struct vfsmount *mnt; @@ -1347,7 +1379,7 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, mnt = clone_private_mount(&stack[i]); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); + pr_err("failed to clone lowerpath\n"); iput(trap); goto out; } @@ -1358,15 +1390,13 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, */ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; - ofs->lower_layers[ofs->numlower].trap = trap; - ofs->lower_layers[ofs->numlower].mnt = mnt; - ofs->lower_layers[ofs->numlower].idx = i + 1; - ofs->lower_layers[ofs->numlower].fsid = fsid; - if (fsid) { - ofs->lower_layers[ofs->numlower].fs = - &ofs->lower_fs[fsid - 1]; - } - ofs->numlower++; + layers[ofs->numlayer].trap = trap; + layers[ofs->numlayer].mnt = mnt; + layers[ofs->numlayer].idx = ofs->numlayer; + layers[ofs->numlayer].fsid = fsid; + layers[ofs->numlayer].fs = &ofs->fs[fsid]; + ofs->numlayer++; + ofs->fs[fsid].is_lower = true; } /* @@ -1377,22 +1407,23 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, * bits reserved for fsid, it emits a warning and uses the original * inode number. 
*/ - if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) { - ofs->xino_bits = 0; - ofs->config.xino = OVL_XINO_OFF; - } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) { + if (ofs->numfs - !ofs->upper_mnt == 1) { + if (ofs->config.xino == OVL_XINO_ON) + pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); + ofs->xino_mode = 0; + } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) { /* - * This is a roundup of number of bits needed for numlowerfs+1 - * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for - * upper fs even with non upper overlay. + * This is a roundup of number of bits needed for encoding + * fsid, where fsid 0 is reserved for upper fs even with + * lower only overlay. */ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); - ofs->xino_bits = ilog2(ofs->numlowerfs) + 1; + ofs->xino_mode = ilog2(ofs->numfs - 1) + 1; } - if (ofs->xino_bits) { - pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n", - ofs->xino_bits); + if (ofs->xino_mode > 0) { + pr_info("\"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_mode); } err = 0; @@ -1418,15 +1449,15 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { - pr_err("overlayfs: too many lower directories, limit is %d\n", + pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_err; } else if (!ofs->config.upperdir && stacklen == 1) { - pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); goto out_err; } else if (!ofs->config.upperdir && ofs->config.nfs_export && ofs->config.redirect_follow) { - pr_warn("overlayfs: NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } @@ -1449,11 +1480,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, err = -EINVAL; sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { - pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + pr_err("maximum fs stacking depth exceeded\n"); goto out_err; } - err = ovl_get_lower_layers(sb, ofs, stack, numlower); + err = ovl_get_layers(sb, ofs, stack, numlower); if (err) goto out_err; @@ -1464,7 +1495,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, for (i = 0; i < numlower; i++) { oe->lowerstack[i].dentry = dget(stack[i].dentry); - oe->lowerstack[i].layer = &ofs->lower_layers[i]; + oe->lowerstack[i].layer = &ofs->layers[i+1]; } if (remote) @@ -1505,7 +1536,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, while (!err && parent != next) { if (ovl_lookup_trap_inode(sb, parent)) { err = -ELOOP; - pr_err("overlayfs: overlapping %s path\n", name); + pr_err("overlapping %s path\n", name); } else if (ovl_is_inuse(parent)) { err = ovl_report_in_use(ofs, name); } @@ -1545,9 +1576,9 @@ static int ovl_check_overlapping_layers(struct super_block *sb, return err; } - for (i = 0; i < ofs->numlower; i++) { + for (i = 1; i < ofs->numlayer; i++) { err = ovl_check_layer(sb, ofs, - ofs->lower_layers[i].mnt->mnt_root, + ofs->layers[i].mnt->mnt_root, "lowerdir"); if (err) return err; @@ -1585,7 +1616,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = 
-EINVAL; if (!ofs->config.lowerdir) { if (!silent) - pr_err("overlayfs: missing 'lowerdir'\n"); + pr_err("missing 'lowerdir'\n"); goto out_err; } @@ -1593,14 +1624,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ if (ofs->config.xino != OVL_XINO_OFF) - ofs->xino_bits = BITS_PER_LONG - 32; + ofs->xino_mode = BITS_PER_LONG - 32; /* alloc/destroy_inode needed for setting up traps in inode cache */ sb->s_op = &ovl_super_operations; if (ofs->config.upperdir) { if (!ofs->config.workdir) { - pr_err("overlayfs: missing 'workdir'\n"); + pr_err("missing 'workdir'\n"); goto out_err; } @@ -1650,13 +1681,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs->indexdir) { ofs->config.index = false; if (ofs->upper_mnt && ofs->config.nfs_export) { - pr_warn("overlayfs: NFS export requires an index dir, falling back to nfs_export=off.\n"); + pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } } if (ofs->config.metacopy && ofs->config.nfs_export) { - pr_warn("overlayfs: NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); + pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } @@ -1739,9 +1770,15 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = register_filesystem(&ovl_fs_type); - if (err) - kmem_cache_destroy(ovl_inode_cachep); + err = ovl_aio_request_cache_init(); + if (!err) { + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; + + ovl_aio_request_cache_destroy(); + } + kmem_cache_destroy(ovl_inode_cachep); return err; } @@ -1756,7 +1793,7 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - + ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f5678a3f8350..ea005085803f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -40,18 +40,6 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } -struct super_block *ovl_same_sb(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - if (!ofs->numlowerfs) - return ofs->upper_mnt->mnt_sb; - else if (ofs->numlowerfs == 1 && !ofs->upper_mnt) - return ofs->lower_fs[0].sb; - else - return NULL; -} - /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. @@ -198,7 +186,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) return oe->numlower ? 
oe->lowerstack[0].dentry : NULL; } -struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; @@ -576,7 +564,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, err = ovl_do_setxattr(upperdentry, name, value, size, 0); if (err == -EOPNOTSUPP) { - pr_warn("overlayfs: cannot set %s xattr on upper\n", name); + pr_warn("cannot set %s xattr on upper\n", name); ofs->noxattr = true; return xerr; } @@ -700,7 +688,7 @@ static void ovl_cleanup_index(struct dentry *dentry) inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { - pr_warn_ratelimited("overlayfs: cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", + pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", upperdentry, inode->i_ino, inode->i_nlink); /* * We either have a bug with persistent union nlink or a lower @@ -739,7 +727,7 @@ out: return; fail: - pr_err("overlayfs: cleanup index of '%pd2' failed (%i)\n", dentry, err); + pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err); goto out; } @@ -830,7 +818,7 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) err_unlock: unlock_rename(workdir, upperdir); err: - pr_err("overlayfs: failed to lock workdir+upperdir\n"); + pr_err("failed to lock workdir+upperdir\n"); return -EIO; } @@ -852,7 +840,7 @@ int ovl_check_metacopy_xattr(struct dentry *dentry) return 1; out: - pr_warn_ratelimited("overlayfs: failed to get metacopy (%i)\n", res); + pr_warn_ratelimited("failed to get metacopy (%i)\n", res); return res; } @@ -899,7 +887,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, return res; fail: - pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + pr_warn_ratelimited("failed to get xattr %s: err=%zi)\n", name, res); kfree(buf); return res; @@ -931,7 +919,7 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) return buf; invalid: - pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); + pr_warn_ratelimited("invalid redirect (%s)\n", buf); res = -EINVAL; kfree(buf); return ERR_PTR(res); diff --git a/fs/pipe.c b/fs/pipe.c index 87109e761fa5..57502c3c0fba 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -364,17 +364,39 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; break; } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } __pipe_unlock(pipe); - if (was_full) { + + /* + * We only get here if we didn't actually read anything. + * + * However, we could have seen (and removed) a zero-sized + * pipe buffer, and might have made space in the buffers + * that way. + * + * You can't make zero-sized pipe buffers by doing an empty + * write (not even in packet mode), but they can happen if + * the writer gets an EFAULT when trying to fill a buffer + * that already got allocated and inserted in the buffer + * array. + * + * So we still need to wake up any pending writers in the + * _very_ unlikely case that the pipe was full, but we got + * no data. + */ + if (unlikely(was_full)) { wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - wait_event_interruptible(pipe->wait, pipe_readable(pipe)); + + /* + * But because we didn't read anything, at this point we can + * just return directly with -ERESTARTSYS if we're interrupted, + * since we've done any required wakeups and there's no need + * to mark anything accessed. 
And we've dropped the lock. + */ + if (wait_event_interruptible(pipe->wait, pipe_readable(pipe)) < 0) + return -ERESTARTSYS; + __pipe_lock(pipe); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); } @@ -559,7 +581,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } wait_event_interruptible(pipe->wait, pipe_writable(pipe)); __pipe_lock(pipe); - was_empty = pipe_empty(head, pipe->tail); + was_empty = pipe_empty(pipe->head, pipe->tail); } out: __pipe_unlock(pipe); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84ad1c90d535..249672bf54fe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -631,12 +631,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl + * @inode: target inode + * @mode_p: mode (pointer) for update + * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new - * file mode, set *acl to NULL to indicate that no ACL should be set. + * file mode, set *@acl to NULL to indicate that no ACL should be set. * - * As with chmod, clear the setgit bit if the caller is not in the owning group + * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * * Called from set_acl inode operations. diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 733881a6387b..27ef84d99f59 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -103,3 +103,7 @@ config PROC_CHILDREN config PROC_PID_ARCH_STATUS def_bool n depends on PROC_FS + +config PROC_CPU_RESCTRL + def_bool n + depends on PROC_FS diff --git a/fs/proc/base.c b/fs/proc/base.c index ebea9501afb8..c7c64272b0fa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,8 @@ #include <linux/sched/debug.h> #include <linux/sched/stat.h> #include <linux/posix-timers.h> +#include <linux/time_namespace.h> +#include <linux/resctrl.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -1533,6 +1535,96 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_TIME_NS +static int timens_offsets_show(struct seq_file *m, void *v) +{ + struct task_struct *p; + + p = get_proc_task(file_inode(m->file)); + if (!p) + return -ESRCH; + proc_timens_show_offsets(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t timens_offsets_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(file); + struct proc_timens_offset offsets[2]; + char *kbuf = NULL, *pos, *next_line; + struct task_struct *p; + int ret, noffsets; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Parse the user data */ + ret = -EINVAL; + noffsets = 0; + for (pos = kbuf; pos; pos = next_line) { + struct proc_timens_offset *off = &offsets[noffsets]; + int err; + + /* Find the end of line and ensure we don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; + } + + err = sscanf(pos, "%u %lld %lu", &off->clockid, + &off->val.tv_sec, &off->val.tv_nsec); + if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) + goto out; + noffsets++; + if (noffsets == ARRAY_SIZE(offsets)) { + if (next_line) + count 
= next_line - kbuf; + break; + } + } + + ret = -ESRCH; + p = get_proc_task(inode); + if (!p) + goto out; + ret = proc_timens_set_offset(file, p, offsets, noffsets); + put_task_struct(p); + if (ret) + goto out; + + ret = count; +out: + kfree(kbuf); + return ret; +} + +static int timens_offsets_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timens_offsets_show, inode); +} + +static const struct file_operations proc_timens_offsets_operations = { + .open = timens_offsets_open, + .read = seq_read, + .write = timens_offsets_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_TIME_NS */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -1626,8 +1718,7 @@ static const char *proc_pid_get_link(struct dentry *dentry, if (error) goto out; - nd_jump_link(&path); - return NULL; + error = nd_jump_link(&path); out: return ERR_PTR(error); } @@ -3016,6 +3107,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif +#ifdef CONFIG_TIME_NS + REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), +#endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), @@ -3061,6 +3155,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), +#endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), @@ -3461,6 +3558,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), +#endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 96f1087e372c..c1dea9b8222e 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -16,16 +16,16 @@ static int cpuinfo_open(struct inode *inode, struct file *file) return seq_open(file, &cpuinfo_op); } -static const struct file_operations proc_cpuinfo_operations = { - .open = cpuinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops cpuinfo_proc_ops = { + .proc_open = cpuinfo_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int __init proc_cpuinfo_init(void) { - proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops); return 0; } fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 074e9585c699..3faed94e4b65 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -473,7 +473,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; - ent->proc_fops = &proc_dir_operations; + ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; ent = proc_register(parent, ent); } @@ -503,7 +503,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) 
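[Illustrative sketch, not part of the patch] The fs/proc hunks above replace struct file_operations with the new struct proc_ops (proc_open/proc_read/proc_lseek/proc_release) for /proc entries. A minimal out-of-tree module sketch using the converted interface, assuming a tree with this series applied; the entry name and show function are invented for illustration:

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from a proc_ops based entry\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

/* struct proc_ops replaces struct file_operations for /proc entries */
static const struct proc_ops demo_proc_ops = {
	.proc_open	= demo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static int __init demo_init(void)
{
	if (!proc_create("proc_ops_demo", 0444, NULL, &demo_proc_ops))
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("proc_ops_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The field layout mirrors proc_single_ops in the generic.c hunk above; only the callbacks a /proc entry actually needs are carried.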
ent = __proc_create(&parent, name, mode, 2); if (ent) { ent->data = NULL; - ent->proc_fops = NULL; + ent->proc_dir_ops = NULL; ent->proc_iops = NULL; ent = proc_register(parent, ent); } @@ -533,25 +533,23 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops, void *data) + const struct proc_ops *proc_ops, void *data) { struct proc_dir_entry *p; - BUG_ON(proc_fops == NULL); - p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = proc_fops; + p->proc_ops = proc_ops; return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops) + const struct proc_ops *proc_ops) { - return proc_create_data(name, mode, parent, proc_fops, NULL); + return proc_create_data(name, mode, parent, proc_ops, NULL); } EXPORT_SYMBOL(proc_create); @@ -573,11 +571,11 @@ static int proc_seq_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations proc_seq_fops = { - .open = proc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_seq_release, +static const struct proc_ops proc_seq_ops = { + .proc_open = proc_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = proc_seq_release, }; struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, @@ -589,7 +587,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = &proc_seq_fops; + p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); @@ -603,11 +601,11 @@ static int proc_single_open(struct inode *inode, struct file *file) return single_open(file, de->single_show, de->data); } -static const struct file_operations proc_single_fops = { - .open = proc_single_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops proc_single_ops = { + .proc_open = proc_single_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, @@ -619,7 +617,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = &proc_single_fops; + p->proc_ops = &proc_single_ops; p->single_show = show; return proc_register(parent, p); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index dbe43a50caf2..6da18316d209 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -163,7 +163,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; - pde->proc_fops->release(file_inode(file), file); + pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* After ->release. 
*/ list_del(&pdeo->lh); @@ -200,12 +200,12 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (use_pde(pde)) { - typeof_member(struct file_operations, llseek) llseek; + typeof_member(struct proc_ops, proc_lseek) lseek; - llseek = pde->proc_fops->llseek; - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + rv = lseek(file, offset, whence); unuse_pde(pde); } return rv; @@ -216,9 +216,9 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, read) read; + typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_fops->read; + read = pde->proc_ops->proc_read; if (read) rv = read(file, buf, count, ppos); unuse_pde(pde); @@ -231,9 +231,9 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, write) write; + typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_fops->write; + write = pde->proc_ops->proc_write; if (write) rv = write(file, buf, count, ppos); unuse_pde(pde); @@ -246,9 +246,9 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (use_pde(pde)) { - typeof_member(struct file_operations, poll) poll; + typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_fops->poll; + poll = pde->proc_ops->proc_poll; if (poll) rv = poll(file, pts); unuse_pde(pde); @@ -261,9 +261,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (use_pde(pde)) { - typeof_member(struct file_operations, unlocked_ioctl) ioctl; + typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_fops->unlocked_ioctl; + ioctl = pde->proc_ops->proc_ioctl; if (ioctl) rv = ioctl(file, cmd, arg); unuse_pde(pde); @@ -277,9 +277,9 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (use_pde(pde)) { - typeof_member(struct file_operations, compat_ioctl) compat_ioctl; + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - compat_ioctl = pde->proc_fops->compat_ioctl; + compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) rv = compat_ioctl(file, cmd, arg); unuse_pde(pde); @@ -293,9 +293,9 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, mmap) mmap; + typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_fops->mmap; + mmap = pde->proc_ops->proc_mmap; if (mmap) rv = mmap(file, vma); unuse_pde(pde); @@ -312,9 +312,9 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, get_unmapped_area) get_area; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - get_area = pde->proc_fops->get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; 
#ifdef CONFIG_MMU if (!get_area) get_area = current->mm->get_unmapped_area; @@ -333,8 +333,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; - typeof_member(struct file_operations, open) open; - typeof_member(struct file_operations, release) release; + typeof_member(struct proc_ops, proc_open) open; + typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; /* @@ -351,7 +351,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!use_pde(pde)) return -ENOENT; - release = pde->proc_fops->release; + release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { @@ -360,7 +360,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) } } - open = pde->proc_fops->open; + open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); @@ -468,21 +468,23 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); - WARN_ON(!de->proc_iops); - inode->i_op = de->proc_iops; - if (de->proc_fops) { - if (S_ISREG(inode->i_mode)) { + + if (S_ISREG(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (!de->proc_fops->compat_ioctl) - inode->i_fop = - &proc_reg_file_ops_no_compat; - else -#endif - inode->i_fop = &proc_reg_file_ops; - } else { - inode->i_fop = de->proc_fops; + if (!de->proc_ops->proc_compat_ioctl) { + inode->i_fop = &proc_reg_file_ops_no_compat; } - } +#endif + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = de->proc_dir_ops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = NULL; + } else + BUG(); } else pde_put(de); return inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0f3b557c9b77..41587276798e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -39,7 +39,10 @@ struct proc_dir_entry { spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; + union { + const struct proc_ops *proc_ops; + const struct file_operations *proc_dir_ops; + }; const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e2ed8e08cc7a..8ba492d44e68 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -574,11 +574,11 @@ static int release_kcore(struct inode *inode, struct file *file) return 0; } -static const struct file_operations proc_kcore_operations = { - .read = read_kcore, - .open = open_kcore, - .release = release_kcore, - .llseek = default_llseek, +static const struct proc_ops kcore_proc_ops = { + .proc_read = read_kcore, + .proc_open = open_kcore, + .proc_release = release_kcore, + .proc_lseek = default_llseek, }; /* just remember that we have to update kcore */ @@ -637,8 +637,7 @@ static void __init add_modules_range(void) static int __init proc_kcore_init(void) { - proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, - &proc_kcore_operations); + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops); if (!proc_root_kcore) { pr_err("couldn't create /proc/kcore\n"); return 0; /* Always returns 0. 
*/ diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 4f4a2abb225e..ec1b7d2fb773 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -49,17 +49,17 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) } -static const struct file_operations proc_kmsg_operations = { - .read = kmsg_read, - .poll = kmsg_poll, - .open = kmsg_open, - .release = kmsg_release, - .llseek = generic_file_llseek, +static const struct proc_ops kmsg_proc_ops = { + .proc_read = kmsg_read, + .proc_poll = kmsg_poll, + .proc_open = kmsg_open, + .proc_release = kmsg_release, + .proc_lseek = generic_file_llseek, }; static int __init proc_kmsg_init(void) { - proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops); return 0; } fs_initcall(proc_kmsg_init); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..8e159fc78c0a 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, @@ -42,22 +46,26 @@ static const char *proc_ns_get_link(struct dentry *dentry, const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; - void *error = ERR_PTR(-EACCES); + int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); task = get_proc_task(inode); if (!task) - return error; + return ERR_PTR(-EACCES); - if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { - error = ns_get_path(&ns_path, task, ns_ops); - if (!error) - nd_jump_link(&ns_path); - } + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto out; + + error = ns_get_path(&ns_path, task, ns_ops); + if (error) + goto out; + + error = nd_jump_link(&ns_path); +out: put_task_struct(task); - return error; + return ERR_PTR(error); } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) diff --git a/fs/proc/page.c b/fs/proc/page.c index 7c952ee732e6..f909243d4a66 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -21,6 +21,21 @@ #define KPMMASK (KPMSIZE - 1) #define KPMBITS (KPMSIZE * BITS_PER_BYTE) +static inline unsigned long get_max_dump_pfn(void) +{ +#ifdef CONFIG_SPARSEMEM + /* + * The memmap of early sections is completely populated and marked + * online even if max_pfn does not fall on a section boundary - + * pfn_to_online_page() will succeed on all pages. Allow inspecting + * these memmaps. 
+ */ + return round_up(max_pfn, PAGES_PER_SECTION); +#else + return max_pfn; +#endif +} + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -29,6 +44,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -37,9 +53,11 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, u64 pcount; pfn = src / KPMSIZE; - count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -71,9 +89,9 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpagecount_operations = { - .llseek = mem_lseek, - .read = kpagecount_read, +static const struct proc_ops kpagecount_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpagecount_read, }; /* /proc/kpageflags - an array exposing page flags @@ -206,6 +224,7 @@ u64 stable_page_flags(struct page *page) static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -213,9 +232,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, ssize_t ret = 0; pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -242,15 +263,16 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpageflags_operations = { - .llseek = mem_lseek, - .read = kpageflags_read, +static const struct proc_ops kpageflags_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpageflags_read, }; #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -259,9 +281,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, u64 ino; pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -293,18 +317,18 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpagecgroup_operations = { - .llseek = mem_lseek, - .read = kpagecgroup_read, +static const struct proc_ops kpagecgroup_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpagecgroup_read, }; #endif /* CONFIG_MEMCG */ static int __init proc_page_init(void) { - proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); - proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops); + proc_create("kpageflags", S_IRUSR, NULL, 
&kpageflags_proc_ops); #ifdef CONFIG_MEMCG - proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); + proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops); #endif return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 76ae278df1c4..4888c5224442 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -90,12 +90,12 @@ static int seq_release_net(struct inode *ino, struct file *f) return 0; } -static const struct file_operations proc_net_seq_fops = { - .open = seq_open_net, - .read = seq_read, - .write = proc_simple_write, - .llseek = seq_lseek, - .release = seq_release_net, +static const struct proc_ops proc_net_seq_ops = { + .proc_open = seq_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release_net, }; struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, @@ -108,7 +108,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_seq_fops; + p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); @@ -152,7 +152,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_seq_fops; + p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; p->write = write; @@ -183,12 +183,12 @@ static int single_release_net(struct inode *ino, struct file *f) return single_release(ino, f); } -static const struct file_operations proc_net_single_fops = { - .open = single_open_net, - .read = seq_read, - .write = proc_simple_write, - .llseek = seq_lseek, - .release = single_release_net, +static const struct proc_ops proc_net_single_ops = { + .proc_open = single_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = single_release_net, }; struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, @@ -201,7 +201,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_single_fops; + p->proc_ops = &proc_net_single_ops; p->single_show = show; return proc_register(parent, p); } @@ -244,7 +244,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_single_fops; + p->proc_ops = &proc_net_single_ops; p->single_show = show; p->write = write; return proc_register(parent, p); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d80989b6c344..c75bb4632ed1 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1720,7 +1720,7 @@ int __init proc_sys_init(void) proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; - proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return sysctl_init(); diff --git a/fs/proc/root.c b/fs/proc/root.c index 0b7c8dffc9ae..72c07a34cff0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -292,7 +292,7 @@ struct proc_dir_entry proc_root = { .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, - .proc_fops = &proc_root_operations, + .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", diff --git 
a/fs/proc/stat.c b/fs/proc/stat.c index 37bdbec5b402..0449edf460f5 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -134,7 +134,7 @@ static int show_stat(struct seq_file *p, void *v) softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; - guest_nice += cpustat[CPUTIME_USER]; + guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -175,7 +175,7 @@ static int show_stat(struct seq_file *p, void *v) softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; - guest_nice = cpustat[CPUTIME_USER]; + guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); @@ -223,16 +223,16 @@ static int stat_open(struct inode *inode, struct file *file) return single_open_size(file, show_stat, NULL, size); } -static const struct file_operations proc_stat_operations = { - .open = stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops stat_proc_ops = { + .proc_open = stat_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static int __init proc_stat_init(void) { - proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9442631fd4af..3ba9ae83bff5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -505,7 +505,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; @@ -1282,7 +1282,7 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, } static int pagemap_pte_hole(unsigned long start, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index a4c2791ab70b..5a1b228964fb 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/time.h> +#include <linux/time_namespace.h> #include <linux/kernel_stat.h> static int uptime_proc_show(struct seq_file *m, void *v) @@ -20,6 +21,8 @@ static int uptime_proc_show(struct seq_file *m, void *v) nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7b13988796e1..7dc800cce354 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -667,10 +667,10 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) } #endif -static const struct file_operations proc_vmcore_operations = { - .read = read_vmcore, - .llseek = default_llseek, - .mmap = mmap_vmcore, +static const struct proc_ops vmcore_proc_ops = { + .proc_read = read_vmcore, + .proc_lseek = default_llseek, + .proc_mmap = mmap_vmcore, }; static struct vmcore* __init get_new_element(void) @@ -1555,7 +1555,7 @@ static int __init vmcore_init(void) elfcorehdr_free(elfcorehdr_addr); elfcorehdr_addr = 
ELFCORE_ADDR_ERR; - proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops); if (proc_vmcore) proc_vmcore->size = vmcore_size; return 0; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 8caff834f002..013486b5125e 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -407,6 +407,17 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) prz = cxt->dprzs[cxt->dump_write_cnt]; + /* + * Since this is a new crash dump, we need to reset the buffer in + * case it still has an old dump present. Without this, the new dump + * will get appended, which would seriously confuse anything trying + * to check dump file contents. Specifically, ramoops_read_kmsg_hdr() + * expects to find a dump header in the beginning of buffer data, so + * we must to reset the buffer values, in order to ensure that the + * header will be written to the beginning of the buffer. + */ + persistent_ram_zap(prz); + /* Build header and append record contents. */ hlen = ramoops_write_kmsg_hdr(prz, record); if (!hlen) @@ -572,6 +583,7 @@ static int ramoops_init_przs(const char *name, prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig, &cxt->ecc_info, cxt->memtype, flags, label); + kfree(label); if (IS_ERR(prz_ar[i])) { err = PTR_ERR(prz_ar[i]); dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", @@ -617,6 +629,7 @@ static int ramoops_init_prz(const char *name, label = kasprintf(GFP_KERNEL, "ramoops:%s", name); *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype, PRZ_FLAG_ZAP_OLD, label); + kfree(label); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 8823f65888f0..1f4d8c06f9be 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -574,7 +574,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, /* Initialize general buffer state. */ raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; - prz->label = label; + prz->label = kstrdup(label, GFP_KERNEL); ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b0688c02dc90..b6a4f692d345 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -984,6 +984,7 @@ static int add_dquot_ref(struct super_block *sb, int type) * later. 
*/ old_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 53429c29c784..58fc2a7c7fd1 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -22,8 +22,6 @@ MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota format v2 support"); MODULE_LICENSE("GPL"); -#define __QUOTA_V2_PARANOIA - static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); static int v2r0_is_id(void *dp, struct dquot *dquot); diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h index bd11e2c08119..31dca9a89176 100644 --- a/fs/quota/quotaio_v1.h +++ b/fs/quota/quotaio_v1.h @@ -25,8 +25,10 @@ struct v1_disk_dqblk { __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ __u32 dqb_isoftlimit; /* preferred inode limit */ __u32 dqb_curinodes; /* current # allocated inodes */ - time_t dqb_btime; /* time limit for excessive disk use */ - time_t dqb_itime; /* time limit for excessive inode use */ + + /* below fields differ in length on 32-bit vs 64-bit architectures */ + unsigned long dqb_btime; /* time limit for excessive disk use */ + unsigned long dqb_itime; /* time limit for excessive inode use */ }; #define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) diff --git a/fs/read_write.c b/fs/read_write.c index 5bbf587f5bc1..59d819c5b92e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -939,6 +939,34 @@ out: return ret; } +ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + size_t tot_len; + ssize_t ret = 0; + + if (!file->f_op->read_iter) + return -EINVAL; + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + goto out; + ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); + if (ret < 0) + return ret; + + ret = call_read_iter(file, iocb, iter); +out: + if (ret >= 0) + fsnotify_access(file); + return ret; +} +EXPORT_SYMBOL(vfs_iocb_iter_read); + ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { @@ -975,6 +1003,34 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, return ret; } +ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + size_t tot_len; + ssize_t ret = 0; + + if (!file->f_op->write_iter) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + return 0; + ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); + if (ret < 0) + return ret; + + ret = call_write_iter(file, iocb, iter); + if (ret > 0) + fsnotify_modify(file); + + return ret; +} +EXPORT_SYMBOL(vfs_iocb_iter_write); + ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { @@ -1777,10 +1833,9 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, * else. Assume that the offsets have already been checked for block * alignment. * - * For deduplication we always scale down to the previous block because we - * can't meaningfully compare post-EOF contents. - * - * For clone we only link a partial EOF block above the destination file's EOF. + * For clone we only link a partial EOF block above or at the destination file's + * EOF. 
For deduplication we accept a partial EOF block only if it ends at the + * destination file's EOF (can not link it into the middle of a file). * * Shorten the request if possible. */ @@ -1796,8 +1851,7 @@ static int generic_remap_check_len(struct inode *inode_in, if ((*len & blkmask) == 0) return 0; - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) + if (pos_out + *len < i_size_read(inode_out)) new_len &= ~blkmask; if (new_len == *len) diff --git a/fs/readdir.c b/fs/readdir.c index d26d5ea4de7b..de2eceffdee8 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -102,10 +102,14 @@ EXPORT_SYMBOL(iterate_dir); * filename length, and the above "soft error" worry means * that it's probably better left alone until we have that * issue clarified. + * + * Note the PATH_MAX check - it's arbitrary but the real + * kernel limit on a possible path component, not NAME_MAX, + * which is the technical standard limit. */ static int verify_dirent_name(const char *name, int len) { - if (!len) + if (len <= 0 || len >= PATH_MAX) return -EIO; if (memchr(name, '/', len)) return -EIO; @@ -206,7 +210,7 @@ struct linux_dirent { struct getdents_callback { struct dir_context ctx; struct linux_dirent __user * current_dir; - struct linux_dirent __user * previous; + int prev_reclen; int count; int error; }; @@ -214,12 +218,13 @@ struct getdents_callback { static int filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct linux_dirent __user * dirent; + struct linux_dirent __user *dirent, *prev; struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); unsigned long d_ino; int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); + int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) @@ -232,28 +237,24 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = -EOVERFLOW; return -EOVERFLOW; } - dirent = buf->previous; - if (dirent && signal_pending(current)) + prev_reclen = buf->prev_reclen; + if (prev_reclen && signal_pending(current)) return -EINTR; - - /* - * Note! This range-checks 'previous' (which may be NULL). 
- * The real range was checked in getdents - */ - if (!user_access_begin(dirent, sizeof(*dirent))) - goto efault; - if (dirent) - unsafe_put_user(offset, &dirent->d_off, efault_end); dirent = buf->current_dir; + prev = (void __user *) dirent - prev_reclen; + if (!user_access_begin(prev, reclen + prev_reclen)) + goto efault; + + /* This might be 'dirent->d_off', but if so it will get overwritten */ + unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_access_end(); - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; + buf->current_dir = (void __user *)dirent + reclen; + buf->prev_reclen = reclen; buf->count -= reclen; return 0; efault_end: @@ -267,7 +268,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { struct fd f; - struct linux_dirent __user * lastdirent; struct getdents_callback buf = { .ctx.actor = filldir, .count = count, @@ -285,8 +285,10 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { + if (buf.prev_reclen) { + struct linux_dirent __user * lastdirent; + lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else @@ -299,7 +301,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; - struct linux_dirent64 __user * previous; + int prev_reclen; int count; int error; }; @@ -307,11 +309,12 @@ struct getdents_callback64 { static int filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct linux_dirent64 __user *dirent; + struct linux_dirent64 __user *dirent, *prev; struct getdents_callback64 *buf = container_of(ctx, struct getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); + int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) @@ -319,30 +322,27 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; - dirent = buf->previous; - if (dirent && signal_pending(current)) + prev_reclen = buf->prev_reclen; + if (prev_reclen && signal_pending(current)) return -EINTR; - - /* - * Note! This range-checks 'previous' (which may be NULL). 
- * The real range was checked in getdents - */ - if (!user_access_begin(dirent, sizeof(*dirent))) - goto efault; - if (dirent) - unsafe_put_user(offset, &dirent->d_off, efault_end); dirent = buf->current_dir; + prev = (void __user *)dirent - prev_reclen; + if (!user_access_begin(prev, reclen + prev_reclen)) + goto efault; + + /* This might be 'dirent->d_off', but if so it will get overwritten */ + unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, &dirent->d_type, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_access_end(); - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; + buf->prev_reclen = reclen; + buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return 0; + efault_end: user_access_end(); efault: @@ -354,7 +354,6 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, unsigned int count) { struct fd f; - struct linux_dirent64 __user * lastdirent; struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, @@ -372,9 +371,11 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { + if (buf.prev_reclen) { + struct linux_dirent64 __user * lastdirent; typeof(lastdirent->d_off) d_off = buf.ctx.pos; + + lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; if (__put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4b3e3e73b512..072156c4f895 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -56,8 +56,6 @@ /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ j_list)) -#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ - j_working_list)) /* must be correct to keep the desc and commit structs at 4k */ #define JOURNAL_TRANS_HALF 1018 diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index f2cf3441fdfc..ff336513c254 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -63,7 +63,6 @@ static int show_version(struct seq_file *m, void *unused) #define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) #define DJF( x ) le32_to_cpu( rs -> x ) -#define DJV( x ) le32_to_cpu( s_v1 -> x ) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index da9ebe33882b..8bf88d690729 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -918,12 +918,6 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path) return memcmp(stored_ih, ih, IH_SIZE); } -/* unformatted nodes are not logged anymore, ever. 
This is safe now */ -#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) - -/* block can not be forgotten as it is in I/O or held by someone */ -#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) - /* prepare for delete or cut of direct item */ static inline int prepare_for_direct_item(struct treepath *path, struct item_head *le_ih, @@ -2246,7 +2240,8 @@ error_out: /* also releases the path */ unfix_nodes(&s_ins_balance); #ifdef REISERQUOTA_DEBUG - reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + if (inode) + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3244037b1286..a6bce5b1fb1d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -629,6 +629,7 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); destroy_workqueue(REISERFS_SB(s)->commit_wq); + kfree(REISERFS_SB(s)->s_jdev); kfree(s->s_fs_info); s->s_fs_info = NULL; } @@ -1947,7 +1948,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (!sbi->s_jdev) { SWARN(silent, s, "", "Cannot allocate memory for " "journal device name"); - goto error; + goto error_unlocked; } } #ifdef CONFIG_QUOTA @@ -2240,6 +2241,7 @@ error_unlocked: kfree(qf_names[j]); } #endif + kfree(sbi->s_jdev); kfree(sbi); s->s_fs_info = NULL; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 62b40df36c98..28b241cd6987 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -319,8 +319,12 @@ static int reiserfs_for_each_xattr(struct inode *inode, out_dir: dput(dir); out: - /* -ENODATA isn't an error */ - if (err == -ENODATA) + /* + * -ENODATA: this object doesn't have any xattrs + * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk. + * Neither are errors + */ + if (err == -ENODATA || err == -EOPNOTSUPP) err = 0; return err; } diff --git a/fs/stack.c b/fs/stack.c index 4ef2c056269d..c9830924eb12 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -23,7 +23,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) /* * But on 32-bit, we ought to make an effort to keep the two halves of - * i_blocks in sync despite SMP or PREEMPT - though stat's + * i_blocks in sync despite SMP or PREEMPTION - though stat's * generic_fillattr() doesn't bother, and we won't be applying quotas * (where i_blocks does become important) at the upper level. * @@ -38,14 +38,14 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) spin_unlock(&src->i_lock); /* - * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for * fsstack_copy_inode_size() to hold some lock around * i_size_write(), otherwise i_size_read() may spin forever (see * include/linux/fs.h). We don't necessarily hold i_mutex when this * is called, so take i_lock for that case. * * And if on 32-bit, continue our effort to keep the two halves of - * i_blocks in sync despite SMP or PREEMPT: use i_lock for that case + * i_blocks in sync despite SMP or PREEMPTION: use i_lock for that case * too, and do both at once by combining the tests. * * There is none of this locking overhead in the 64-bit case. 
diff --git a/fs/stat.c b/fs/stat.c index c38e4c2e1221..030008796479 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -21,6 +21,8 @@ #include <linux/uaccess.h> #include <asm/unistd.h> +#include "internal.h" + /** * generic_fillattr - Fill in the basic attributes from the inode struct * @inode: Inode to use as the source @@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat, } EXPORT_SYMBOL(vfs_statx_fd); +inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags) +{ + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + return -EINVAL; + + *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + *lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_NO_AUTOMOUNT) + *lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) + *lookup_flags |= LOOKUP_EMPTY; + + return 0; +} + /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename @@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags, { struct path path; int error = -EINVAL; - unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + unsigned lookup_flags; - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | - AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + if (vfs_stat_set_lookup_flags(&lookup_flags, flags)) return -EINVAL; - - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ -static noinline_for_stack int +noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; diff --git a/fs/super.c b/fs/super.c index cfadab2cbf35..cd352530eca9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -448,10 +448,12 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - fsnotify_sb_delete(sb); cgroup_writeback_umount(); + /* evict all inodes with zero refcount */ evict_inodes(sb); + /* only nonzero refcount inodes can have marks */ + fsnotify_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index d41c21fef138..c4ab045926b7 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -449,7 +449,7 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, } link = kernfs_create_link(kobj->sd, target_name, entry); - if (IS_ERR(link) && PTR_ERR(link) == -EEXIST) + if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, target_name); kernfs_put(entry); diff --git a/fs/timerfd.c b/fs/timerfd.c index ac7f59a58f94..c5509d2448e3 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -26,6 +26,7 @@ #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/rcupdate.h> +#include <linux/time_namespace.h> struct timerfd_ctx { union { @@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, } if (texp != 0) { + if (flags & TFD_TIMER_ABSTIME) + texp = timens_ktime_to_host(clockid, texp); if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 69932bcfa920..45d3d207fb99 100644 --- a/fs/ubifs/Kconfig +++ 
b/fs/ubifs/Kconfig @@ -12,6 +12,7 @@ config UBIFS_FS select CRYPTO_ZSTD if UBIFS_FS_ZSTD select CRYPTO_HASH_INFO select UBIFS_FS_XATTR if FS_ENCRYPTION + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION depends on MTD_UBI help UBIFS is a file system for flash devices which works on top of UBI. diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 0b98e3c8b461..ef85ec167a84 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -81,7 +81,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, struct ubifs_inode *ui; bool encrypted = false; - if (ubifs_crypt_is_encrypted(dir)) { + if (IS_ENCRYPTED(dir)) { err = fscrypt_get_encryption_info(dir); if (err) { ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err); @@ -225,9 +225,9 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, goto done; } - if (nm.hash) { - ubifs_assert(c, fname_len(&nm) == 0); - ubifs_assert(c, fname_name(&nm) == NULL); + if (fname_name(&nm) == NULL) { + if (nm.hash & ~UBIFS_S_KEY_HASH_MASK) + goto done; /* ENOENT */ dent_key_init_hash(c, &key, dir->i_ino, nm.hash); err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash); } else { @@ -261,7 +261,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, goto done; } - if (ubifs_crypt_is_encrypted(dir) && + if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", @@ -499,7 +499,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; - bool encrypted = ubifs_crypt_is_encrypted(dir); + bool encrypted = IS_ENCRYPTED(dir); dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); @@ -512,7 +512,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) if (encrypted) { err = fscrypt_get_encryption_info(dir); - if (err && err != -ENOKEY) + if (err) return err; err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr); @@ -1618,7 +1618,7 @@ int ubifs_getattr(const struct path *path, struct kstat *stat, static int ubifs_dir_open(struct inode *dir, struct file *file) { - if (ubifs_crypt_is_encrypted(dir)) + if (IS_ENCRYPTED(dir)) return fscrypt_get_encryption_info(dir) ? 
-EACCES : 0; return 0; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 91362079f82a..743928efffc1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -67,7 +67,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; - if (ubifs_crypt_is_encrypted(inode)) { + if (IS_ENCRYPTED(inode)) { err = ubifs_decrypt(inode, dn, &dlen, block); if (err) goto dump; @@ -647,7 +647,7 @@ static int populate_page(struct ubifs_info *c, struct page *page, dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; out_len = UBIFS_BLOCK_SIZE; - if (ubifs_crypt_is_encrypted(inode)) { + if (IS_ENCRYPTED(inode)) { err = ubifs_decrypt(inode, dn, &dlen, page_block); if (err) goto out_err; @@ -786,7 +786,9 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, if (page_offset > end_index) break; - page = find_or_create_page(mapping, page_offset, ra_gfp_mask); + page = pagecache_get_page(mapping, page_offset, + FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT, + ra_gfp_mask); if (!page) break; if (!PageUptodate(page)) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 5dc5abca11c7..d49fc04f2d7d 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -17,10 +17,14 @@ #include "ubifs.h" /* Need to be kept consistent with checked flags in ioctl2ubifs() */ -#define UBIFS_SUPPORTED_IOCTL_FLAGS \ +#define UBIFS_SETTABLE_IOCTL_FLAGS \ (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \ FS_IMMUTABLE_FL | FS_DIRSYNC_FL) +/* Need to be kept consistent with checked flags in ubifs2ioctl() */ +#define UBIFS_GETTABLE_IOCTL_FLAGS \ + (UBIFS_SETTABLE_IOCTL_FLAGS | FS_ENCRYPT_FL) + /** * ubifs_set_inode_flags - set VFS inode flags. * @inode: VFS inode to set flags for @@ -91,6 +95,8 @@ static int ubifs2ioctl(int ubifs_flags) ioctl_flags |= FS_IMMUTABLE_FL; if (ubifs_flags & UBIFS_DIRSYNC_FL) ioctl_flags |= FS_DIRSYNC_FL; + if (ubifs_flags & UBIFS_CRYPT_FL) + ioctl_flags |= FS_ENCRYPT_FL; return ioctl_flags; } @@ -113,7 +119,8 @@ static int setflags(struct inode *inode, int flags) if (err) goto out_unlock; - ui->flags = ioctl2ubifs(flags); + ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS); + ui->flags |= ioctl2ubifs(flags); ubifs_set_inode_flags(inode); inode->i_ctime = current_time(inode); release = ui->dirty; @@ -155,8 +162,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS) + if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS) return -EOPNOTSUPP; + flags &= UBIFS_SETTABLE_IOCTL_FLAGS; if (!S_ISDIR(inode->i_mode)) flags &= ~FS_DIRSYNC_FL; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 388fe8f5dc51..3bf8b1fda9d7 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -588,7 +588,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (!xent) { dent->ch.node_type = UBIFS_DENT_NODE; - if (nm->hash) + if (fname_name(nm) == NULL) dent_key_init_hash(c, &dent_key, dir->i_ino, nm->hash); else dent_key_init(c, &dent_key, dir->i_ino, nm); @@ -646,7 +646,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, ubifs_add_auth_dirt(c, lnum); if (deletion) { - if (nm->hash) + if (fname_name(nm) == NULL) err = ubifs_tnc_remove_dh(c, &dent_key, nm->minor_hash); else err = ubifs_tnc_remove_nm(c, &dent_key, nm); @@ -727,7 +727,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; int write_len; struct ubifs_inode *ui 
= ubifs_inode(inode); - bool encrypted = ubifs_crypt_is_encrypted(inode); + bool encrypted = IS_ENCRYPTED(inode); u8 hash[UBIFS_HASH_ARR_SZ]; dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", @@ -1449,7 +1449,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); - if (ubifs_crypt_is_encrypted(inode)) { + if (IS_ENCRYPTED(inode)) { err = ubifs_decrypt(inode, dn, &dlen, block); if (err) goto out; @@ -1465,7 +1465,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); } - if (ubifs_crypt_is_encrypted(inode)) { + if (IS_ENCRYPTED(inode)) { err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); if (err) goto out; diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index afa704ff5ca0..8142d9d6fe5d 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -150,7 +150,6 @@ static inline void dent_key_init(const struct ubifs_info *c, uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(c, !(hash & ~UBIFS_S_KEY_HASH_MASK)); - ubifs_assert(c, !nm->hash && !nm->minor_hash); key->u32[0] = inum; key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 54d6db61106f..edf43ddd7dce 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -129,7 +129,7 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o) static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) { if (orph->del) { - dbg_gen("deleted twice ino %lu", orph->inum); + dbg_gen("deleted twice ino %lu", (unsigned long)orph->inum); return; } @@ -137,7 +137,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) orph->del = 1; orph->dnext = c->orph_dnext; c->orph_dnext = orph; - dbg_gen("delete later ino %lu", orph->inum); + dbg_gen("delete later ino %lu", (unsigned long)orph->inum); return; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d550be4c11..4b4b65b48c57 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -160,7 +160,7 @@ static int create_default_filesystem(struct ubifs_info *c) sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL); mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); idx_node_size = ubifs_idx_node_sz(c, 1); - idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + idx = kzalloc(ALIGN(idx_node_size, c->min_io_size), GFP_KERNEL); ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL); cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5e1e8ec0589e..7fc2f3f07c16 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1599,6 +1599,7 @@ out_free: vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); + kfree(c->sup_node); ubifs_debugging_exit(c); return err; } @@ -1641,6 +1642,7 @@ static void ubifs_umount(struct ubifs_info *c) vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); + kfree(c->sup_node); ubifs_debugging_exit(c); } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c55f212dcb75..bff682309fbe 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2095,13 +2095,6 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, extern const struct fscrypt_operations ubifs_crypt_operations; -static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) -{ - const struct ubifs_inode *ui = ubifs_inode(inode); - - return 
ui->flags & UBIFS_CRYPT_FL; -} - /* Normal UBIFS messages */ __printf(2, 3) void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index fb7f2c7bec9c..3fd85464abd5 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -4,7 +4,8 @@ * This file is based on ECMA-167 3rd edition (June 1997) * http://www.ecma.ch * - * Copyright (c) 2001-2002 Ben Fennema <bfennema@falcon.csc.calpoly.edu> + * Copyright (c) 2001-2002 Ben Fennema + * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,11 +33,19 @@ * SUCH DAMAGE. */ +/** + * @file + * ECMA-167r3 defines and structure definitions + */ + #include <linux/types.h> #ifndef _ECMA_167_H #define _ECMA_167_H 1 +/* Character sets and coding - d-characters (ECMA 167r3 1/7.2) */ +typedef uint8_t dchars; + /* Character set specification (ECMA 167r3 1/7.2.1) */ struct charspec { uint8_t charSetType; @@ -54,6 +63,7 @@ struct charspec { #define CHARSPEC_TYPE_CS7 0x07 /* (1/7.2.9) */ #define CHARSPEC_TYPE_CS8 0x08 /* (1/7.2.10) */ +/* Fixed-length character fields - d-string (EMCA 167r3 1/7.2.12) */ typedef uint8_t dstring; /* Timestamp (ECMA 167r3 1/7.3) */ @@ -85,22 +95,8 @@ struct regid { } __packed; /* Flags (ECMA 167r3 1/7.4.1) */ -#define ENTITYID_FLAGS_DIRTY 0x00 -#define ENTITYID_FLAGS_PROTECTED 0x01 - -/* OSTA UDF 2.1.5.2 */ -#define UDF_ID_COMPLIANT "*OSTA UDF Compliant" - -/* OSTA UDF 2.1.5.3 */ -struct domainEntityIDSuffix { - uint16_t revision; - uint8_t flags; - uint8_t reserved[5]; -}; - -/* OSTA UDF 2.1.5.3 */ -#define ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT 0 -#define ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT 1 +#define ENTITYID_FLAGS_DIRTY 0x01 +#define ENTITYID_FLAGS_PROTECTED 0x02 /* Volume Structure Descriptor (ECMA 167r3 2/9.1) */ #define VSD_STD_ID_LEN 5 @@ -202,6 +198,13 @@ struct NSRDesc { uint8_t structData[2040]; } __packed; +/* Generic Descriptor */ +struct genericDesc { + struct tag descTag; + __le32 volDescSeqNum; + uint8_t reserved[492]; +} __packed; + /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ struct primaryVolDesc { struct tag descTag; @@ -316,7 +319,7 @@ struct genericPartitionMap { /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ #define GP_PARTITION_MAP_TYPE_UNDEF 0x00 -#define GP_PARTIITON_MAP_TYPE_1 0x01 +#define GP_PARTITION_MAP_TYPE_1 0x01 #define GP_PARTITION_MAP_TYPE_2 0x02 /* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */ @@ -723,6 +726,7 @@ struct appUseExtAttr { #define EXTATTR_DEV_SPEC 12 #define EXTATTR_IMP_USE 2048 #define EXTATTR_APP_USE 65536 +#define EXTATTR_SUBTYPE 1 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */ struct unallocSpaceEntry { @@ -754,10 +758,12 @@ struct partitionIntegrityEntry { /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ /* Extent Length (ECMA 167r3 4/14.14.1.1) */ +#define EXT_LENGTH_MASK 0x3FFFFFFF +#define EXT_TYPE_MASK 0xC0000000 #define EXT_RECORDED_ALLOCATED 0x00000000 #define EXT_NOT_RECORDED_ALLOCATED 0x40000000 #define EXT_NOT_RECORDED_NOT_ALLOCATED 0x80000000 -#define EXT_NEXT_EXTENT_ALLOCDECS 0xC0000000 +#define EXT_NEXT_EXTENT_ALLOCDESCS 0xC0000000 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ @@ -774,7 +780,7 @@ struct pathComponent { uint8_t componentType; uint8_t lengthComponentIdent; __le16 componentFileVersionNum; - dstring componentIdent[0]; + dchars componentIdent[0]; } __packed; /* File Entry (ECMA 167r3 4/14.17) */ diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 
ea80036d7897..e875bc5668ee 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1981,10 +1981,10 @@ int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); udf_write_aext(inode, epos, &nepos.block, - sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } else { __udf_add_aext(inode, epos, &nepos.block, - sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } brelse(epos->bh); @@ -2143,7 +2143,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, unsigned int indirections = 0; while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == - (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_pblk_t block; if (++indirections > UDF_MAX_INDIR_EXTS) { diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index a4da59e38b7f..35e61b2cacfe 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -1,10 +1,11 @@ /* * osta_udf.h * - * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003) + * This file is based on OSTA UDF(tm) 2.60 (March 1, 2005) * http://www.osta.org * - * Copyright (c) 2001-2004 Ben Fennema <bfennema@falcon.csc.calpoly.edu> + * Copyright (c) 2001-2004 Ben Fennema + * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,38 +33,57 @@ * SUCH DAMAGE. */ +/** + * @file + * OSTA-UDF defines and structure definitions + */ + #include "ecma_167.h" #ifndef _OSTA_UDF_H #define _OSTA_UDF_H 1 -/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */ +/* OSTA CS0 Charspec (UDF 2.60 2.1.2) */ #define UDF_CHAR_SET_TYPE 0 #define UDF_CHAR_SET_INFO "OSTA Compressed Unicode" -/* Entity Identifier (UDF 2.50 2.1.5) */ -/* Identifiers (UDF 2.50 2.1.5.2) */ +/* Entity Identifier (UDF 2.60 2.1.5) */ +/* Identifiers (UDF 2.60 2.1.5.2) */ +/* Implementation Use Extended Attribute (UDF 2.60 3.3.4.5) */ +/* Virtual Allocation Table (UDF 1.50 2.2.10) */ +/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */ +/* OS2EA (UDF 1.50 3.3.4.5.3.1) */ +/* MacUniqueIDTable (UDF 1.50 3.3.4.5.4.3) */ +/* MacResourceFork (UDF 1.50 3.3.4.5.4.4) */ #define UDF_ID_DEVELOPER "*Linux UDFFS" #define UDF_ID_COMPLIANT "*OSTA UDF Compliant" #define UDF_ID_LV_INFO "*UDF LV Info" #define UDF_ID_FREE_EA "*UDF FreeEASpace" #define UDF_ID_FREE_APP_EA "*UDF FreeAppEASpace" #define UDF_ID_DVD_CGMS "*UDF DVD CGMS Info" +#define UDF_ID_VAT_LVEXTENSION "*UDF VAT LVExtension" #define UDF_ID_OS2_EA "*UDF OS/2 EA" #define UDF_ID_OS2_EA_LENGTH "*UDF OS/2 EALength" #define UDF_ID_MAC_VOLUME "*UDF Mac VolumeInfo" #define UDF_ID_MAC_FINDER "*UDF Mac FinderInfo" #define UDF_ID_MAC_UNIQUE "*UDF Mac UniqueIDTable" #define UDF_ID_MAC_RESOURCE "*UDF Mac ResourceFork" +#define UDF_ID_OS400_DIRINFO "*UDF OS/400 DirInfo" #define UDF_ID_VIRTUAL "*UDF Virtual Partition" #define UDF_ID_SPARABLE "*UDF Sparable Partition" #define UDF_ID_ALLOC "*UDF Virtual Alloc Tbl" #define UDF_ID_SPARING "*UDF Sparing Table" #define UDF_ID_METADATA "*UDF Metadata Partition" -/* Identifier Suffix (UDF 2.50 2.1.5.3) */ -#define IS_DF_HARD_WRITE_PROTECT 0x01 -#define IS_DF_SOFT_WRITE_PROTECT 0x02 +/* Identifier Suffix (UDF 2.60 2.1.5.3) */ +#define DOMAIN_FLAGS_HARD_WRITE_PROTECT 0x01 +#define DOMAIN_FLAGS_SOFT_WRITE_PROTECT 0x02 + +struct domainIdentSuffix { + __le16 UDFRevision; + uint8_t domainFlags; + uint8_t reserved[5]; +} __packed; struct 
UDFIdentSuffix { __le16 UDFRevision; @@ -75,15 +95,15 @@ struct UDFIdentSuffix { struct impIdentSuffix { uint8_t OSClass; uint8_t OSIdentifier; - uint8_t reserved[6]; + uint8_t impUse[6]; } __packed; struct appIdentSuffix { uint8_t impUse[8]; } __packed; -/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ -/* Implementation Use (UDF 2.50 2.2.6.4) */ +/* Logical Volume Integrity Descriptor (UDF 2.60 2.2.6) */ +/* Implementation Use (UDF 2.60 2.2.6.4) */ struct logicalVolIntegrityDescImpUse { struct regid impIdent; __le32 numFiles; @@ -94,8 +114,8 @@ struct logicalVolIntegrityDescImpUse { uint8_t impUse[0]; } __packed; -/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ -/* Implementation Use (UDF 2.50 2.2.7.2) */ +/* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */ +/* Implementation Use (UDF 2.60 2.2.7.2) */ struct impUseVolDescImpUse { struct charspec LVICharset; dstring logicalVolIdent[128]; @@ -115,7 +135,7 @@ struct udfPartitionMap2 { __le16 partitionNum; } __packed; -/* Virtual Partition Map (UDF 2.50 2.2.8) */ +/* Virtual Partition Map (UDF 2.60 2.2.8) */ struct virtualPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -126,7 +146,7 @@ struct virtualPartitionMap { uint8_t reserved2[24]; } __packed; -/* Sparable Partition Map (UDF 2.50 2.2.9) */ +/* Sparable Partition Map (UDF 2.60 2.2.9) */ struct sparablePartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -141,7 +161,7 @@ struct sparablePartitionMap { __le32 locSparingTable[4]; } __packed; -/* Metadata Partition Map (UDF 2.4.0 2.2.10) */ +/* Metadata Partition Map (UDF 2.60 2.2.10) */ struct metadataPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -160,14 +180,14 @@ struct metadataPartitionMap { /* Virtual Allocation Table (UDF 1.5 2.2.10) */ struct virtualAllocationTable15 { - __le32 VirtualSector[0]; + __le32 vatEntry[0]; struct regid vatIdent; __le32 previousVATICBLoc; } __packed; #define ICBTAG_FILE_TYPE_VAT15 0x00U -/* Virtual Allocation Table (UDF 2.50 2.2.11) */ +/* Virtual Allocation Table (UDF 2.60 2.2.11) */ struct virtualAllocationTable20 { __le16 lengthHeader; __le16 lengthImpUse; @@ -175,9 +195,9 @@ struct virtualAllocationTable20 { __le32 previousVATICBLoc; __le32 numFiles; __le32 numDirs; - __le16 minReadRevision; - __le16 minWriteRevision; - __le16 maxWriteRevision; + __le16 minUDFReadRev; + __le16 minUDFWriteRev; + __le16 maxUDFWriteRev; __le16 reserved; uint8_t impUse[0]; __le32 vatEntry[0]; @@ -185,7 +205,7 @@ struct virtualAllocationTable20 { #define ICBTAG_FILE_TYPE_VAT20 0xF8U -/* Sparing Table (UDF 2.50 2.2.12) */ +/* Sparing Table (UDF 2.60 2.2.12) */ struct sparingEntry { __le32 origLocation; __le32 mappedLocation; @@ -201,12 +221,12 @@ struct sparingTable { mapEntry[0]; } __packed; -/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */ +/* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */ #define ICBTAG_FILE_TYPE_MAIN 0xFA #define ICBTAG_FILE_TYPE_MIRROR 0xFB #define ICBTAG_FILE_TYPE_BITMAP 0xFC -/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +/* struct struct long_ad ICB - ADImpUse (UDF 2.60 2.2.4.3) */ struct allocDescImpUse { __le16 flags; uint8_t impUse[4]; @@ -214,17 +234,17 @@ struct allocDescImpUse { #define AD_IU_EXT_ERASED 0x0001 -/* Real-Time Files (UDF 2.50 6.11) */ +/* Real-Time Files (UDF 2.60 6.11) */ #define ICBTAG_FILE_TYPE_REALTIME 0xF9U -/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */ -/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */ +/* Implementation 
Use Extended Attribute (UDF 2.60 3.3.4.5) */ +/* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */ struct freeEaSpace { __le16 headerChecksum; uint8_t freeEASpace[0]; } __packed; -/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */ +/* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */ struct DVDCopyrightImpUse { __le16 headerChecksum; uint8_t CGMSInfo; @@ -232,20 +252,35 @@ struct DVDCopyrightImpUse { uint8_t protectionSystemInfo[4]; } __packed; -/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */ -/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */ +/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */ +struct LVExtensionEA { + __le16 headerChecksum; + __le64 verificationID; + __le32 numFiles; + __le32 numDirs; + dstring logicalVolIdent[128]; +} __packed; + +/* Application Use Extended Attribute (UDF 2.60 3.3.4.6) */ +/* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */ struct freeAppEASpace { __le16 headerChecksum; uint8_t freeEASpace[0]; } __packed; -/* UDF Defined System Stream (UDF 2.50 3.3.7) */ +/* UDF Defined System Stream (UDF 2.60 3.3.7) */ #define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data" #define UDF_ID_NON_ALLOC "*UDF Non-Allocatable Space" #define UDF_ID_POWER_CAL "*UDF Power Cal Table" #define UDF_ID_BACKUP "*UDF Backup" -/* Operating System Identifiers (UDF 2.50 6.3) */ +/* UDF Defined Non-System Streams (UDF 2.60 3.3.8) */ +#define UDF_ID_MAC_RESOURCE_FORK_STREAM "*UDF Macintosh Resource Fork" +/* #define UDF_ID_OS2_EA "*UDF OS/2 EA" */ +#define UDF_ID_NT_ACL "*UDF NT ACL" +#define UDF_ID_UNIX_ACL "*UDF UNIX ACL" + +/* Operating System Identifiers (UDF 2.60 6.3) */ #define UDF_OS_CLASS_UNDEF 0x00U #define UDF_OS_CLASS_DOS 0x01U #define UDF_OS_CLASS_OS2 0x02U @@ -270,6 +305,7 @@ struct freeAppEASpace { #define UDF_OS_ID_LINUX 0x05U #define UDF_OS_ID_MKLINUX 0x06U #define UDF_OS_ID_FREEBSD 0x07U +#define UDF_OS_ID_NETBSD 0x08U #define UDF_OS_ID_WIN9X 0x00U #define UDF_OS_ID_WINNT 0x00U #define UDF_OS_ID_OS400 0x00U diff --git a/fs/udf/super.c b/fs/udf/super.c index 8c28e93e9b73..f747bf72edbe 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -767,20 +767,20 @@ static int udf_check_vsd(struct super_block *sb) static int udf_verify_domain_identifier(struct super_block *sb, struct regid *ident, char *dname) { - struct domainEntityIDSuffix *suffix; + struct domainIdentSuffix *suffix; if (memcmp(ident->ident, UDF_ID_COMPLIANT, strlen(UDF_ID_COMPLIANT))) { udf_warn(sb, "Not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } - if (ident->flags & (1 << ENTITYID_FLAGS_DIRTY)) { + if (ident->flags & ENTITYID_FLAGS_DIRTY) { udf_warn(sb, "Possibly not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } - suffix = (struct domainEntityIDSuffix *)ident->identSuffix; - if (suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT) || - suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT)) { + suffix = (struct domainIdentSuffix *)ident->identSuffix; + if ((suffix->domainFlags & DOMAIN_FLAGS_HARD_WRITE_PROTECT) || + (suffix->domainFlags & DOMAIN_FLAGS_SOFT_WRITE_PROTECT)) { if (!sb_rdonly(sb)) { udf_warn(sb, "Descriptor for %s marked write protected." 
" Forcing read only mount.\n", dname); @@ -1035,7 +1035,6 @@ static int check_partition_desc(struct super_block *sb, switch (le32_to_cpu(p->accessType)) { case PD_ACCESS_TYPE_READ_ONLY: case PD_ACCESS_TYPE_WRITE_ONCE: - case PD_ACCESS_TYPE_REWRITABLE: case PD_ACCESS_TYPE_NONE: goto force_ro; } @@ -1063,7 +1062,8 @@ static int check_partition_desc(struct super_block *sb, goto force_ro; if (map->s_partition_type == UDF_VIRTUAL_MAP15 || - map->s_partition_type == UDF_VIRTUAL_MAP20) + map->s_partition_type == UDF_VIRTUAL_MAP20 || + map->s_partition_type == UDF_METADATA_MAP25) goto force_ro; return 0; @@ -2402,6 +2402,10 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; buf->f_bfree = udf_count_free(sb); buf->f_bavail = buf->f_bfree; + /* + * Let's pretend each free block is also a free 'inode' since UDF does + * not have separate preallocated table of inodes. + */ buf->f_files = (lvidiu != NULL ? (le32_to_cpu(lvidiu->numFiles) + le32_to_cpu(lvidiu->numDirs)) : 0) + buf->f_bfree; @@ -2492,17 +2496,29 @@ static unsigned int udf_count_free_table(struct super_block *sb, static unsigned int udf_count_free(struct super_block *sb) { unsigned int accum = 0; - struct udf_sb_info *sbi; + struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; + unsigned int part = sbi->s_partition; + int ptype = sbi->s_partmaps[part].s_partition_type; + + if (ptype == UDF_METADATA_MAP25) { + part = sbi->s_partmaps[part].s_type_specific.s_metadata. + s_phys_partition_ref; + } else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) { + /* + * Filesystems with VAT are append-only and we cannot write to + * them. Let's just report 0 here. + */ + return 0; + } - sbi = UDF_SB(sb); if (sbi->s_lvid_bh) { struct logicalVolIntegrityDesc *lvid = (struct logicalVolIntegrityDesc *) sbi->s_lvid_bh->b_data; - if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) { + if (le32_to_cpu(lvid->numOfPartitions) > part) { accum = le32_to_cpu( - lvid->freeSpaceTable[sbi->s_partition]); + lvid->freeSpaceTable[part]); if (accum == 0xFFFFFFFF) accum = 0; } @@ -2511,7 +2527,7 @@ static unsigned int udf_count_free(struct super_block *sb) if (accum) return accum; - map = &sbi->s_partmaps[sbi->s_partition]; + map = &sbi->s_partmaps[part]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { accum += udf_count_free_bitmap(sb, map->s_uspace.s_bitmap); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 63a47f1e1d52..532cda99644e 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -241,7 +241,7 @@ int udf_truncate_extents(struct inode *inode) while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { - if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the diff --git a/fs/verity/enable.c b/fs/verity/enable.c index eabc6ac19906..d98bea308fd7 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -8,18 +8,48 @@ #include "fsverity_private.h" #include <crypto/hash.h> +#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> -static int build_merkle_tree_level(struct inode *inode, unsigned int level, +/* + * Read a file data page for Merkle tree construction. Do aggressive readahead, + * since we're sequentially reading the entire file. 
+ */ +static struct page *read_file_data_page(struct file *filp, pgoff_t index, + struct file_ra_state *ra, + unsigned long remaining_pages) +{ + struct page *page; + + page = find_get_page_flags(filp->f_mapping, index, FGP_ACCESSED); + if (!page || !PageUptodate(page)) { + if (page) + put_page(page); + else + page_cache_sync_readahead(filp->f_mapping, ra, filp, + index, remaining_pages); + page = read_mapping_page(filp->f_mapping, index, NULL); + if (IS_ERR(page)) + return page; + } + if (PageReadahead(page)) + page_cache_async_readahead(filp->f_mapping, ra, filp, page, + index, remaining_pages); + return page; +} + +static int build_merkle_tree_level(struct file *filp, unsigned int level, u64 num_blocks_to_hash, const struct merkle_tree_params *params, u8 *pending_hashes, struct ahash_request *req) { + struct inode *inode = file_inode(filp); const struct fsverity_operations *vops = inode->i_sb->s_vop; + struct file_ra_state ra = { 0 }; unsigned int pending_size = 0; u64 dst_block_num; u64 i; @@ -36,6 +66,8 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level, dst_block_num = 0; /* unused */ } + file_ra_state_init(&ra, filp->f_mapping); + for (i = 0; i < num_blocks_to_hash; i++) { struct page *src_page; @@ -45,7 +77,8 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level, if (level == 0) { /* Leaf: hashing a data block */ - src_page = read_mapping_page(inode->i_mapping, i, NULL); + src_page = read_file_data_page(filp, i, &ra, + num_blocks_to_hash - i); if (IS_ERR(src_page)) { err = PTR_ERR(src_page); fsverity_err(inode, @@ -54,9 +87,14 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level, return err; } } else { + unsigned long num_ra_pages = + min_t(unsigned long, num_blocks_to_hash - i, + inode->i_sb->s_bdi->io_pages); + /* Non-leaf: hashing hash block from level below */ src_page = vops->read_merkle_tree_page(inode, - params->level_start[level - 1] + i); + params->level_start[level - 1] + i, + num_ra_pages); if (IS_ERR(src_page)) { err = PTR_ERR(src_page); fsverity_err(inode, @@ -103,17 +141,18 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level, } /* - * Build the Merkle tree for the given inode using the given parameters, and + * Build the Merkle tree for the given file using the given parameters, and * return the root hash in @root_hash. * * The tree is written to a filesystem-specific location as determined by the * ->write_merkle_tree_block() method. However, the blocks that comprise the * tree are the same for all filesystems. */ -static int build_merkle_tree(struct inode *inode, +static int build_merkle_tree(struct file *filp, const struct merkle_tree_params *params, u8 *root_hash) { + struct inode *inode = file_inode(filp); u8 *pending_hashes; struct ahash_request *req; u64 blocks; @@ -126,9 +165,11 @@ static int build_merkle_tree(struct inode *inode, return 0; } + /* This allocation never fails, since it's mempool-backed. 
*/ + req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL); + pending_hashes = kmalloc(params->block_size, GFP_KERNEL); - req = ahash_request_alloc(params->hash_alg->tfm, GFP_KERNEL); - if (!pending_hashes || !req) + if (!pending_hashes) goto out; /* @@ -139,7 +180,7 @@ static int build_merkle_tree(struct inode *inode, blocks = (inode->i_size + params->block_size - 1) >> params->log_blocksize; for (level = 0; level <= params->num_levels; level++) { - err = build_merkle_tree_level(inode, level, blocks, params, + err = build_merkle_tree_level(filp, level, blocks, params, pending_hashes, req); if (err) goto out; @@ -150,7 +191,7 @@ static int build_merkle_tree(struct inode *inode, err = 0; out: kfree(pending_hashes); - ahash_request_free(req); + fsverity_free_hash_request(params->hash_alg, req); return err; } @@ -175,8 +216,7 @@ static int enable_verity(struct file *filp, /* Get the salt if the user provided one */ if (arg->salt_size && - copy_from_user(desc->salt, - (const u8 __user *)(uintptr_t)arg->salt_ptr, + copy_from_user(desc->salt, u64_to_user_ptr(arg->salt_ptr), arg->salt_size)) { err = -EFAULT; goto out; @@ -185,8 +225,7 @@ static int enable_verity(struct file *filp, /* Get the signature if the user provided one */ if (arg->sig_size && - copy_from_user(desc->signature, - (const u8 __user *)(uintptr_t)arg->sig_ptr, + copy_from_user(desc->signature, u64_to_user_ptr(arg->sig_ptr), arg->sig_size)) { err = -EFAULT; goto out; @@ -227,7 +266,7 @@ static int enable_verity(struct file *filp, */ pr_debug("Building Merkle tree...\n"); BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE); - err = build_merkle_tree(inode, ¶ms, desc->root_hash); + err = build_merkle_tree(filp, ¶ms, desc->root_hash); if (err) { fsverity_err(inode, "Error %d building Merkle tree", err); goto rollback; @@ -315,7 +354,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) if (arg.block_size != PAGE_SIZE) return -EINVAL; - if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt)) + if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt)) return -EMSGSIZE; if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index e74c79b64d88..74768cf539da 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -16,6 +16,7 @@ #include <crypto/sha.h> #include <linux/fsverity.h> +#include <linux/mempool.h> struct ahash_request; @@ -37,11 +38,12 @@ struct fsverity_hash_alg { const char *name; /* crypto API name, e.g. sha256 */ unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 
64 for SHA-256 */ + mempool_t req_pool; /* mempool with a preallocated hash request */ }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ struct merkle_tree_params { - const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ + struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ const u8 *hashstate; /* initial hash state or NULL */ unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ @@ -50,6 +52,7 @@ struct merkle_tree_params { unsigned int log_arity; /* log2(hashes_per_block) */ unsigned int num_levels; /* number of levels in Merkle tree */ u64 tree_size; /* Merkle tree size in bytes */ + unsigned long level0_blocks; /* number of blocks in tree level 0 */ /* * Starting block index for each tree level, ordered from leaf level (0) @@ -114,14 +117,18 @@ struct fsverity_signed_digest { extern struct fsverity_hash_alg fsverity_hash_algs[]; -const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num); -const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, +struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num); +struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, + gfp_t gfp_flags); +void fsverity_free_hash_request(struct fsverity_hash_alg *alg, + struct ahash_request *req); +const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); int fsverity_hash_page(const struct merkle_tree_params *params, const struct inode *inode, struct ahash_request *req, struct page *page, u8 *out); -int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, +int fsverity_hash_buffer(struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 31e6d7d2389a..c37e186ebeb6 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -24,6 +24,8 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { }, }; +static DEFINE_MUTEX(fsverity_hash_alg_init_mutex); + /** * fsverity_get_hash_alg() - validate and prepare a hash algorithm * @inode: optional inode for logging purposes @@ -36,8 +38,8 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { * * Return: pointer to the hash alg on success, else an ERR_PTR() */ -const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num) +struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num) { struct fsverity_hash_alg *alg; struct crypto_ahash *tfm; @@ -50,10 +52,15 @@ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, } alg = &fsverity_hash_algs[num]; - /* pairs with cmpxchg() below */ - tfm = READ_ONCE(alg->tfm); - if (likely(tfm != NULL)) + /* pairs with smp_store_release() below */ + if (likely(smp_load_acquire(&alg->tfm) != NULL)) return alg; + + mutex_lock(&fsverity_hash_alg_init_mutex); + + if (alg->tfm != NULL) + goto out_unlock; + /* * Using the shash API would make things a bit simpler, but the ahash * API is preferable as it allows the use of crypto accelerators. 
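For reference, a minimal userspace sketch of the lazy, one-time initialization pattern that the new locking scheme in fsverity_get_hash_alg() uses: check the pointer with an acquire load, fall back to a mutex, re-check, then publish the fully built object with a release store. Everything below is an illustrative stand-in, not kernel code -- the struct, the create_backend() helper, and the use of C11 atomics and pthreads take the place of the kernel's smp_load_acquire()/smp_store_release() and the static fsverity_hash_alg_init_mutex.

/*
 * Illustrative only: userspace analogue of the check / lock / re-check /
 * publish sequence.  Names are made up for the example.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct hash_alg {
	const char *name;
	_Atomic(void *) tfm;		/* lazily created backend handle */
};

static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *create_backend(const char *name)
{
	char *h = malloc(64);		/* stands in for crypto_alloc_ahash() */

	snprintf(h, 64, "tfm:%s", name);
	return h;
}

static void *get_tfm(struct hash_alg *alg)
{
	/* fast path; pairs with the release store below */
	void *tfm = atomic_load_explicit(&alg->tfm, memory_order_acquire);

	if (tfm)
		return tfm;

	pthread_mutex_lock(&init_mutex);
	tfm = atomic_load_explicit(&alg->tfm, memory_order_relaxed);
	if (!tfm) {
		tfm = create_backend(alg->name);
		/* publish only after the object is fully constructed */
		atomic_store_explicit(&alg->tfm, tfm, memory_order_release);
	}
	pthread_mutex_unlock(&init_mutex);
	return tfm;
}

int main(void)
{
	struct hash_alg sha256 = { .name = "sha256" };

	printf("%s\n", (char *)get_tfm(&sha256));
	printf("%s\n", (char *)get_tfm(&sha256));	/* second call takes the fast path */
	return 0;
}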
@@ -64,12 +71,14 @@ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, fsverity_warn(inode, "Missing crypto API support for hash algorithm \"%s\"", alg->name); - return ERR_PTR(-ENOPKG); + alg = ERR_PTR(-ENOPKG); + goto out_unlock; } fsverity_err(inode, "Error allocating hash algorithm \"%s\": %ld", alg->name, PTR_ERR(tfm)); - return ERR_CAST(tfm); + alg = ERR_CAST(tfm); + goto out_unlock; } err = -EINVAL; @@ -78,18 +87,61 @@ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm))) goto err_free_tfm; + err = mempool_init_kmalloc_pool(&alg->req_pool, 1, + sizeof(struct ahash_request) + + crypto_ahash_reqsize(tfm)); + if (err) + goto err_free_tfm; + pr_info("%s using implementation \"%s\"\n", alg->name, crypto_ahash_driver_name(tfm)); - /* pairs with READ_ONCE() above */ - if (cmpxchg(&alg->tfm, NULL, tfm) != NULL) - crypto_free_ahash(tfm); - - return alg; + /* pairs with smp_load_acquire() above */ + smp_store_release(&alg->tfm, tfm); + goto out_unlock; err_free_tfm: crypto_free_ahash(tfm); - return ERR_PTR(err); + alg = ERR_PTR(err); +out_unlock: + mutex_unlock(&fsverity_hash_alg_init_mutex); + return alg; +} + +/** + * fsverity_alloc_hash_request() - allocate a hash request object + * @alg: the hash algorithm for which to allocate the request + * @gfp_flags: memory allocation flags + * + * This is mempool-backed, so this never fails if __GFP_DIRECT_RECLAIM is set in + * @gfp_flags. However, in that case this might need to wait for all + * previously-allocated requests to be freed. So to avoid deadlocks, callers + * must never need multiple requests at a time to make forward progress. + * + * Return: the request object on success; NULL on failure (but see above) + */ +struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, + gfp_t gfp_flags) +{ + struct ahash_request *req = mempool_alloc(&alg->req_pool, gfp_flags); + + if (req) + ahash_request_set_tfm(req, alg->tfm); + return req; +} + +/** + * fsverity_free_hash_request() - free a hash request object + * @alg: the hash algorithm + * @req: the hash request object to free + */ +void fsverity_free_hash_request(struct fsverity_hash_alg *alg, + struct ahash_request *req) +{ + if (req) { + ahash_request_zero(req); + mempool_free(req, &alg->req_pool); + } } /** @@ -101,7 +153,7 @@ err_free_tfm: * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed * initial hash state on success or an ERR_PTR() on failure. */ -const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, +const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size) { u8 *hashstate = NULL; @@ -119,11 +171,8 @@ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, if (!hashstate) return ERR_PTR(-ENOMEM); - req = ahash_request_alloc(alg->tfm, GFP_KERNEL); - if (!req) { - err = -ENOMEM; - goto err_free; - } + /* This allocation never fails, since it's mempool-backed. 
*/ + req = fsverity_alloc_hash_request(alg, GFP_KERNEL); /* * Zero-pad the salt to the next multiple of the input size of the hash @@ -158,7 +207,7 @@ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, if (err) goto err_free; out: - ahash_request_free(req); + fsverity_free_hash_request(alg, req); kfree(padded_salt); return hashstate; @@ -229,7 +278,7 @@ int fsverity_hash_page(const struct merkle_tree_params *params, * * Return: 0 on success, -errno on failure */ -int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, +int fsverity_hash_buffer(struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out) { struct ahash_request *req; @@ -237,9 +286,8 @@ int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, DECLARE_CRYPTO_WAIT(wait); int err; - req = ahash_request_alloc(alg->tfm, GFP_KERNEL); - if (!req) - return -ENOMEM; + /* This allocation never fails, since it's mempool-backed. */ + req = fsverity_alloc_hash_request(alg, GFP_KERNEL); sg_init_one(&sg, data, size); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | @@ -249,7 +297,7 @@ int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, err = crypto_wait_req(crypto_ahash_digest(req), &wait); - ahash_request_free(req); + fsverity_free_hash_request(alg, req); return err; } diff --git a/fs/verity/open.c b/fs/verity/open.c index 63d1004b688c..c5fe6948e262 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -31,7 +31,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, unsigned int log_blocksize, const u8 *salt, size_t salt_size) { - const struct fsverity_hash_alg *hash_alg; + struct fsverity_hash_alg *hash_alg; int err; u64 blocks; u64 offset; @@ -102,6 +102,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, /* temporarily using level_start[] to store blocks in level */ params->level_start[params->num_levels++] = blocks; } + params->level0_blocks = params->level_start[0]; /* Compute the starting block of each level */ offset = 0; @@ -126,7 +127,7 @@ out_err: * Compute the file measurement by hashing the fsverity_descriptor excluding the * signature and with the sig_size field set to 0. */ -static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg, +static int compute_file_measurement(struct fsverity_hash_alg *hash_alg, struct fsverity_descriptor *desc, u8 *measurement) { diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 3e8f2de44667..e0cb62da3864 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -84,7 +84,8 @@ static inline int cmp_hashes(const struct fsverity_info *vi, * Return: true if the page is valid, else false. */ static bool verify_page(struct inode *inode, const struct fsverity_info *vi, - struct ahash_request *req, struct page *data_page) + struct ahash_request *req, struct page *data_page, + unsigned long level0_ra_pages) { const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; @@ -117,8 +118,8 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi, pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", level, hindex, hoffset); - hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, - hindex); + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hindex, + level == 0 ? 
level0_ra_pages : 0); if (IS_ERR(hpage)) { err = PTR_ERR(hpage); fsverity_err(inode, @@ -191,13 +192,12 @@ bool fsverity_verify_page(struct page *page) struct ahash_request *req; bool valid; - req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); - if (unlikely(!req)) - return false; + /* This allocation never fails, since it's mempool-backed. */ + req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - valid = verify_page(inode, vi, req, page); + valid = verify_page(inode, vi, req, page, 0); - ahash_request_free(req); + fsverity_free_hash_request(vi->tree_params.hash_alg, req); return valid; } @@ -222,25 +222,42 @@ void fsverity_verify_bio(struct bio *bio) { struct inode *inode = bio_first_page_all(bio)->mapping->host; const struct fsverity_info *vi = inode->i_verity_info; + const struct merkle_tree_params *params = &vi->tree_params; struct ahash_request *req; struct bio_vec *bv; struct bvec_iter_all iter_all; - - req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS); - if (unlikely(!req)) { + unsigned long max_ra_pages = 0; + + /* This allocation never fails, since it's mempool-backed. */ + req = fsverity_alloc_hash_request(params->hash_alg, GFP_NOFS); + + if (bio->bi_opf & REQ_RAHEAD) { + /* + * If this bio is for data readahead, then we also do readahead + * of the first (largest) level of the Merkle tree. Namely, + * when a Merkle tree page is read, we also try to piggy-back on + * some additional pages -- up to 1/4 the number of data pages. + * + * This improves sequential read performance, as it greatly + * reduces the number of I/O requests made to the Merkle tree. + */ bio_for_each_segment_all(bv, bio, iter_all) - SetPageError(bv->bv_page); - return; + max_ra_pages++; + max_ra_pages /= 4; } bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; + unsigned long level0_index = page->index >> params->log_arity; + unsigned long level0_ra_pages = + min(max_ra_pages, params->level0_blocks - level0_index); - if (!PageError(page) && !verify_page(inode, vi, req, page)) + if (!PageError(page) && + !verify_page(inode, vi, req, page, level0_ra_pages)) SetPageError(page); } - ahash_request_free(req); + fsverity_free_hash_request(params->hash_alg, req); } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c284e10af491..fc93fd88ec89 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2248,24 +2248,32 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; + const uint8_t *levels = pag ? 
pag->pagf_levels : fake_levels; unsigned int min_free; + ASSERT(mp->m_ag_maxlevels > 0); + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, mp->m_ag_maxlevels); /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0d7fcc983b3d..e6149720ce02 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -62,6 +62,7 @@ xfs_attr_args_init( struct xfs_da_args *args, struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { @@ -74,7 +75,7 @@ xfs_attr_args_init( args->dp = dp; args->flags = flags; args->name = name; - args->namelen = strlen((const char *)name); + args->namelen = namelen; if (args->namelen >= MAXNAMELEN) return -EFAULT; /* match IRIX behaviour */ @@ -139,6 +140,7 @@ int xfs_attr_get( struct xfs_inode *ip, const unsigned char *name, + size_t namelen, unsigned char **value, int *valuelenp, int flags) @@ -154,7 +156,7 @@ xfs_attr_get( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, flags); + error = xfs_attr_args_init(&args, ip, name, namelen, flags); if (error) return error; @@ -338,6 +340,7 @@ int xfs_attr_set( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, unsigned char *value, int valuelen, int flags) @@ -353,7 +356,7 @@ xfs_attr_set( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -442,6 +445,7 @@ int xfs_attr_remove( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { struct xfs_mount *mp = dp->i_mount; @@ -453,7 +457,7 @@ xfs_attr_remove( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -1007,7 +1011,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. 
*/ - args->flags |= XFS_ATTR_INCOMPLETE; + args->op_flags |= XFS_DA_OP_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 94badfa1743e..4243b2272642 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -26,7 +26,7 @@ struct xfs_attr_list_context; *========================================================================*/ -#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ #define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ @@ -37,7 +37,10 @@ struct xfs_attr_list_context; #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ +#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ + +#define ATTR_KERNEL_FLAGS \ + (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ @@ -145,11 +148,13 @@ int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char **value, int *valuelenp, int flags); + size_t namelen, unsigned char **value, int *valuelenp, + int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - unsigned char *value, int valuelen, int flags); + size_t namelen, unsigned char *value, int valuelen, int flags); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 08d4b10ae2d5..fed537a4353d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2403,8 +2403,8 @@ xfs_attr3_leaf_lookup_int( * If we are looking for INCOMPLETE entries, show only those. * If we are looking for complete entries, show only those. */ - if ((args->flags & XFS_ATTR_INCOMPLETE) != - (entry->flags & XFS_ATTR_INCOMPLETE)) { + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != + !!(entry->flags & XFS_ATTR_INCOMPLETE)) { continue; } if (entry->flags & XFS_ATTR_LOCAL) { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f4a188e28b7b..73615b1dd1a8 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -39,15 +39,6 @@ struct xfs_attr3_icleaf_hdr { } freemap[XFS_ATTR_LEAF_MAPSIZE]; }; -/* - * Used to keep a list of "remote value" extents when unlinking an inode. - */ -typedef struct xfs_attr_inactive_list { - xfs_dablk_t valueblk; /* block number of value bytes */ - int valuelen; /* number of bytes in value */ -} xfs_attr_inactive_list_t; - - /*======================================================================== * Function prototypes for the kernel. 
*========================================================================*/ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index a6ef5df42669..a266d05df146 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -26,6 +26,23 @@ #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ /* + * Remote Attribute Values + * ======================= + * + * Remote extended attribute values are conceptually simple -- they're written + * to data blocks mapped by an inode's attribute fork, and they have an upper + * size limit of 64k. Setting a value does not involve the XFS log. + * + * However, on a v5 filesystem, maximally sized remote attr values require one + * block more than 64k worth of space to hold both the remote attribute value + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; + * on a 64k block filesystem, this would be a 128k buffer. Note that the log + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). + * Therefore, we /must/ ensure that remote attribute value buffers never touch + * the logging system and therefore never have a log item. + */ + +/* * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ @@ -401,17 +418,25 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, args->trans, - mp->m_ddev_targp, - dblkno, dblkcnt, 0, &bp, - &xfs_attr3_rmt_buf_ops); - if (error) + bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, + &xfs_attr3_rmt_buf_ops); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; + } error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_trans_brelse(args->trans, bp); + xfs_buf_relse(bp); if (error) return error; @@ -552,6 +577,33 @@ xfs_attr_rmtval_set( return 0; } +/* Mark stale any incore buffers for the remote value. */ +int +xfs_attr_rmtval_stale( + struct xfs_inode *ip, + struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + return -EFSCORRUPTED; + + bp = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map->br_startblock), + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. 
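One small detail in the xfs_attr3_leaf_lookup_int() hunk above is the switch from comparing raw masked flag values to comparing them after "!!" normalization: the lookup flag now comes from args->op_flags (XFS_DA_OP_INCOMPLETE) while the entry flag stays in entry->flags (XFS_ATTR_INCOMPLETE), so the two bits no longer line up. A tiny standalone illustration of why the normalization is needed -- the flag values below are stand-ins chosen for the example, not the real XFS definitions:

#include <stdio.h>

#define OP_INCOMPLETE    0x0040	/* stand-in for XFS_DA_OP_INCOMPLETE */
#define ENTRY_INCOMPLETE 0x80	/* stand-in for XFS_ATTR_INCOMPLETE */

static int mismatch_raw(unsigned int op_flags, unsigned int entry_flags)
{
	/* different bit positions: mismatches even when both flags are set */
	return (op_flags & OP_INCOMPLETE) != (entry_flags & ENTRY_INCOMPLETE);
}

static int mismatch_normalized(unsigned int op_flags, unsigned int entry_flags)
{
	/* "!!" collapses each side to 0 or 1 before comparing */
	return !!(op_flags & OP_INCOMPLETE) != !!(entry_flags & ENTRY_INCOMPLETE);
}

int main(void)
{
	unsigned int op = OP_INCOMPLETE, entry = ENTRY_INCOMPLETE;

	printf("raw compare reports mismatch: %d\n", mismatch_raw(op, entry));			/* 1: wrong */
	printf("normalized compare reports mismatch: %d\n", mismatch_normalized(op, entry));	/* 0: right */
	return 0;
}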
@@ -560,7 +612,6 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; @@ -575,9 +626,6 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; - int dblkcnt; int nmap; /* @@ -588,22 +636,11 @@ xfs_attr_rmtval_remove( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); + if (error) + return error; lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9d20b66ad379..6fb4572845ce 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a9ad1f991ba3..4c2e046fbfad 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4561,7 +4561,7 @@ xfs_bmapi_convert_delalloc( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; - u16 flags = 0; + uint16_t flags = 0; struct xfs_trans *tp; int error; @@ -5972,8 +5972,7 @@ xfs_bmap_insert_extents( goto del_cursor; } - if (XFS_IS_CORRUPT(mp, - stop_fsb >= got.br_startoff + got.br_blockcount)) { + if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { error = -EFSCORRUPTED; goto del_cursor; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e2cc98931552..b22c7e928eb1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2389,8 +2389,6 @@ xfs_btree_lshift( XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. 
operate on keys and ptrs */ - int i; /* loop index */ - for (i = 0; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index e16610d1c14f..0f4fbb0889ff 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -89,6 +89,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -96,7 +97,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } /* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 3dee33043e09..734837a9b51a 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -217,7 +217,7 @@ typedef struct xfs_dir2_sf_entry { * A 64-bit or 32-bit inode number follows here, at a variable offset * after the name. */ -} xfs_dir2_sf_entry_t; +} __packed xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { @@ -683,8 +683,6 @@ struct xfs_attr3_leafblock { /* * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. 
*/ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 0aa87cbde49e..dd6fcaaea318 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -724,3 +724,24 @@ xfs_dir2_namecheck( /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } + +xfs_dahash_t +xfs_dir2_hashname( + struct xfs_mount *mp, + struct xfs_name *name) +{ + if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + return xfs_ascii_ci_hashname(name); + return xfs_da_hashname(name->name, name->len); +} + +enum xfs_dacmp +xfs_dir2_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + return xfs_ascii_ci_compname(args, name, len); + return xfs_da_compname(args, name, len); +} diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index c031c53d0f0d..01ee0b926572 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,6 +175,12 @@ extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +int xfs_dir2_sf_entsize(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, int len); +void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino); +void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, uint8_t ftype); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, @@ -194,25 +200,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -static inline xfs_dahash_t -xfs_dir2_hashname( - struct xfs_mount *mp, - struct xfs_name *name) -{ - if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) - return xfs_ascii_ci_hashname(name); - return xfs_da_hashname(name->name, name->len); -} - -static inline enum xfs_dacmp -xfs_dir2_compname( - struct xfs_da_args *args, - const unsigned char *name, - int len) -{ - if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) - return xfs_ascii_ci_compname(args, name, len); - return xfs_da_compname(args, name, len); -} +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, + const unsigned char *name, int len); #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8b94d33d232f..7b7f6fb2ea3b 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -37,7 +37,7 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args); static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); -static int +int xfs_dir2_sf_entsize( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -84,7 +84,7 @@ xfs_dir2_sf_get_ino( return get_unaligned_be64(from) & XFS_MAXINUMBER; } -static void +void xfs_dir2_sf_put_ino( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -145,7 +145,7 @@ xfs_dir2_sf_get_ftype( return XFS_DIR3_FT_UNKNOWN; } -static void +void xfs_dir2_sf_put_ftype( struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1b7dcbae051c..77e9fa385980 100644 --- 
a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block { #define BMBT_BLOCKCOUNT_BITLEN 21 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) + +/* + * bmbt records have a file offset (block) field that is 54 bits wide, so this + * is the largest xfs_fileoff_t that we ever expect to see. + */ +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) typedef struct xfs_bmbt_rec { __be64 l0, l1; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 988cde7744e6..5b759af4d165 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2909,3 +2909,67 @@ xfs_ialloc_setup_geometry( else igeo->ialloc_align = 0; } + +/* Compute the location of the root directory inode that is laid out by mkfs. */ +xfs_ino_t +xfs_ialloc_calc_rootino( + struct xfs_mount *mp, + int sunit) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t first_bno; + + /* + * Pre-calculate the geometry of AG 0. We know what it looks like + * because libxfs knows how to create allocation groups now. + * + * first_bno is the first block in which mkfs could possibly have + * allocated the root directory inode, once we factor in the metadata + * that mkfs formats before it. Namely, the four AG headers... + */ + first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + /* ...the two free space btree roots... */ + first_bno += 2; + + /* ...the inode btree root... */ + first_bno += 1; + + /* ...the initial AGFL... */ + first_bno += xfs_alloc_min_freelist(mp, NULL); + + /* ...the free inode btree root... */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + first_bno++; + + /* ...the reverse mapping btree root... */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + first_bno++; + + /* ...the reference count btree... */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + first_bno++; + + /* + * ...and the log, if it is allocated in the first allocation group. + * + * This can happen with filesystems that only have a single + * allocation group, or very odd geometries created by old mkfs + * versions on very small filesystems. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + first_bno += mp->m_sb.sb_logblocks; + + /* + * Now round first_bno up to whatever allocation alignment is given + * by the filesystem or was passed in. 
+ */ + if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + first_bno = roundup(first_bno, sunit); + else if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt > 1) + first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); + + return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 323592d563d5..72b3468b97b1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); +xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8ef31d71a9c7..9bac0d2e56dc 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -462,11 +462,20 @@ static inline uint xfs_log_dinode_size(int version) #define XFS_BLF_GDQUOT_BUF (1<<4) /* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - */ -#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + * This is the structure used to lay out a buf log item in the log. The data + * map describes which 128 byte chunks of the buffer have been logged. + * + * The placement of blf_map_size causes blf_data_map to start at an odd + * multiple of sizeof(unsigned int) offset within the struct. Because the data + * bitmap size will always be an even number, the end of the data_map (and + * therefore the structure) will also be at an odd multiple of sizeof(unsigned + * int). Some 64-bit compilers will insert padding at the end of the struct to + * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore, + * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and + * keep the structure size consistent between 32-bit and 64-bit platforms. + */ +#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) +#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index c55cd9a3dec9..7a9c04920505 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res( } /* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating a realtime extent. We have to be able to log as many rtbitmap + * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, + * as well as the realtime summary block. + */ +static unsigned int +xfs_rtalloc_log_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int rtbmp_bytes; + + rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; +} + +/* * Various log reservation values. * * These are based on the size of the file system block because that is what @@ -218,13 +236,21 @@ xfs_calc_inode_chunk_res( /* * In a write transaction we can allocate a maximum of 2 - * extents. This gives: + * extents. 
This gives (t1): * the inode getting the new extents: inode size * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: + * Or, if we're writing to a realtime file (t2): + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 1 block + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join (t3): * the agfs of the ags containing the blocks: 2 * sector size * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size @@ -234,40 +260,72 @@ STATIC uint xfs_calc_write_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + + blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + } else { + t2 = 0; + } + + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* - * In truncating a file we free up to two extents at once. We can modify: + * In truncating a file we free up to two extents at once. 
We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: + * And the bmap_finish transaction can free the blocks and bmap blocks (t2): * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)))); + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); + + t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + } else { + t3 = 0; + } + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60c61d7052a8..c3422403b169 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -75,7 +75,6 @@ static inline xfs_extlen_t xrep_calc_ag_resblks( struct xfs_scrub *sc) { - ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); return 0; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3362bae28b46..096203119934 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -329,7 +329,7 @@ TRACE_EVENT(xchk_btree_op_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(int, error) __field(void *, ret_ip) ), @@ -414,7 +414,7 @@ TRACE_EVENT(xchk_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( @@ -452,7 +452,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 91693fce34a8..cd743fad8478 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -145,7 +145,8 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. 
*/ len = XFS_ACL_MAX_SIZE(ip->i_mount); - error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, + error = xfs_attr_get(ip, ea_name, strlen(ea_name), + (unsigned char **)&xfs_acl, &len, ATTR_ALLOC | ATTR_ROOT); if (error) { /* @@ -196,15 +197,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) len -= sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, - len, ATTR_ROOT); + error = xfs_attr_set(ip, ea_name, strlen(ea_name), + (unsigned char *)xfs_acl, len, ATTR_ROOT); kmem_free(xfs_acl); } else { /* * A NULL ACL argument means we want to remove the ACL. */ - error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); + error = xfs_attr_remove(ip, ea_name, + strlen(ea_name), + ATTR_ROOT); /* * If the attribute didn't exist to start with that's fine. diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5ff49523d8ea..8fbb841cd6fe 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -25,22 +25,18 @@ #include "xfs_error.h" /* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. + * Invalidate any incore buffers associated with this remote attribute value + * extent. We never log remote attribute value buffers, which means that they + * won't be attached to a transaction and are therefore safe to mark stale. + * The actual bunmapi will be taken care of later. */ STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, +xfs_attr3_rmt_stale( struct xfs_inode *dp, xfs_dablk_t blkno, int blkcnt) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_dablk_t tblkno; - xfs_daddr_t dblkno; - int tblkcnt; - int dblkcnt; int nmap; int error; @@ -48,47 +44,29 @@ xfs_attr3_leaf_freextent( * Roll through the "value", invalidating the attribute value's * blocks. */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { + while (blkcnt > 0) { /* * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { + if (error) return error; - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. + * Mark any incore buffers for the remote value as stale. We + * never log remote attr value buffers, so the buffer should be + * easy to kill. */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return -ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. 
- */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; - } + error = xfs_attr_rmtval_stale(dp, &map, 0); + if (error) + return error; - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; + blkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } return 0; @@ -102,86 +80,45 @@ xfs_attr3_leaf_freextent( */ STATIC int xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - struct xfs_mount *mp = bp->b_mount; + int error = 0; + int i; - leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* - * Count the number of "remote" value extents. + * Find the remote value extents for this leaf and invalidate their + * incore buffers. */ - count = 0; entry = xfs_attr3_leaf_entryp(leaf); for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } + int blkcnt; - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, 0); - - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) + continue; - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (!name_rmt->valueblk) + continue; - if (error == 0) - error = tmp; /* save only the 1st errno */ + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + error = xfs_attr3_rmt_stale(dp, + be32_to_cpu(name_rmt->valueblk), blkcnt); + if (error) + goto err; } - kmem_free(list); + xfs_trans_brelse(*trans, bp); +err: return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2efd78a9719e..e62fb5216341 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -992,6 +992,7 @@ xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { + struct xfs_mount *mp = ip->i_mount; int error; /* @@ -1005,6 +1006,17 @@ xfs_prepare_shift( } /* + * Shift operations must stabilize the start block offset boundary along + * with the full range of the operation. 
If we don't, a COW writeback + * completion could race with an insert, front merge with the start + * extent (after split) during the shift and corrupt the file. Start + * with the block just prior to the start to stabilize the boundary. + */ + offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + if (offset) + offset -= (1 << mp->m_sb.sb_blocklog); + + /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3458a1264a3f..5be8973a452c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -27,6 +27,23 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +/* Is this log iovec plausibly large enough to contain the buffer log format? */ +bool +xfs_buf_log_check_iovec( + struct xfs_log_iovec *iovec) +{ + struct xfs_buf_log_format *blfp = iovec->i_addr; + char *bmp_end; + char *item_end; + + if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) + return false; + + item_end = (char *)iovec->i_addr + iovec->i_len; + bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; + return bmp_end <= item_end; +} + static inline int xfs_buf_log_format_size( struct xfs_buf_log_format *blfp) @@ -688,7 +705,7 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC int +STATIC void xfs_buf_item_get_format( struct xfs_buf_log_item *bip, int count) @@ -698,14 +715,11 @@ xfs_buf_item_get_format( if (count == 1) { bip->bli_formats = &bip->__bli_format; - return 0; + return; } bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 0); - if (!bip->bli_formats) - return -ENOMEM; - return 0; } STATIC void @@ -731,7 +745,6 @@ xfs_buf_item_init( struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; - int error; int i; /* @@ -760,19 +773,22 @@ xfs_buf_item_init( * Discontiguous buffer support follows the layout of the underlying * buffer. This makes the implementation as simple as possible. */ - error = xfs_buf_item_get_format(bip, bp->b_map_count); - ASSERT(error == 0); - if (error) { /* to stop gcc throwing set-but-unused warnings */ - kmem_cache_free(xfs_buf_item_zone, bip); - return error; - } - + xfs_buf_item_get_format(bip, bp->b_map_count); for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), XFS_BLF_CHUNK); map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) { + kmem_cache_free(xfs_buf_item_zone, bip); + xfs_err(mp, + "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", + map_size, + BBTOB(bp->b_maps[i].bm_len)); + return -EFSCORRUPTED; + } + bip->bli_formats[i].blf_type = XFS_LI_BUF; bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; @@ -805,6 +821,9 @@ xfs_buf_item_log_segment( uint end_bit; uint mask; + ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + /* * Convert byte offsets to bit numbers. 
*/ @@ -956,7 +975,7 @@ xfs_buf_item_relse( struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; if (list_empty(&bp->b_li_list)) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 4a054b11011a..30114b510332 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,6 +61,7 @@ void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, struct list_head *); +bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 2bff21ca9d78..9cfd3209f52b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -137,7 +137,7 @@ xfs_qm_adjust_dqtimers( (d->d_blk_hardlimit && (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(get_seconds() + + d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; @@ -160,7 +160,7 @@ xfs_qm_adjust_dqtimers( (d->d_ino_hardlimit && (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(get_seconds() + + d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; @@ -183,7 +183,7 @@ xfs_qm_adjust_dqtimers( (d->d_rtb_hardlimit && (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(get_seconds() + + d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c93250108952..b8a4a3f29b36 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -187,7 +187,12 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return -EAGAIN; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, is_sync_kiocb(iocb)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 401da197f012..1979a0055763 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1518,10 +1518,8 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) || @@ -1541,21 +1539,22 @@ xfs_itruncate_extents_flags( * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. + * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. 
*/ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + } - ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; @@ -1575,7 +1574,7 @@ xfs_itruncate_extents_flags( if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7b35d62ede9f..d42de92cb283 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -357,6 +357,7 @@ xfs_attrmulti_attr_get( { unsigned char *kbuf; int error = -EFAULT; + size_t namelen; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; @@ -364,7 +365,9 @@ xfs_attrmulti_attr_get( if (!kbuf) return -ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); + namelen = strlen(name); + error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len, + flags); if (error) goto out_kfree; @@ -386,6 +389,7 @@ xfs_attrmulti_attr_set( { unsigned char *kbuf; int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; @@ -396,7 +400,8 @@ xfs_attrmulti_attr_set( if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + namelen = strlen(name); + error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags); if (!error) xfs_forget_acl(inode, name, flags); kfree(kbuf); @@ -410,10 +415,12 @@ xfs_attrmulti_attr_remove( uint32_t flags) { int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - error = xfs_attr_remove(XFS_I(inode), name, flags); + namelen = strlen(name); + error = xfs_attr_remove(XFS_I(inode), name, namelen, flags); if (!error) xfs_forget_acl(inode, name, flags); return error; @@ -462,6 +469,13 @@ xfs_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4c4f09113d3..769581a79c58 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -107,7 +107,7 @@ xfs_ioctl32_bstime_copyin( xfs_bstime_t *bstime, compat_xfs_bstime_t __user *bstime32) { - compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ + old_time32_t sec32; /* tv_sec differs on 64 vs. 
32 */ if (get_user(sec32, &bstime32->tv_sec) || get_user(bstime->tv_nsec, &bstime32->tv_nsec)) @@ -450,6 +450,13 @@ xfs_compat_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, compat_ptr(ops[i].am_attrname), MAXNAMELEN); diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 8c7743cd490e..053de7d894cd 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -32,7 +32,7 @@ #endif typedef struct compat_xfs_bstime { - compat_time_t tv_sec; /* seconds */ + old_time32_t tv_sec; /* seconds */ __s32 tv_nsec; /* and nanoseconds */ } compat_xfs_bstime_t; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 28e2d1f37267..bb590a267a7f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -923,7 +923,7 @@ xfs_buffered_write_iomap_begin( xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_inode_need_cow(ip, &imap, &shared); + error = xfs_bmap_trim_cow(ip, &imap, &shared); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8afe69ca188b..81f2f93caec0 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -50,8 +50,10 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = xfs_attr_set(ip, xattr->name, + strlen(xattr->name), + xattr->value, xattr->value_len, + ATTR_SECURE); if (error < 0) break; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 99ec3fba4548..0d683fb96396 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1934,6 +1934,12 @@ xlog_recover_buffer_pass1( struct list_head *bucket; struct xfs_buf_cancel *bcp; + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + /* * If this isn't a cancel buffer item, then just return. */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fca65109cf24..56efe140c923 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -31,7 +31,7 @@ #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" - +#include "xfs_trace.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -360,66 +360,119 @@ release_buf: } /* - * Update alignment values based on mount options and sb values + * If the sunit/swidth change would move the precomputed root inode value, we + * must reject the ondisk change because repair will stumble over that. + * However, we allow the mount to proceed because we never rejected this + * combination before. Returns true to update the sb, false otherwise. + */ +static inline int +xfs_check_new_dalign( + struct xfs_mount *mp, + int new_dalign, + bool *update_sb) +{ + struct xfs_sb *sbp = &mp->m_sb; + xfs_ino_t calc_ino; + + calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); + trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); + + if (sbp->sb_rootino == calc_ino) { + *update_sb = true; + return 0; + } + + xfs_warn(mp, +"Cannot change stripe alignment; would require moving root inode."); + + /* + * XXX: Next time we add a new incompat feature, this should start + * returning -EINVAL to fail the mount. 
Until then, spit out a warning + * that we're ignoring the administrator's instructions. + */ + xfs_warn(mp, "Skipping superblock stripe alignment update."); + *update_sb = false; + return 0; +} + +/* + * If we were provided with new sunit/swidth values as mount options, make sure + * that they pass basic alignment and superblock feature checks, and convert + * them into the same units (FSB) that everything else expects. This step + * /must/ be done before computing the inode geometry. */ STATIC int -xfs_update_alignment(xfs_mount_t *mp) +xfs_validate_new_dalign( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); + if (mp->m_dalign == 0) + return 0; - if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + mp->m_sb.sb_blocksize); + return -EINVAL; + } else { /* - * If stripe unit and stripe width are not multiples - * of the fs blocksize turn off alignment. + * Convert the stripe unit and width to FSBs. */ - if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || - (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, - "alignment check failed: sunit/swidth vs. blocksize(%d)", - sbp->sb_blocksize); + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - sbp->sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, sbp->sb_blocksize); - return -EINVAL; - } - } - - /* - * Update superblock with new values - * and log changes - */ - if (xfs_sb_version_hasdalign(sbp)) { - if (sbp->sb_unit != mp->m_dalign) { - sbp->sb_unit = mp->m_dalign; - mp->m_update_sb = true; - } - if (sbp->sb_width != mp->m_swidth) { - sbp->sb_width = mp->m_swidth; - mp->m_update_sb = true; - } + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, - "cannot change alignment: superblock does not support data alignment"); + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } + } + + if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + xfs_warn(mp, +"cannot change alignment: superblock does not support data alignment"); + return -EINVAL; + } + + return 0; +} + +/* Update alignment values based on mount options and sb values. 
*/ +STATIC int +xfs_update_alignment( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + + if (mp->m_dalign) { + bool update_sb; + int error; + + if (sbp->sb_unit == mp->m_dalign && + sbp->sb_width == mp->m_swidth) + return 0; + + error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); + if (error || !update_sb) + return error; + + sbp->sb_unit = mp->m_dalign; + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { - mp->m_dalign = sbp->sb_unit; - mp->m_swidth = sbp->sb_width; + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; } return 0; @@ -648,12 +701,12 @@ xfs_mountfs( } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. + * If we were given new sunit/swidth options, do some basic validation + * checks and convert the incore dalign and swidth values to the + * same units (FSB) that everything else uses. This /must/ happen + * before computing the inode geometry. */ - error = xfs_update_alignment(mp); + error = xfs_validate_new_dalign(mp); if (error) goto out; @@ -664,6 +717,17 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + /* + * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks + * is NOT aligned turn off m_dalign since allocator alignment is within + * an ag, therefore ag has to be aligned at stripe boundary. Note that + * we must compute the free space and rmap btree geometry before doing + * this. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index b6701b4f59a9..5f04d8a5ab2a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -111,6 +111,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); /* log structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 7823af39008b..4e57edca8bce 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -64,9 +64,9 @@ struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct list_lru qi_lru; int qi_dquots; - time_t qi_btimelimit; /* limit for blks timer */ - time_t qi_itimelimit; /* limit for inodes timer */ - time_t qi_rtbtimelimit;/* limit for rt blks timer */ + time64_t qi_btimelimit; /* limit for blks timer */ + time64_t qi_itimelimit; /* limit for inodes timer */ + time64_t qi_rtbtimelimit;/* limit for rt blks timer */ xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index c7de17deeae6..38669e827206 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -37,9 +37,9 @@ xfs_qm_fill_state( tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = q->qi_btimelimit; - tstate->ino_timelimit = q->qi_itimelimit; - tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_timelimit = 
(u32)q->qi_btimelimit; + tstate->ino_timelimit = (u32)q->qi_itimelimit; + tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; tstate->spc_warnlimit = q->qi_bwarnlimit; tstate->ino_warnlimit = q->qi_iwarnlimit; tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index de451235c4ee..e723b267a247 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -223,8 +223,8 @@ xfs_reflink_trim_around_shared( } } -bool -xfs_inode_need_cow( +int +xfs_bmap_trim_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared) @@ -327,7 +327,7 @@ xfs_find_trim_cow_extent( if (cmap->br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, cmap->br_startoff - imap->br_startoff); - return xfs_inode_need_cow(ip, imap, shared); + return xfs_bmap_trim_cow(ip, imap, shared); } *shared = true; @@ -1457,7 +1457,8 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, + true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index d18ad7f4fb64..3e4fd46373ab 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -22,7 +22,7 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); -bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, +int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d9ae27ddf253..760901783944 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -193,32 +193,6 @@ xfs_fs_show_options( return 0; } -static uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - */ - -#if BITS_PER_LONG == 32 - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -#endif - - return (((uint64_t)pagefactor) << bitshift) - 1; -} - /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically @@ -1424,6 +1398,26 @@ xfs_fc_fill_super( if (error) goto out_free_sb; + /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. 
+ * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. + */ + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + error = xfs_filestream_mount(mp); if (error) goto out_free_sb; @@ -1435,7 +1429,7 @@ xfs_fc_fill_super( sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; sb->s_time_min = S32_MIN; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c13bb3655e48..e242988f57fb 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -218,8 +218,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(void *, leaf); - __field(int, pos); + __field(void *, leaf) + __field(int, pos) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -3573,6 +3573,27 @@ DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); DEFINE_KMEM_EVENT(kmem_zone_alloc); +TRACE_EVENT(xfs_check_new_dalign, + TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), + TP_ARGS(mp, new_dalign, calc_rootino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, new_dalign) + __field(xfs_ino_t, sb_rootino) + __field(xfs_ino_t, calc_rootino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->new_dalign = new_dalign; + __entry->sb_rootino = mp->m_sb.sb_rootino; + __entry->calc_rootino = calc_rootino; + ), + TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_dalign, __entry->sb_rootino, + __entry->calc_rootino) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index a6fe2d8dc40f..d1b9869bc5fa 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -580,7 +580,7 @@ xfs_trans_dqresv( { xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; - time_t timer; + time64_t timer; xfs_qwarncnt_t warns; xfs_qwarncnt_t warnlimit; xfs_qcnt_t total_count; @@ -635,7 +635,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTLONGWARN); @@ -662,7 +663,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTLONGWARN); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 383f0203d103..b0fedb543f97 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -24,6 +24,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error, asize = size; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (!size) { @@ -31,7 +32,8 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry 
*unused, value = NULL; } - error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); + error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value, + &asize, xflags); if (error) return error; return asize; @@ -67,6 +69,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (flags & XATTR_CREATE) @@ -74,10 +77,11 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, if (flags & XATTR_REPLACE) xflags |= ATTR_REPLACE; - if (!value) - return xfs_attr_remove(ip, (unsigned char *)name, xflags); - error = xfs_attr_set(ip, (unsigned char *)name, - (void *)value, size, xflags); + if (value) + error = xfs_attr_set(ip, name, namelen, (void *)value, size, + xflags); + else + error = xfs_attr_remove(ip, name, namelen, xflags); if (!error) xfs_forget_acl(inode, name, xflags); |
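
The xfs_rtalloc_log_count() hunk above sizes the per-extent reservation as one realtime summary block plus however many rtbitmap blocks it takes to mark MAXEXTLEN blocks' worth of realtime extents in use, and xfs_calc_write_reservation() / xfs_calc_itruncate_reservation() then take the worst case of the three legs (t1/t2/t3). What follows is a minimal userspace sketch of that arithmetic only; the mount geometry is made up, and MAXEXTLEN, NBBY and howmany() are simplified stand-ins for the kernel definitions rather than the real headers.

#include <stdio.h>

#define MAXEXTLEN	((1u << 21) - 1)	/* longest bmbt extent, in fs blocks */
#define NBBY		8			/* bits per byte */

/* Round-up division, like the kernel's howmany() helper. */
static unsigned int howmany(unsigned int x, unsigned int y)
{
	return (x + y - 1) / y;
}

/*
 * Worst-case number of realtime bitmap blocks dirtied by marking
 * MAXEXTLEN blocks' worth of rt extents in use, plus one rtsummary
 * block, times the number of extent operations in the transaction.
 */
static unsigned int rtalloc_log_count(unsigned int rextsize,
				      unsigned int blksz,
				      unsigned int num_ops)
{
	unsigned int rtbmp_bytes = (MAXEXTLEN / rextsize) / NBBY;

	return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
}

int main(void)
{
	unsigned int blksz = 4096;	/* hypothetical 4k fs block */
	unsigned int rextsize = 4;	/* hypothetical 4-block rt extent */

	printf("write path (1 op):     %u block-sized buffers\n",
	       rtalloc_log_count(rextsize, blksz, 1));
	printf("truncate path (2 ops): %u block-sized buffers\n",
	       rtalloc_log_count(rextsize, blksz, 2));
	return 0;
}

With this made-up geometry the write leg reserves 17 block-sized buffers and the truncate leg 34, which is why the reservations are computed as max3(t1, t2, t3) rather than by unconditionally adding the realtime terms on top of the data-device terms.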
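
Similarly, xfs_buf_log_check_iovec() above stops log recovery from trusting a buf log format item whose claimed dirty bitmap would run past the bytes actually recovered from the log, which matters now that XFS_BLF_DATAMAP_SIZE has grown by one word. Below is a standalone sketch of the same bounds check under simplified assumptions: the structs are cut-down stand-ins for xfs_log_iovec and xfs_buf_log_format, and the 17-word bitmap mirrors the value the new macro works out for a 64k maximum block size, but is illustrative only.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BLF_DATAMAP_SIZE	17	/* illustrative; matches the patched macro for 64k blocks */

/* Simplified stand-ins for struct xfs_log_iovec / struct xfs_buf_log_format. */
struct log_iovec {
	void		*i_addr;	/* start of the recovered item */
	unsigned int	i_len;		/* bytes actually recovered */
};

struct buf_log_format {
	unsigned short	blf_type;
	unsigned short	blf_size;
	unsigned short	blf_flags;
	unsigned short	blf_len;
	int64_t		blf_blkno;
	unsigned int	blf_map_size;	/* words of blf_data_map actually used */
	unsigned int	blf_data_map[BLF_DATAMAP_SIZE];
};

/*
 * Accept the item only if the fixed part of the header fits inside the
 * iovec and the bitmap the header claims to carry also ends inside it.
 */
static bool buf_log_check_iovec(const struct log_iovec *iovec)
{
	const struct buf_log_format *blfp = iovec->i_addr;
	const char *item_end, *bmp_end;

	if (offsetof(struct buf_log_format, blf_data_map) > iovec->i_len)
		return false;

	item_end = (const char *)iovec->i_addr + iovec->i_len;
	bmp_end = (const char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

int main(void)
{
	struct buf_log_format item = { .blf_map_size = BLF_DATAMAP_SIZE };
	/* Pretend recovery only pulled part of the item out of the log. */
	struct log_iovec truncated = {
		&item, offsetof(struct buf_log_format, blf_data_map) + 8
	};
	struct log_iovec complete = { &item, sizeof(item) };

	printf("truncated item accepted? %d\n", buf_log_check_iovec(&truncated));
	printf("complete item accepted?  %d\n", buf_log_check_iovec(&complete));
	return 0;
}

The truncated iovec is rejected before blf_data_map is ever dereferenced; the complete one passes, which is the behaviour xlog_recover_buffer_pass1() relies on before it inspects the recovered item.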