From 79124f18b335172e1916075c633745e12dae1dac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 17 Nov 2010 20:46:15 -0500 Subject: fs: add hole punching to fallocate Hole punching has already been implemented by XFS and OCFS2, and has the potential to be implemented on both BTRFS and EXT4 so we need a generic way to get to this feature. The simplest way in my mind is to add FALLOC_FL_PUNCH_HOLE to fallocate() since it already looks like the normal fallocate() operation. I've tested this patch with XFS and BTRFS to make sure XFS did what it's supposed to do and that BTRFS failed like it was supposed to. Thank you, Signed-off-by: Josef Bacik Signed-off-by: Al Viro --- fs/open.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/open.c') diff --git a/fs/open.c b/fs/open.c index 4197b9ed023d..5b6ef7e2859e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; if (!(file->f_mode & FMODE_WRITE)) -- cgit v1.2.3-58-ga151 From 2fe17c1075836b66678ed2a305fd09b6773883aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jan 2011 13:07:43 +0100 Subject: fallocate should be a file operation Currently all filesystems except XFS implement fallocate asynchronously, while XFS forced a commit. Both of these are suboptimal - in case of O_SYNC I/O we really want our allocation on disk, especially for the !KEEP_SIZE case where we actually grow the file with user-visible zeroes. On the other hand always commiting the transaction is a bad idea for fast-path uses of fallocate like for example in recent Samba versions. Given that block allocation is a data plane operation anyway change it from an inode operation to a file operation so that we have the file structure available that lets us check for O_SYNC. This also includes moving the code around for a few of the filesystems, and remove the already unnedded S_ISDIR checks given that we only wire up fallocate for regular files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 3 +- fs/btrfs/file.c | 113 +++++++++++++++++ fs/btrfs/inode.c | 111 ---------------- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 9 +- fs/ext4/file.c | 2 +- fs/gfs2/file.c | 258 ++++++++++++++++++++++++++++++++++++++ fs/gfs2/ops_inode.c | 258 -------------------------------------- fs/ocfs2/file.c | 8 +- fs/open.c | 4 +- fs/xfs/linux-2.6/xfs_file.c | 56 +++++++++ fs/xfs/linux-2.6/xfs_iops.c | 60 --------- include/linux/fs.h | 4 +- 13 files changed, 440 insertions(+), 448 deletions(-) (limited to 'fs/open.c') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 651d5237c155..4471a416c274 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -60,7 +60,6 @@ ata *); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: @@ -88,7 +87,6 @@ getxattr: no listxattr: no removexattr: yes truncate_range: yes -fallocate: no fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. @@ -437,6 +435,7 @@ prototypes: ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *, int, loff_t, loff_t); }; locking rules: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d85763b..a9e0a4eaf3d9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1237,6 +1238,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1360,7 @@ const struct file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 64daf2acd0d5..902afbf50811 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7098,116 +7098,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); @@ -7310,7 +7200,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1de65f572033..0c8d97b56f34 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2065,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4bdd160854eb..63a75810b7c3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3655,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bb003dc9ffff..2e8322c8aa88 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -210,6 +210,7 @@ const struct file_operations ext4_file_operations = { .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { @@ -223,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .check_acl = ext4_check_acl, - .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fca6689e12e6..7cfdcb913363 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int error; + + if (!page_has_buffers(page)) { + error = __block_write_begin(page, from, to - from, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops = { @@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = generic_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index c09528c07f3d..d8b26ac2e20b 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,8 +18,6 @@ #include #include #include -#include -#include #include #include "gfs2.h" @@ -1257,261 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } -static void empty_write_end(struct page *page, unsigned from, - unsigned to) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); - mark_page_accessed(page); - - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); -} - - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) -{ - unsigned start, end, next; - struct buffer_head *bh, *head; - int error; - - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, gfs2_block_map); - if (unlikely(error)) - return error; - - empty_write_end(page, from, to); - return 0; - } - - bh = head = page_buffers(page); - next = end = 0; - while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; - } - start = next; - do { - next += bh->b_size; - if (buffer_mapped(bh)) { - if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - end = 0; - } - start = next; - } - else - end = next; - bh = bh->b_this_page; - } while (next < to); - - if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - } - - return 0; -} - -static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, - int mode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *dibh; - int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(error)) - goto out; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (unlikely(error)) - goto out; - } - - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } - - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) - goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; - } - - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - - brelse(dibh); - -out: - return error; -} - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; - unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); - - for (tmp = max_data; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - max_data -= tmp; - } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; - *data_blocks = max_data; - *ind_blocks = max_blocks - max_data; - *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; - if (*len > max) { - *len = max; - gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); - } -} - -static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, - loff_t len) -{ - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; - struct gfs2_alloc *al; - int error; - loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; - next = (next + 1) << sdp->sd_sb.sb_bsize_shift; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - - offset = (offset >> sdp->sd_sb.sb_bsize_shift) << - sdp->sd_sb.sb_bsize_shift; - - len = next - offset; - bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; - if (!bytes) - bytes = UINT_MAX; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - if (!gfs2_write_alloc_required(ip, offset, len)) - goto out_unlock; - - while (len > 0) { - if (len < bytes) - bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - -retry: - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - goto retry; - } - goto out_qunlock; - } - max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - - rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = fallocate_chunk(inode, offset, max_bytes, mode); - gfs2_trans_end(sdp); - - if (error) - goto out_trans_fail; - - len -= max_bytes; - offset += max_bytes; - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - goto out_unlock; - -out_trans_fail: - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1562,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, - .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index cf254ce8c941..a6651956482e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1989,9 +1989,10 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); } -static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_space_resv sr; int change_size = 1; @@ -2002,9 +2003,6 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - if (mode & FALLOC_FL_KEEP_SIZE) change_size = 0; @@ -2612,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, - .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2644,6 +2641,7 @@ const struct file_operations ocfs2_fops = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops = { diff --git a/fs/open.c b/fs/open.c index 5b6ef7e2859e..e52389e1f05b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -255,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - if (!inode->i_op->fallocate) + if (!file->f_op->fallocate) return -EOPNOTSUPP; - return inode->i_op->fallocate(inode, mode, offset, len); + return file->f_op->fallocate(file, mode, offset, len); } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ef51eb43e137..a55c1b46b219 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_trace.h" #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -882,6 +883,60 @@ out_unlock: return ret; } +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + + STATIC int xfs_file_open( struct inode *inode, @@ -1000,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index a4ecc2188a09..bd5727852fd6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -505,64 +504,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); - if (error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -656,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 177b4ddea418..09b5bd6a7c6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1552,6 +1552,8 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *file, int mode, loff_t offset, + loff_t len); }; #define IPERM_FLAG_RCU 0x0001 @@ -1582,8 +1584,6 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, - loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); } ____cacheline_aligned; -- cgit v1.2.3-58-ga151 From 2dab597441667d6c04451a7dcf215241ad4c74f6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 11 Feb 2011 15:53:38 -0800 Subject: Fix possible filp_cachep memory corruption In commit 31e6b01f4183 ("fs: rcu-walk for path lookup") we started doing path lookup using RCU, which then falls back to a careful non-RCU lookup in case of problems (LOOKUP_REVAL). So do_filp_open() has this "re-do the lookup carefully" looping case. However, that means that we must not release the open-intent file data if we are going to loop around and use it once more! Fix this by moving the release of the open-intent data to the function that allocates it (do_filp_open() itself) rather than the helper functions that can get called multiple times (finish_open() and do_last()). This makes the logic for the lifetime of that field much more obvious, and avoids the possible double free. Reported-by: J. R. Okajima Acked-by: Al Viro Cc: Nick Piggin Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 20 ++++++++++---------- fs/open.c | 2 ++ 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs/open.c') diff --git a/fs/namei.c b/fs/namei.c index 7d77f24d32a9..ec4b2d0190a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -561,10 +561,14 @@ static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) */ void release_open_intent(struct nameidata *nd) { - if (nd->intent.open.file->f_path.dentry == NULL) - put_filp(nd->intent.open.file); - else - fput(nd->intent.open.file); + struct file *file = nd->intent.open.file; + + if (file && !IS_ERR(file)) { + if (file->f_path.dentry == NULL) + put_filp(file); + else + fput(file); + } } /* @@ -2265,8 +2269,6 @@ static struct file *finish_open(struct nameidata *nd, return filp; exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); path_put(&nd->path); return ERR_PTR(error); } @@ -2389,8 +2391,6 @@ exit_mutex_unlock: exit_dput: path_put_conditional(path, nd); exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); path_put(&nd->path); return ERR_PTR(error); } @@ -2477,6 +2477,7 @@ struct file *do_filp_open(int dfd, const char *pathname, } audit_inode(pathname, nd.path.dentry); filp = finish_open(&nd, open_flag, acc_mode); + release_open_intent(&nd); return filp; creat: @@ -2553,6 +2554,7 @@ out: path_put(&nd.root); if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) goto reval; + release_open_intent(&nd); return filp; exit_dput: @@ -2560,8 +2562,6 @@ exit_dput: out_path: path_put(&nd.path); out_filp: - if (!IS_ERR(nd.intent.open.file)) - release_open_intent(&nd); filp = ERR_PTR(error); goto out; } diff --git a/fs/open.c b/fs/open.c index e52389e1f05b..5a2c6ebc22b5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -790,6 +790,8 @@ struct file *nameidata_to_filp(struct nameidata *nd) /* Pick up the filp from the open intent */ filp = nd->intent.open.file; + nd->intent.open.file = NULL; + /* Has the filesystem initialised the file for us? */ if (filp->f_path.dentry == NULL) { path_get(&nd->path); -- cgit v1.2.3-58-ga151