From 3271d7eb00f1df82d9ea26b359ed065129639f7c Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Wed, 10 Nov 2021 10:37:13 +0800 Subject: f2fs: compress: reduce one page array alloc and free when write compressed page Don't alloc new page pointers array to replace old, just use old, introduce valid_nr_cpages to indicate valid number of page pointers in array, try to reduce one page array alloc and free when write compress page. Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 32 ++++++++++---------------------- fs/f2fs/data.c | 1 + fs/f2fs/f2fs.h | 1 + 3 files changed, 12 insertions(+), 22 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 49121a21f749..fb9e5149af5d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -154,6 +154,7 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; + cc->valid_nr_cpages = 0; if (!reuse) cc->cluster_idx = NULL_CLUSTER; } @@ -620,7 +621,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; - struct page **new_cpages; u32 chksum = 0; int i, ret; @@ -635,6 +635,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { @@ -685,13 +686,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); - /* Now we're going to cut unnecessary tail pages */ - new_cpages = page_array_alloc(cc->inode, new_nr_cpages); - if (!new_cpages) { - ret = -ENOMEM; - goto out_vunmap_cbuf; - } - /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, 
(new_nr_cpages * PAGE_SIZE) - @@ -701,10 +695,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = 0; i < cc->nr_cpages; i++) { - if (i < new_nr_cpages) { - new_cpages[i] = cc->cpages[i]; + if (i < new_nr_cpages) continue; - } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -712,9 +704,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); - cc->cpages = new_cpages; - cc->nr_cpages = new_nr_cpages; + cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -1308,14 +1298,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - atomic_set(&cic->pending_pages, cc->nr_cpages); + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; @@ -1360,7 +1350,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; - if (i > cc->nr_cpages) { + if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -1385,8 +1375,8 @@ unlock_continue: if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); - f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); - add_compr_block_stat(inode, cc->nr_cpages); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + add_compr_block_stat(inode, cc->valid_nr_cpages); 
set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) @@ -1424,9 +1414,7 @@ out_unlock_op: else f2fs_unlock_op(sbi); out_free: - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9f754aaef558..d8190e836a96 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2987,6 +2987,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, + .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ce9fc9f13000..ff37cdd7a6b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1486,6 +1486,7 @@ struct compress_ctx { unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ -- cgit v1.2.3-58-ga151 From 3d697a4a6b7dab8fb8a8c928b640999af3a08d87 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:13 -0500 Subject: f2fs: rework write preallocations f2fs_write_begin() assumes that all blocks were preallocated by default unless FI_NO_PREALLOC is explicitly set. This invites data corruption, as there are cases in which not all blocks are preallocated. Commit 47501f87c61a ("f2fs: preallocate DIO blocks when forcing buffered_io") fixed one case, but there are others remaining. Fix up this logic by replacing this flag with FI_PREALLOCATED_ALL, which only gets set if all blocks for the current write were preallocated. 
Also clean up f2fs_preallocate_blocks(), move it to file.c, and make it handle some of the logic that was previously in write_iter() directly. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 ++---------------------- fs/f2fs/f2fs.h | 3 +- fs/f2fs/file.c | 131 ++++++++++++++++++++++++++++++++++++--------------------- 3 files changed, 88 insertions(+), 101 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d8190e836a96..3db0f3049b90 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1384,53 +1384,6 @@ alloc: return 0; } -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct f2fs_map_blocks map; - int flag; - int err = 0; - bool direct_io = iocb->ki_flags & IOCB_DIRECT; - - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); - if (map.m_len > map.m_lblk) - map.m_len -= map.m_lblk; - else - map.m_len = 0; - - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = true; - - if (direct_io) { - map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, iocb, from) ? 
- F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO; - goto map_blocks; - } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - if (f2fs_has_inline_data(inode)) - return err; - - flag = F2FS_GET_BLOCK_PRE_AIO; - -map_blocks: - err = f2fs_map_blocks(inode, &map, 1, flag); - if (map.m_len > 0 && err == -ENOSPC) { - if (!direct_io) - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } - return err; -} - void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { @@ -3340,12 +3293,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, int flag; /* - * we already allocated all the blocks, so we don't need to get - * the block addresses when there is no need to fill the page. + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC) && - !f2fs_verity_in_progress(inode)) + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff37cdd7a6b7..6f196621f772 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -715,7 +715,7 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ @@ -3615,7 +3615,6 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct 
dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 92ec2699bc85..fc87d0f5b82b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4235,10 +4235,77 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; } +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. + */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + bool dio = (iocb->ki_flags & IOCB_DIRECT) && + !f2fs_force_buffered_io(inode, iocb, iter); + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. */ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (fault_in_iov_iter_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. 
*/ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + /* Do not preallocate blocks that will be written partially in 4KB. */ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, 1, flag); + /* -ENOSPC is only a fatal error if no blocks could be allocated. */ + if (ret < 0 && !(ret == -ENOSPC && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + loff_t target_size; + int preallocated; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -4262,84 +4329,54 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (unlikely(IS_IMMUTABLE(inode))) { ret = -EPERM; - goto unlock; + goto out_unlock; } if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EPERM; - goto unlock; + goto out_unlock; } ret = generic_write_checks(iocb, from); if (ret > 0) { - bool preallocated = false; - size_t target_size = 0; - int err; - - if (fault_in_iov_iter_readable(from, iov_iter_count(from))) - set_inode_flag(inode, FI_NO_PREALLOC); - - if ((iocb->ki_flags & IOCB_NOWAIT)) { + if (iocb->ki_flags & IOCB_NOWAIT) { if (!f2fs_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)) || f2fs_has_inline_data(inode) || f2fs_force_buffered_io(inode, iocb, from)) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); ret = -EAGAIN; - goto out; + goto out_unlock; } - goto write; } 
- - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - goto write; - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * Convert inline data for Direct I/O before entering - * f2fs_direct_IO(). - */ - err = f2fs_convert_inline_inode(inode); - if (err) - goto out_err; - /* - * If force_buffere_io() is true, we have to allocate - * blocks all the time, since f2fs_direct_IO will fall - * back to buffered IO. - */ - if (!f2fs_force_buffered_io(inode, iocb, from) && - f2fs_lfs_mode(F2FS_I_SB(inode))) - goto write; + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out_unlock; } - preallocated = true; + /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); - - err = f2fs_preallocate_blocks(iocb, from); - if (err) { -out_err: - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; + preallocated = f2fs_preallocate_blocks(iocb, from); + if (preallocated < 0) { + ret = preallocated; + goto out_unlock; } -write: + ret = __generic_file_write_iter(iocb, from); - clear_inode_flag(inode, FI_NO_PREALLOC); - /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) { + /* Don't leave any preallocated blocks around past i_size. 
*/ + if (preallocated > 0 && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); f2fs_truncate(inode); filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } + clear_inode_flag(inode, FI_PREALLOCATED_ALL); if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } -unlock: +out_unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, -- cgit v1.2.3-58-ga151 From b31bf0f96e71a2c81d2122c3fecdb91f8e215c20 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:14 -0500 Subject: f2fs: reduce indentation in f2fs_file_write_iter() Replace 'if (ret > 0)' with 'if (ret <= 0) goto out_unlock;'. No change in behavior. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 64 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fc87d0f5b82b..808a7c24d993 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4338,44 +4338,48 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = generic_write_checks(iocb, from); - if (ret > 0) { - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!f2fs_overwrite_io(inode, iocb->ki_pos, - iov_iter_count(from)) || + if (ret <= 0) + goto out_unlock; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!f2fs_overwrite_io(inode, iocb->ki_pos, + iov_iter_count(from)) || f2fs_has_inline_data(inode) || f2fs_force_buffered_io(inode, iocb, from)) { - ret = -EAGAIN; - goto out_unlock; - } - } - if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out_unlock; - } - /* Possibly preallocate the blocks for the write. 
*/ - target_size = iocb->ki_pos + iov_iter_count(from); - preallocated = f2fs_preallocate_blocks(iocb, from); - if (preallocated < 0) { - ret = preallocated; + ret = -EAGAIN; goto out_unlock; } + } - ret = __generic_file_write_iter(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out_unlock; + } + /* Possibly preallocate the blocks for the write. */ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from); + if (preallocated < 0) { + ret = preallocated; + goto out_unlock; + } - /* Don't leave any preallocated blocks around past i_size. */ - if (preallocated > 0 && i_size_read(inode) < target_size) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - filemap_invalidate_lock(inode->i_mapping); - f2fs_truncate(inode); - filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - } - clear_inode_flag(inode, FI_PREALLOCATED_ALL); + ret = __generic_file_write_iter(iocb, from); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + /* Don't leave any preallocated blocks around past i_size. */ + if (preallocated > 0 && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + f2fs_truncate(inode); + filemap_invalidate_unlock(inode->i_mapping); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + out_unlock: inode_unlock(inode); out: -- cgit v1.2.3-58-ga151 From d4dd19ec1ea0cf6532d65709325c42b1398614a8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Nov 2021 14:31:16 -0800 Subject: f2fs: do not expose unwritten blocks to user by DIO DIO preallocates physical blocks before writing data, but if an error occurs or power-cut happens, we can see block contents from the disk. 
This patch tries to fix it by 1) turning to buffered writes for DIO into holes, 2) truncating unwritten blocks from error or power-cut. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 ++++- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/file.c | 27 ++++++++++++++++++--------- fs/f2fs/inode.c | 8 ++++++++ 4 files changed, 35 insertions(+), 10 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3db0f3049b90..9c867de1ec29 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1543,8 +1543,11 @@ next_block: flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + } } if (err) goto sync_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f196621f772..d7435fcb9658 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -654,6 +654,7 @@ enum { #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) @@ -681,6 +682,10 @@ enum { #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + #define DEF_DIR_LEVEL 0 enum { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 808a7c24d993..e1445cf915ea 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1687,6 +1687,7 @@ next_alloc: map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + file_dont_truncate(inode); up_write(&sbi->pin_sem); @@ -4257,6 +4258,13 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) /* If it will be an out-of-place direct write, 
don't bother. */ if (dio && f2fs_lfs_mode(sbi)) return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; /* No-wait I/O can't allocate blocks. */ if (iocb->ki_flags & IOCB_NOWAIT) @@ -4292,8 +4300,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) } ret = f2fs_map_blocks(inode, &map, 1, flag); - /* -ENOSPC is only a fatal error if no blocks could be allocated. */ - if (ret < 0 && !(ret == -ENOSPC && map.m_len > 0)) + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) return ret; if (ret == 0) set_inode_flag(inode, FI_PREALLOCATED_ALL); @@ -4359,20 +4367,21 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from); - if (preallocated < 0) { + if (preallocated < 0) ret = preallocated; - goto out_unlock; - } - - ret = __generic_file_write_iter(iocb, from); + else + ret = __generic_file_write_iter(iocb, from); /* Don't leave any preallocated blocks around past i_size. 
*/ - if (preallocated > 0 && i_size_read(inode) < target_size) { + if (preallocated && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); - f2fs_truncate(inode); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); } clear_inode_flag(inode, FI_PREALLOCATED_ALL); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0f8b2df3e1e0..6998eb1d6bdb 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -544,6 +544,14 @@ make_now: goto bad_inode; } f2fs_set_inode_flags(inode); + + if (file_should_truncate(inode)) { + ret = f2fs_truncate(inode); + if (ret) + goto bad_inode; + file_dont_truncate(inode); + } + unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; -- cgit v1.2.3-58-ga151 From ccf7cf92373d1a53166582013430b3b9c05a6ba2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:15 -0500 Subject: f2fs: fix the f2fs_file_write_iter tracepoint Pass in the original position and count rather than the position and count that were updated by the write. Also use the correct types for all arguments, in particular the file offset which was being truncated to 32 bits on 32-bit platforms. 
Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++-- include/trace/events/f2fs.h | 12 ++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e1445cf915ea..048db4852b28 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4312,6 +4312,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); loff_t target_size; int preallocated; ssize_t ret; @@ -4392,8 +4394,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) out_unlock: inode_unlock(inode); out: - trace_f2fs_file_write_iter(inode, iocb->ki_pos, - iov_iter_count(from), ret); + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index f8cb916f3595..dcb94d740e12 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -540,17 +540,17 @@ TRACE_EVENT(f2fs_truncate_partial_nodes, TRACE_EVENT(f2fs_file_write_iter, - TP_PROTO(struct inode *inode, unsigned long offset, - unsigned long length, int ret), + TP_PROTO(struct inode *inode, loff_t offset, size_t length, + ssize_t ret), TP_ARGS(inode, offset, length, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(unsigned long, offset) - __field(unsigned long, length) - __field(int, ret) + __field(loff_t, offset) + __field(size_t, length) + __field(ssize_t, ret) ), TP_fast_assign( @@ -562,7 +562,7 @@ TRACE_EVENT(f2fs_file_write_iter, ), TP_printk("dev = (%d,%d), ino = %lu, " - "offset = %lu, length = %lu, written(err) = %d", + "offset = %lld, length = %zu, written(err) = %zd", show_dev_ino(__entry), __entry->offset, __entry->length, -- cgit 
v1.2.3-58-ga151 From 1517c1a7a4456f080fabc4ac9853930e4b880d14 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 23 Jul 2021 00:59:20 -0700 Subject: f2fs: implement iomap operations Implement 'struct iomap_ops' for f2fs, in preparation for making f2fs use iomap for direct I/O. Note that this may be used for other things besides direct I/O in the future; however, for now I've only tested it for direct I/O. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 1 + fs/f2fs/data.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 1 + 3 files changed, 58 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 7eea3cfd894d..f46a7339d6cf 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,7 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select FS_IOMAP select LZ4_COMPRESS if F2FS_FS_LZ4 select LZ4_DECOMPRESS if F2FS_FS_LZ4 select LZ4HC_COMPRESS if F2FS_FS_LZ4HC diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9c867de1ec29..57e6a6f0daf9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -4237,3 +4238,58 @@ void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } + +static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct f2fs_map_blocks map = {}; + pgoff_t next_pgofs = 0; + int err; + + map.m_lblk = bytes_to_blks(inode, offset); + map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + if (flags & IOMAP_WRITE) + map.m_may_create = true; + + err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, + F2FS_GET_BLOCK_DIO); + if (err) + return err; + + iomap->offset = 
blks_to_bytes(inode, map.m_lblk); + + if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { + iomap->length = blks_to_bytes(inode, map.m_len); + if (map.m_flags & F2FS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + } else { + iomap->type = IOMAP_UNWRITTEN; + } + if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk))) + return -EINVAL; + + iomap->bdev = map.m_bdev; + iomap->addr = blks_to_bytes(inode, map.m_pblk); + } else { + iomap->length = blks_to_bytes(inode, next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } + + if (map.m_flags & F2FS_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + if ((inode->i_state & I_DIRTY_DATASYNC) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + return 0; +} + +const struct iomap_ops f2fs_iomap_ops = { + .iomap_begin = f2fs_iomap_begin, +}; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d7435fcb9658..8242f47304a5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3655,6 +3655,7 @@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +extern const struct iomap_ops f2fs_iomap_ops; /* * gc.c -- cgit v1.2.3-58-ga151 From a1e09b03e6f5c1d713c88259909137c0fd264ede Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 23 Jul 2021 00:59:21 -0700 Subject: f2fs: use iomap for direct I/O Make f2fs_file_read_iter() and f2fs_file_write_iter() use the iomap direct I/O implementation instead of the fs/direct-io.c one. The iomap implementation is more efficient, and it also avoids the need to add new features and optimizations to the old implementation. This new implementation also eliminates the need for f2fs to hook bio submission and completion and to allocate memory per-bio. 
This is because it's possible to correctly update f2fs's in-flight DIO counters using __iomap_dio_rw() in combination with an implementation of iomap_dio_ops::end_io() (as suggested by Christoph Hellwig). When possible, this new implementation preserves existing f2fs behavior such as the conditions for falling back to buffered I/O. This patch has been tested with xfstests by running 'gce-xfstests -c f2fs -g auto -X generic/017' with and without this patch; no regressions were seen. (Some tests fail both before and after. generic/017 hangs both before and after, so it had to be excluded.) Signed-off-by: Eric Biggers [Jaegeuk Kim: use spin_lock_bh for f2fs_update_iostat in softirq] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 205 +-------------------------------- fs/f2fs/f2fs.h | 8 +- fs/f2fs/file.c | 342 ++++++++++++++++++++++++++++++++++++++++++++++++------- fs/f2fs/iostat.c | 40 +++---- 4 files changed, 322 insertions(+), 273 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 57e6a6f0daf9..a9652a8e669b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1377,11 +1377,6 @@ alloc: f2fs_invalidate_compress_page(sbi, old_blkaddr); } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); - - /* - * i_size will be updated by direct_IO. Otherwise, we'll get stale - * data from unwritten block via dio_read. 
- */ return 0; } @@ -1743,50 +1738,6 @@ static inline u64 blks_to_bytes(struct inode *inode, u64 blks) return (blks << inode->i_blkbits); } -static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs, int seg_type, bool may_write) -{ - struct f2fs_map_blocks map; - int err; - - map.m_lblk = iblock; - map.m_len = bytes_to_blks(inode, bh->b_size); - map.m_next_pgofs = next_pgofs; - map.m_next_extent = NULL; - map.m_seg_type = seg_type; - map.m_may_create = may_write; - - err = f2fs_map_blocks(inode, &map, create, flag); - if (!err) { - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = blks_to_bytes(inode, map.m_len); - - if (map.m_multidev_dio) - bh->b_bdev = map.m_bdev; - } - return err; -} - -static int get_data_block_dio_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - true); -} - -static int get_data_block_dio(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - false); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -3263,7 +3214,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct inode *inode, loff_t to) +void f2fs_write_failed(struct inode *inode, loff_t to) { loff_t i_size = i_size_read(inode); @@ -3551,158 +3502,6 @@ unlock_out: return copied; } -static int check_direct_IO(struct inode *inode, struct iov_iter *iter, - loff_t offset) -{ - unsigned i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned blkbits = i_blkbits; - unsigned 
blocksize_mask = (1 << blkbits) - 1; - unsigned long align = offset | iov_iter_alignment(iter); - struct block_device *bdev = inode->i_sb->s_bdev; - - if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode)) - return 1; - - if (align & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (align & blocksize_mask) - return -EINVAL; - return 1; - } - return 0; -} - -static void f2fs_dio_end_io(struct bio *bio) -{ - struct f2fs_private_dio *dio = bio->bi_private; - - dec_page_count(F2FS_I_SB(dio->inode), - dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ); - - bio->bi_private = dio->orig_private; - bio->bi_end_io = dio->orig_end_io; - - kfree(dio); - - bio_endio(bio); -} - -static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, - loff_t file_offset) -{ - struct f2fs_private_dio *dio; - bool write = (bio_op(bio) == REQ_OP_WRITE); - - dio = f2fs_kzalloc(F2FS_I_SB(inode), - sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) - goto out; - - dio->inode = inode; - dio->orig_end_io = bio->bi_end_io; - dio->orig_private = bio->bi_private; - dio->write = write; - - bio->bi_end_io = f2fs_dio_end_io; - bio->bi_private = dio; - - inc_page_count(F2FS_I_SB(inode), - write ? F2FS_DIO_WRITE : F2FS_DIO_READ); - - submit_bio(bio); - return; -out: - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); -} - -static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - int rw = iov_iter_rw(iter); - int err; - enum rw_hint hint = iocb->ki_hint; - int whint_mode = F2FS_OPTION(sbi).whint_mode; - bool do_opu; - - err = check_direct_IO(inode, iter, offset); - if (err) - return err < 0 ? 
err : 0; - - if (f2fs_force_buffered_io(inode, iocb, iter)) - return 0; - - do_opu = rw == WRITE && f2fs_lfs_mode(sbi); - - trace_f2fs_direct_IO_enter(inode, offset, count, rw); - - if (rw == WRITE && whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { - up_read(&fi->i_gc_rwsem[rw]); - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - } else { - down_read(&fi->i_gc_rwsem[rw]); - if (do_opu) - down_read(&fi->i_gc_rwsem[READ]); - } - - err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, rw == WRITE ? get_data_block_dio_write : - get_data_block_dio, NULL, f2fs_dio_submit_bio, - rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES : - DIO_SKIP_HOLES); - - if (do_opu) - up_read(&fi->i_gc_rwsem[READ]); - - up_read(&fi->i_gc_rwsem[rw]); - - if (rw == WRITE) { - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; - if (err > 0) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - err); - if (!do_opu) - set_inode_flag(inode, FI_UPDATE_WRITE); - } else if (err == -EIOCBQUEUED) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - count - iov_iter_count(iter)); - } else if (err < 0) { - f2fs_write_failed(inode, offset + count); - } - } else { - if (err > 0) - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err); - else if (err == -EIOCBQUEUED) - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO, - count - iov_iter_count(iter)); - } - -out: - trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); - - return err; -} - void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { @@ -4158,7 +3957,7 @@ const struct address_space_operations f2fs_dblock_aops = { .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, - .direct_IO = 
f2fs_direct_IO, + .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8242f47304a5..ac6dda6c4c5a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1807,13 +1807,6 @@ struct f2fs_sb_info { #endif }; -struct f2fs_private_dio { - struct inode *inode; - void *orig_private; - bio_end_io_t *orig_end_io; - bool write; -}; - #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(sbi, type) \ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ @@ -3642,6 +3635,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance); +void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 048db4852b28..7516d97d5016 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -4219,23 +4220,145 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return __f2fs_ioctl(filp, cmd, arg); } -static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +/* + * Return %true if the given read or write request should use direct I/O, or + * %false if it should use buffered I/O. + */ +static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, + struct iov_iter *iter) +{ + unsigned int align; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return false; + + if (f2fs_force_buffered_io(inode, iocb, iter)) + return false; + + /* + * Direct I/O not aligned to the disk's logical_block_size will be + * attempted, but will fail with -EINVAL. 
+ * + * f2fs additionally requires that direct I/O be aligned to the + * filesystem block size, which is often a stricter requirement. + * However, f2fs traditionally falls back to buffered I/O on requests + * that are logical_block_size-aligned but not fs-block aligned. + * + * The below logic implements this behavior. + */ + align = iocb->ki_pos | iov_iter_alignment(iter); + if (!IS_ALIGNED(align, i_blocksize(inode)) && + IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) + return false; + + return true; +} + +static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_READ); + if (error) + return error; + f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { + .end_io = f2fs_dio_read_end_io, +}; + +static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - int ret; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(to); + struct iomap_dio *dio; + ssize_t ret; + + if (count == 0) + return 0; /* skip atime update */ + + trace_f2fs_direct_IO_enter(inode, pos, count, READ); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!down_read_trylock(&fi->i_gc_rwsem[READ])) { + ret = -EAGAIN; + goto out; + } + } else { + down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_READ counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_READ); + dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, + &f2fs_iomap_dio_read_ops, 0, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_READ); + } else { + ret = iomap_dio_complete(dio); + } + + up_read(&fi->i_gc_rwsem[READ]); + + file_accessed(file); +out: + trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret); + return ret; +} + +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - ret = generic_file_read_iter(iocb, iter); + if (f2fs_should_use_dio(inode, iocb, to)) + return f2fs_dio_read_iter(iocb, to); + ret = filemap_read(iocb, to, 0); if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret); - + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); return ret; } +static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t count; + int err; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return -EPERM; + + count = generic_write_checks(iocb, from); + if (count <= 0) + return count; + + err = file_modified(file); + if (err) + return err; + return count; +} + /* * Preallocate blocks for a write request, if it is possible and helpful to do * so. Returns a positive number if blocks may have been preallocated, 0 if no @@ -4243,15 +4366,14 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the * requested blocks (not just some of them) have been allocated. 
*/ -static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, + bool dio) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); const loff_t pos = iocb->ki_pos; const size_t count = iov_iter_count(iter); struct f2fs_map_blocks map = {}; - bool dio = (iocb->ki_flags & IOCB_DIRECT) && - !f2fs_force_buffered_io(inode, iocb, iter); int flag; int ret; @@ -4308,13 +4430,174 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) return map.m_len; } -static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + + if (ret > 0) { + iocb->ki_pos += ret; + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); + } + return ret; +} + +static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_WRITE); + if (error) + return error; + f2fs_update_iostat(sbi, APP_DIRECT_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { + .end_io = f2fs_dio_write_end_io, +}; + +static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, + bool *may_need_sync) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const bool do_opu = f2fs_lfs_mode(sbi); + const int whint_mode = F2FS_OPTION(sbi).whint_mode; + const 
loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); + const enum rw_hint hint = iocb->ki_hint; + unsigned int dio_flags; + struct iomap_dio *dio; + ssize_t ret; + + trace_f2fs_direct_IO_enter(inode, pos, count, WRITE); + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* f2fs_convert_inline_inode() and block allocation can block */ + if (f2fs_has_inline_data(inode) || + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EAGAIN; + goto out; + } + + if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + ret = -EAGAIN; + goto out; + } + if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { + up_read(&fi->i_gc_rwsem[WRITE]); + ret = -EAGAIN; + goto out; + } + } else { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + down_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu) + down_read(&fi->i_gc_rwsem[READ]); + } + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_WRITE counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_WRITE); + dio_flags = 0; + if (pos + count > inode->i_size) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, + &f2fs_iomap_dio_write_ops, dio_flags, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_WRITE); + } else { + ret = iomap_dio_complete(dio); + } + + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; + if (do_opu) + up_read(&fi->i_gc_rwsem[READ]); + up_read(&fi->i_gc_rwsem[WRITE]); + + if (ret < 0) + goto out; + if (pos + ret > inode->i_size) + f2fs_i_size_write(inode, pos + ret); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); + + if (iov_iter_count(from)) { + ssize_t ret2; + loff_t bufio_start_pos = iocb->ki_pos; + + /* + * The direct write was partial, so we need to fall back to a + * buffered write for the remainder. + */ + + ret2 = f2fs_buffered_write_iter(iocb, from); + if (iov_iter_count(from)) + f2fs_write_failed(inode, iocb->ki_pos); + if (ret2 < 0) + goto out; + + /* + * Ensure that the pagecache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + if (ret2 > 0) { + loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; + + ret += ret2; + + ret2 = filemap_write_and_wait_range(file->f_mapping, + bufio_start_pos, + bufio_end_pos); + if (ret2 < 0) + goto out; + invalidate_mapping_pages(file->f_mapping, + bufio_start_pos >> PAGE_SHIFT, + bufio_end_pos >> PAGE_SHIFT); + } + } else { + /* iomap_dio_rw() already handled the generic_write_sync(). 
*/ + *may_need_sync = false; + } +out: + trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret); + return ret; +} + +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); const loff_t orig_pos = iocb->ki_pos; const size_t orig_count = iov_iter_count(from); loff_t target_size; + bool dio; + bool may_need_sync = true; int preallocated; ssize_t ret; @@ -4337,42 +4620,23 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } - if (unlikely(IS_IMMUTABLE(inode))) { - ret = -EPERM; - goto out_unlock; - } - - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { - ret = -EPERM; - goto out_unlock; - } - - ret = generic_write_checks(iocb, from); + ret = f2fs_write_checks(iocb, from); if (ret <= 0) goto out_unlock; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!f2fs_overwrite_io(inode, iocb->ki_pos, - iov_iter_count(from)) || - f2fs_has_inline_data(inode) || - f2fs_force_buffered_io(inode, iocb, from)) { - ret = -EAGAIN; - goto out_unlock; - } - } + /* Determine whether we will do a direct write or a buffered write. */ + dio = f2fs_should_use_dio(inode, iocb, from); - if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out_unlock; - } /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); - preallocated = f2fs_preallocate_blocks(iocb, from); + preallocated = f2fs_preallocate_blocks(iocb, from, dio); if (preallocated < 0) ret = preallocated; else - ret = __generic_file_write_iter(iocb, from); + /* Do the actual write. */ + ret = dio ? + f2fs_dio_write_iter(iocb, from, &may_need_sync): + f2fs_buffered_write_iter(iocb, from); /* Don't leave any preallocated blocks around past i_size. 
*/ if (preallocated && i_size_read(inode) < target_size) { @@ -4387,15 +4651,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } clear_inode_flag(inode, FI_PREALLOCATED_ALL); - - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); - out_unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); - if (ret > 0) + if (ret > 0 && may_need_sync) ret = generic_write_sync(iocb, ret); return ret; } diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index cdcf54ae0db8..be599f31d3c4 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -92,7 +92,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; - spin_lock_irq(&sbi->iostat_lat_lock); + spin_lock_bh(&sbi->iostat_lat_lock); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +106,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) io_lat->bio_cnt[idx][io] = 0; } } - spin_unlock_irq(&sbi->iostat_lat_lock); + spin_unlock_bh(&sbi->iostat_lat_lock); trace_f2fs_iostat_latency(sbi, iostat_lat); } @@ -120,9 +120,9 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) return; /* Need double check under the lock */ - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); return; } sbi->iostat_next_period = jiffies + @@ -133,7 +133,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i]; sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); trace_f2fs_iostat(sbi, iostat_diff); @@ -145,16 +145,16 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) struct iostat_lat_info 
*io_lat = sbi->iostat_io_lat; int i; - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { sbi->rw_iostat[i] = 0; sbi->prev_rw_iostat[i] = 0; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); - spin_lock_irq(&sbi->iostat_lat_lock); + spin_lock_bh(&sbi->iostat_lat_lock); memset(io_lat, 0, sizeof(struct iostat_lat_info)); - spin_unlock_irq(&sbi->iostat_lat_lock); + spin_unlock_bh(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, @@ -163,19 +163,16 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (!sbi->iostat_enable) return; - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); sbi->rw_iostat[type] += io_bytes; - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_BUFFERED_IO] = - sbi->rw_iostat[APP_WRITE_IO] - - sbi->rw_iostat[APP_DIRECT_IO]; + if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_WRITE_IO] += io_bytes; - if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_BUFFERED_READ_IO] = - sbi->rw_iostat[APP_READ_IO] - - sbi->rw_iostat[APP_DIRECT_READ_IO]; - spin_unlock(&sbi->iostat_lock); + if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_READ_IO] += io_bytes; + + spin_unlock_bh(&sbi->iostat_lock); f2fs_record_iostat(sbi); } @@ -185,7 +182,6 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, { unsigned long ts_diff; unsigned int iotype = iostat_ctx->type; - unsigned long flags; struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int idx; @@ -206,12 +202,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, idx = WRITE_ASYNC_IO; } - spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + spin_lock_bh(&sbi->iostat_lat_lock); io_lat->sum_lat[idx][iotype] += ts_diff; io_lat->bio_cnt[idx][iotype]++; if (ts_diff > io_lat->peak_lat[idx][iotype]) 
io_lat->peak_lat[idx][iotype] = ts_diff; - spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); + spin_unlock_bh(&sbi->iostat_lat_lock); } void iostat_update_and_unbind_ctx(struct bio *bio, int rw) -- cgit v1.2.3-58-ga151 From bd984c03097b8e9b7500cba7378040ac1c697dbb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Nov 2021 11:20:33 -0800 Subject: f2fs: show more DIO information in tracepoint This prints more information of DIO in tracepoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- include/trace/events/f2fs.h | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7516d97d5016..0802a10a651f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4284,7 +4284,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) if (count == 0) return 0; /* skip atime update */ - trace_f2fs_direct_IO_enter(inode, pos, count, READ); + trace_f2fs_direct_IO_enter(inode, iocb, count, READ); if (iocb->ki_flags & IOCB_NOWAIT) { if (!down_read_trylock(&fi->i_gc_rwsem[READ])) { @@ -4483,7 +4483,7 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, struct iomap_dio *dio; ssize_t ret; - trace_f2fs_direct_IO_enter(inode, pos, count, WRITE); + trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE); if (iocb->ki_flags & IOCB_NOWAIT) { /* f2fs_convert_inline_inode() and block allocation can block */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index dcb94d740e12..f701bb23f83c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -936,14 +936,14 @@ TRACE_EVENT(f2fs_fallocate, TRACE_EVENT(f2fs_direct_IO_enter, - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), + TP_PROTO(struct inode *inode, struct kiocb *iocb, long len, int rw), - TP_ARGS(inode, offset, len, rw), + TP_ARGS(inode, iocb, len, rw), TP_STRUCT__entry( __field(dev_t, dev) 
__field(ino_t, ino) - __field(loff_t, pos) + __field(struct kiocb *, iocb) __field(unsigned long, len) __field(int, rw) ), @@ -951,15 +951,18 @@ TRACE_EVENT(f2fs_direct_IO_enter, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pos = offset; + __entry->iocb = iocb; __entry->len = len; __entry->rw = rw; ), - TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu rw = %d", + TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_hint = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), - __entry->pos, + __entry->iocb->ki_pos, __entry->len, + __entry->iocb->ki_flags, + __entry->iocb->ki_hint, + __entry->iocb->ki_ioprio, __entry->rw) ); -- cgit v1.2.3-58-ga151 From d1917865a7906baf6b687e15e8e6195a295a3992 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 26 Nov 2021 18:19:19 +0800 Subject: f2fs: fix remove page failed in invalidate compress pages Since the compress inode is not a regular file, generic_error_remove_page() in f2fs_invalidate_compress_pages() will always fail. Set the compress inode as a regular file to fix it.
Fixes: 6ce19aff0b8c ("f2fs: compress: add compress_inode to cache compressed blocks") Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6998eb1d6bdb..935016e56010 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -516,6 +516,11 @@ make_now: } else if (ino == F2FS_COMPRESS_INO(sbi)) { #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; + /* + * generic_error_remove_page only truncates pages of regular + * inode + */ + inode->i_mode |= S_IFREG; #endif mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); -- cgit v1.2.3-58-ga151 From e64347ae13dadba7b847776521a51c94c56605e9 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Mon, 29 Nov 2021 10:13:41 +0800 Subject: f2fs: support POSIX_FADV_DONTNEED drop compressed page cache Previously, the compressed page cache was dropped when the page cache was cleaned, but POSIX_FADV_DONTNEED can't clean the compressed page cache because raw pages don't have private data, so f2fs_invalidate_compress_pages() is never called. This commit calls f2fs_invalidate_compress_pages() directly in f2fs_file_fadvise() for the POSIX_FADV_DONTNEED case.
Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0802a10a651f..a15f8ba239f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4663,12 +4663,12 @@ out: static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) { - struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; + struct inode *inode = file_inode(filp); + int err; if (advice == POSIX_FADV_SEQUENTIAL) { - inode = file_inode(filp); if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -4685,7 +4685,13 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, return 0; } - return generic_fadvise(filp, offset, len, advice); + err = generic_fadvise(filp, offset, len, advice); + if (!err && advice == POSIX_FADV_DONTNEED && + test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + + return err; } #ifdef CONFIG_COMPAT -- cgit v1.2.3-58-ga151 From ae2e2804caa120af188b0d7b08936c7ac5c7d8fe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 29 Nov 2021 10:36:12 -0800 Subject: f2fs: show number of pending discard commands This information can be used to check how much time we need to give to issue all the discard commands. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 5 +++++ fs/f2fs/sysfs.c | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'fs/f2fs') diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index b268e3e18b4a..9f3c355bb70e 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -112,6 +112,11 @@ Contact: "Jaegeuk Kim" Description: Set timeout to issue discard commands during umount. 
Default: 5 secs +What: /sys/fs/f2fs//pending_discard +Date: November 2021 +Contact: "Jaegeuk Kim" +Description: Shows the number of pending discard commands in the queue. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 7d289249cd7e..47c950f65b6f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -118,6 +118,15 @@ static ssize_t sb_status_show(struct f2fs_attr *a, return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -744,6 +753,7 @@ F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -812,6 +822,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(discard_granularity), + ATTR_LIST(pending_discard), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), -- cgit v1.2.3-58-ga151 From 766c663933bec1068a6041f05bf31d39606bc2e8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 4 Dec 2021 09:55:35 -0800 Subject: f2fs: avoid duplicate call of mark_inode_dirty Let's check the condition first before set|clear bit. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ac6dda6c4c5a..cbc73bd71dad 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3115,12 +3115,16 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { + if (is_file(inode, type)) + return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { + if (!is_file(inode, type)) + return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } -- cgit v1.2.3-58-ga151 From 9056d6489f5a41cfbb67f719d2c0ce61ead72d9f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Dec 2021 22:44:19 +0800 Subject: f2fs: fix to do sanity check on inode type during garbage collection As reported by Wenqing Liu in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215231 - Overview kernel NULL pointer dereference triggered in folio_mark_dirty() when mounting and operating on a crafted f2fs image - Reproduce tested on kernel 5.16-rc3, 5.15.X under root 1. mkdir mnt 2. mount -t f2fs tmp1.img mnt 3. touch tmp 4. cp tmp mnt F2FS-fs (loop0): sanity_check_inode: inode (ino=49) extent info [5942, 4294180864, 4] is incorrect, run fsck to fix F2FS-fs (loop0): f2fs_check_nid_range: out-of-range nid=31340049, run fsck to fix. BUG: kernel NULL pointer dereference, address: 0000000000000000 folio_mark_dirty+0x33/0x50 move_data_page+0x2dd/0x460 [f2fs] do_garbage_collect+0xc18/0x16a0 [f2fs] f2fs_gc+0x1d3/0xd90 [f2fs] f2fs_balance_fs+0x13a/0x570 [f2fs] f2fs_create+0x285/0x840 [f2fs] path_openat+0xe6d/0x1040 do_filp_open+0xc5/0x140 do_sys_openat2+0x23a/0x310 do_sys_open+0x57/0x80 The root cause is that for a special file, e.g.
character, block, fifo or socket file, f2fs doesn't assign an address space operations pointer array to the mapping->a_ops field. So if, in a fuzzed image, the SSA table indicates that a data block belongs to a special file, then when f2fs tries to migrate that block it causes a NULL pointer access once move_data_page() calls a_ops->set_page_dirty(). Cc: stable@vger.kernel.org Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a946ce0ead34..e0bdc4361a9b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1457,7 +1457,8 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) + if (IS_ERR(inode) || is_bad_inode(inode) || + special_file(inode->i_mode)) continue; if (!down_write_trylock( -- cgit v1.2.3-58-ga151 From f6db43076d190d9bf75559dec28e18b9d12e4ce5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Dec 2021 22:44:20 +0800 Subject: f2fs: fix to avoid panic in is_alive() if metadata is inconsistent As reported by Wenqing Liu in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215231 If we enable the CONFIG_F2FS_CHECK_FS config and use the fuzzed image attached in the above link, we will encounter a panic when executing the below script: 1. mkdir mnt 2. mount -t f2fs tmp1.img mnt 3. touch tmp F2FS-fs (loop11): mismatched blkaddr 5765 (source_blkaddr 1) in seg 3 kernel BUG at fs/f2fs/gc.c:1042!
do_garbage_collect+0x90f/0xa80 [f2fs] f2fs_gc+0x294/0x12a0 [f2fs] f2fs_balance_fs+0x2c5/0x7d0 [f2fs] f2fs_create+0x239/0xd90 [f2fs] lookup_open+0x45e/0xa90 open_last_lookups+0x203/0x670 path_openat+0xae/0x490 do_filp_open+0xbc/0x160 do_sys_openat2+0x2f1/0x500 do_sys_open+0x5e/0xa0 __x64_sys_openat+0x28/0x40 Previously, f2fs tried to catch data inconsistency between the SSA and SIT tables during GC; however, once the inconsistency is caught, it calls f2fs_bug_on() to panic the kernel. That is not needed; instead, let's set the SBI_NEED_FSCK flag and skip migrating the current block. Fixes: bbf9f7d90f21 ("f2fs: Fix indefinite loop in f2fs_gc()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e0bdc4361a9b..3e64b234df21 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1039,7 +1039,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", blkaddr, source_blkaddr, segno); - f2fs_bug_on(sbi, 1); + set_sbi_flag(sbi, SBI_NEED_FSCK); } } #endif -- cgit v1.2.3-58-ga151 From 77900c45ee5cd5da63bd4d818a41dbdf367e81cd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Dec 2021 22:44:21 +0800 Subject: f2fs: fix to do sanity check in is_alive() In a fuzzed image, the SSA table may indicate that a data block belongs to an invalid node whose node ID is out-of-range (0, 1, 2 or max_nid). In order to avoid migrating inconsistent data in such a corrupted image, let's do a sanity check before data block migration.
Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3e64b234df21..b538cbcba351 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1026,6 +1026,9 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } + if (f2fs_check_nid_range(sbi, dni->ino)) + return false; + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); -- cgit v1.2.3-58-ga151 From 325163e9892b627fc9fb1af51e51f0f95dded517 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 8 Dec 2021 16:41:51 -0800 Subject: f2fs: add gc_urgent_high_remaining sysfs node Added a new sysfs node called gc_urgent_high_remaining. The user can set the trial count limit for GC urgent high mode with this value. If GC thread gets to the limit, the mode will turn back to GC normal mode. By default, the value is zero, which means there is no limit like before. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 +++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/gc.c | 12 ++++++++++++ fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 11 +++++++++++ 5 files changed, 34 insertions(+) (limited to 'fs/f2fs') diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9f3c355bb70e..2416b03ff283 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -533,3 +533,10 @@ Description: With "mode=fragment:block" mount options, we can scatter block allo f2fs will allocate 1.. blocks in a chunk and make a hole in the length of 1.. by turns. This value can be set between 1..512 and the default value is 4. 
+ +What: /sys/fs/f2fs//gc_urgent_high_remaining +Date: December 2021 +Contact: "Daeho Jeong" +Description: You can set the trial count limit for GC urgent high mode with this value. + If GC thread gets to the limit, the mode will turn back to GC normal mode. + By default, the value is zero, which means there is no limit like before. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cbc73bd71dad..5da592286721 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1683,6 +1683,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_urgent_high_lock; + bool gc_urgent_high_limited; /* indicates having limited trial count */ + unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b538cbcba351..7fbe46477a5a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -92,6 +92,18 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. 
*/ if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_limited) { + if (!sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_limited = false; + spin_unlock(&sbi->gc_urgent_high_lock); + sbi->gc_mode = GC_NORMAL; + continue; + } + sbi->gc_urgent_high_remaining--; + } + spin_unlock(&sbi->gc_urgent_high_lock); + wait_ms = gc_th->urgent_sleep_time; down_write(&sbi->gc_lock); goto do_gc; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 040b6d02e1d8..9acd76ea09ca 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3548,6 +3548,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_urgent_high_lock); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 47c950f65b6f..55a7df17d5f3 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -487,6 +487,15 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { + spin_lock(&sbi->gc_urgent_high_lock); + sbi->gc_urgent_high_limited = t == 0 ? 
false : true; + sbi->gc_urgent_high_remaining = t; + spin_unlock(&sbi->gc_urgent_high_lock); + + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -742,6 +751,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -855,6 +865,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(gc_urgent_high_remaining), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), -- cgit v1.2.3-58-ga151 From 19bdba5265624ba6b9d9dd936a0c6ccc167cfe80 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Dec 2021 10:25:43 -0800 Subject: f2fs: avoid EINVAL by SBI_NEED_FSCK when pinning a file Android OTA failed due to SBI_NEED_FSCK flag when pinning the file. Let's avoid it since we can do in-place-updates. Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++++-- fs/f2fs/file.c | 10 +++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a9652a8e669b..40c0d1426a11 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2525,6 +2525,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + /* The below cases were checked when setting it. 
*/ + if (f2fs_is_pinned_file(inode)) + return false; + if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) @@ -2533,8 +2538,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a15f8ba239f0..5ec6bef3937f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3145,17 +3145,17 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (f2fs_should_update_outplace(inode, NULL)) { - ret = -EINVAL; - goto out; - } - if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); f2fs_i_gc_failures_write(inode, 0); goto done; } + if (f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + if (f2fs_pin_file_control(inode, false)) { ret = -EAGAIN; goto out; -- cgit v1.2.3-58-ga151 From 7377e853967ba45bf409e3b5536624d2cbc99f21 Mon Sep 17 00:00:00 2001 From: Hyeong-Jun Kim Date: Fri, 10 Dec 2021 13:30:12 +0900 Subject: f2fs: compress: fix potential deadlock of compress file There is a potential deadlock between writeback process and a process performing write_begin() or write_cache_pages() while trying to write same compress file, but not compressable, as below: [Process A] - doing checkpoint [Process B] [Process C] f2fs_write_cache_pages() - lock_page() [all pages in cluster, 0-31] - f2fs_write_multi_pages() - f2fs_write_raw_pages() - f2fs_write_single_data_page() - f2fs_do_write_data_page() - return -EAGAIN [f2fs_trylock_op() failed] - unlock_page(page) [e.g., page 0] - generic_perform_write() - f2fs_write_begin() - f2fs_prepare_compress_overwrite() - prepare_compress_overwrite() - lock_page() [e.g., page 0] - lock_page() [e.g., page 1] - lock_page(page) [e.g., 
page 0] Since there is no compress process, it is no longer necessary to hold locks on every pages in cluster within f2fs_write_raw_pages(). This patch changes f2fs_write_raw_pages() to release all locks first and then perform write same as the non-compress file in f2fs_write_cache_pages(). Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Hyeong-Jun Kim Signed-off-by: Sungjong Seo Signed-off-by: Youngjin Gil Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index fb9e5149af5d..4b49038d150d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1456,25 +1456,38 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; - int _submitted, compr_blocks, ret; - int i = -1, err = 0; + int _submitted, compr_blocks, ret, i; compr_blocks = f2fs_compressed_blocks(cc); - if (compr_blocks < 0) { - err = compr_blocks; - goto out_err; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); } + if (compr_blocks < 0) + return compr_blocks; + for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; retry_write: + lock_page(cc->rpages[i]); + if (cc->rpages[i]->mapping != mapping) { +continue_unlock: unlock_page(cc->rpages[i]); continue; } - BUG_ON(!PageLocked(cc->rpages[i])); + if (!PageDirty(cc->rpages[i])) + goto continue_unlock; + + if (!clear_page_dirty_for_io(cc->rpages[i])) + goto continue_unlock; ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, @@ -1489,26 +1502,15 @@ retry_write: * avoid deadlock caused by cluster update race * from foreground operation. 
*/ - if (IS_NOQUOTA(cc->inode)) { - err = 0; - goto out_err; - } + if (IS_NOQUOTA(cc->inode)) + return 0; ret = 0; cond_resched(); congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - lock_page(cc->rpages[i]); - - if (!PageDirty(cc->rpages[i])) { - unlock_page(cc->rpages[i]); - continue; - } - - clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } - err = ret; - goto out_err; + return ret; } *submitted += _submitted; @@ -1517,14 +1519,6 @@ retry_write: f2fs_balance_fs(F2FS_M_SB(mapping), true); return 0; -out_err: - for (++i; i < cc->cluster_size; i++) { - if (!cc->rpages[i]) - continue; - redirty_page_for_writepage(wbc, cc->rpages[i]); - unlock_page(cc->rpages[i]); - } - return err; } int f2fs_write_multi_pages(struct compress_ctx *cc, -- cgit v1.2.3-58-ga151 From 0df035c7208c5e3e2ae7685548353ae536a19015 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Dec 2021 13:28:40 -0800 Subject: f2fs: avoid down_write on nat_tree_lock during checkpoint Let's cache nat entry if there's no lock contention only. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 556fcd8457f3..b1bc7d76da3b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -430,6 +430,10 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (rwsem_is_locked(&sbi->cp_global_sem)) + return; + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; -- cgit v1.2.3-58-ga151 From a9419b63bf414775e8aeee95d8c4a5e0df690748 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Dec 2021 14:16:32 -0800 Subject: f2fs: do not bother checkpoint by f2fs_get_node_info This patch tries to mitigate lock contention between f2fs_write_checkpoint and f2fs_get_node_info along with nat_tree_lock. 
The idea is, if checkpoint is currently running, other threads that try to grab nat_tree_lock would be better to wait for checkpoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/compress.c | 2 +- fs/f2fs/data.c | 8 ++++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 6 +++--- fs/f2fs/inline.c | 4 ++-- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 19 ++++++++++--------- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 2 +- 11 files changed, 26 insertions(+), 25 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f1693d45bb78..55e3c54d99c1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -664,7 +664,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - err = f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4b49038d150d..d0c3aeba5945 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1286,7 +1286,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 40c0d1426a11..2737fcc0dbcf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1355,7 +1355,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1757,7 +1757,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, 
inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -1789,7 +1789,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -2649,7 +2649,7 @@ got_it: fio->need_lock = LOCK_REQ; } - err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5da592286721..a9ed2fd3fffb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3412,7 +3412,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni); + struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5ec6bef3937f..f540c1cbddca 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1233,7 +1233,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - ret = f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (ret) { f2fs_put_dnode(&dn); return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7fbe46477a5a..a6accec60d04 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -959,7 +959,7 @@ next_step: continue; } - if (f2fs_get_node_info(sbi, nid, &ni)) { + if (f2fs_get_node_info(sbi, nid, &ni, false)) { f2fs_put_page(node_page, 1); continue; } @@ -1027,7 +1027,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary 
*sum, if (IS_ERR(node_page)) return false; - if (f2fs_get_node_info(sbi, nid, dni)) { + if (f2fs_get_node_info(sbi, nid, dni, false)) { f2fs_put_page(node_page, 1); return false; } @@ -1221,7 +1221,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto put_out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ea08f0dfa1bd..4b5cefa3f90c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -131,7 +131,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); if (err) { f2fs_truncate_data_blocks_range(dn, 1); f2fs_put_dnode(dn); @@ -786,7 +786,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); if (err) goto out; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 935016e56010..2ab3b424735a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -881,7 +881,7 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. 
*/ - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b1bc7d76da3b..e0b5eb28d383 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -543,7 +543,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni) + struct node_info *ni, bool checkpoint_context) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -576,9 +576,10 @@ retry: * nat_tree_lock. Therefore, we should retry, if we failed to grab here * while not bothering checkpoint. */ - if (!rwsem_is_locked(&sbi->cp_global_sem)) { + if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { down_read(&curseg->journal_rwsem); - } else if (!down_read_trylock(&curseg->journal_rwsem)) { + } else if (rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { up_read(&nm_i->nat_tree_lock); goto retry; } @@ -891,7 +892,7 @@ static int truncate_node(struct dnode_of_data *dn) int err; pgoff_t index; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1290,7 +1291,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); if (err) { dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; @@ -1352,7 +1353,7 @@ static int read_node_page(struct page *page, int op_flags) return LOCKED_PAGE; } - err = f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni, false); if (err) return err; @@ -1604,7 +1605,7 @@ static int __write_node_page(struct page *page, 
bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); - if (f2fs_get_node_info(sbi, nid, &ni)) + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; if (wbc->for_reclaim) { @@ -2705,7 +2706,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - err = f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); if (err) return err; @@ -2745,7 +2746,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct page *ipage; int err; - err = f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni, false); if (err) return err; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 6a1b4668d933..e65c73c4411d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -595,7 +595,7 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df9ed75f0b7a..b4a2f8c36149 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -254,7 +254,7 @@ retry: goto next; } - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { f2fs_put_dnode(&dn); return err; -- cgit v1.2.3-58-ga151 From 645a3c40ca3d40cc32b4b5972bf2620f2eb5dba6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 17:16:30 +0800 Subject: f2fs: fix to do sanity check on last xattr entry in __f2fs_setxattr() As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215235 - Overview page fault in f2fs_setxattr() when mount and operate on corrupted image - Reproduce tested on kernel 5.16-rc3, 5.15.X under root 1. unzip tmp7.zip 2. 
./single.sh f2fs 7 Sometimes need to run the script several times - Kernel dump loop0: detected capacity change from 0 to 131072 F2FS-fs (loop0): Found nat_bits in checkpoint F2FS-fs (loop0): Mounted with checkpoint version = 7548c2ee BUG: unable to handle page fault for address: ffffe47bc7123f48 RIP: 0010:kfree+0x66/0x320 Call Trace: __f2fs_setxattr+0x2aa/0xc00 [f2fs] f2fs_setxattr+0xfa/0x480 [f2fs] __f2fs_set_acl+0x19b/0x330 [f2fs] __vfs_removexattr+0x52/0x70 __vfs_removexattr_locked+0xb1/0x140 vfs_removexattr+0x56/0x100 removexattr+0x57/0x80 path_removexattr+0xa3/0xc0 __x64_sys_removexattr+0x17/0x20 do_syscall_64+0x37/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is in __f2fs_setxattr(), we missed to do sanity check on last xattr entry, result in out-of-bound memory access during updating inconsistent xattr data of target inode. After the fix, it can detect such xattr inconsistency as below: F2FS-fs (loop11): inode (7) has invalid last xattr entry, entry_size: 60676 F2FS-fs (loop11): inode (8) has corrupted xattr F2FS-fs (loop11): inode (8) has corrupted xattr F2FS-fs (loop11): inode (8) has invalid last xattr entry, entry_size: 47736 Cc: stable@vger.kernel.org Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index e348f33bcb2b..797ac505a075 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -684,8 +684,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, } last = here; - while (!IS_XATTR_LAST_ENTRY(last)) + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto 
exit; + } last = XATTR_NEXT_ENTRY(last); + } newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); -- cgit v1.2.3-58-ga151 From dd9d4a3a30d009c77139d0cab0f2d08b30fa3941 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 17:16:56 +0800 Subject: f2fs: clean up __find_inline_xattr() with __find_xattr() Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 797ac505a075..8e5cd9c916ff 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -226,15 +226,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index) } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, - void *last_base_addr, int index, - size_t len, const char *name) + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { if ((void *)(entry) + sizeof(__u32) > last_base_addr || - (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + if (last_addr) + *last_addr = entry; return NULL; + } if (entry->e_name_index != index) continue; @@ -254,19 +257,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, unsigned int inline_size = inline_xattr_size(inode); void *max_addr = base_addr + inline_size; - list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > max_addr || - (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { - *last_addr = entry; - return NULL; - } - if (entry->e_name_index != index) - continue; - if (entry->e_name_len != len) - continue; - if (!memcmp(entry->e_name, name, len)) - break; - } + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; /* inline xattr header or entry across max inline xattr size */ if 
(IS_XATTR_LAST_ENTRY(entry) && @@ -368,7 +361,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, else cur_addr = txattr_addr; - *xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name); + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); @@ -659,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. */ - here = __find_xattr(base_addr, last_base_addr, index, len, name); + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); -- cgit v1.2.3-58-ga151 From 3e0203893e0dc4f64e7dc65ff5ac70e970019827 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 17:17:51 +0800 Subject: f2fs: support fault injection to f2fs_trylock_op() f2fs: support fault injection for f2fs_trylock_op() This patch supports to inject fault into f2fs_trylock_op(). Usage: a) echo 65536 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=65536 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 1 + 3 files changed, 7 insertions(+) (limited to 'fs/f2fs') diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index d7b84695f56a..4a2426f0485a 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -198,6 +198,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_WRITE_IO 0x000004000 FAULT_SLAB_ALLOC 0x000008000 FAULT_DQUOT_INIT 0x000010000 + FAULT_LOCK_OP 0x000020000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". 
In "lfs" mode, there should be no random diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a9ed2fd3fffb..8601d5e979d4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -56,6 +56,7 @@ enum { FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, + FAULT_LOCK_OP, FAULT_MAX, }; @@ -2095,6 +2096,10 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { + if (time_to_inject(sbi, FAULT_LOCK_OP)) { + f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + return 0; + } return down_read_trylock(&sbi->cp_rwsem); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9acd76ea09ca..ca5783fa56d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -59,6 +59,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, -- cgit v1.2.3-58-ga151 From b702c83e2eaa2fa2d72e957c55c0321535cc8b9f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 20:28:12 +0800 Subject: f2fs: fix to check available space of CP area correctly in update_ckpt_flags() Otherwise, nat_bit area may be persisted across boundary of CP area during nat_bit rebuilding. 
Fixes: 94c821fb286b ("f2fs: rebuild nat_bits during umount") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 55e3c54d99c1..982f0170639f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1302,8 +1302,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long flags; if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) + + NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); f2fs_notice(sbi, "Disable nat_bits due to no space"); } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && -- cgit v1.2.3-58-ga151 From 300a842937fbcfb5a189cea9ba15374fdb0b5c6b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 11 Dec 2021 21:27:36 +0800 Subject: f2fs: fix to reserve space for IO align feature https://bugzilla.kernel.org/show_bug.cgi?id=204137 With below script, we will hit panic during new segment allocation: DISK=bingo.img MOUNT_DIR=/mnt/f2fs dd if=/dev/zero of=$DISK bs=1M count=105 mkfs.f2fs -a 1 -o 19 -t 1 -z 1 -f -q $DISK mount -t f2fs $DISK $MOUNT_DIR -o "noinline_dentry,flush_merge,noextent_cache,mode=lfs,io_bits=7,fsync_mode=strict" for (( i = 0; i < 4096; i++ )); do name=`head /dev/urandom | tr -dc A-Za-z0-9 | head -c 10` mkdir $MOUNT_DIR/$name done umount $MOUNT_DIR rm $DISK --- Core dump --- Call Trace: allocate_segment_by_default+0x9d/0x100 [f2fs] f2fs_allocate_data_block+0x3c0/0x5c0 [f2fs] do_write_page+0x62/0x110 [f2fs] f2fs_outplace_write_data+0x43/0xc0 [f2fs] f2fs_do_write_data_page+0x386/0x560 [f2fs] __write_data_page+0x706/0x850 [f2fs] f2fs_write_cache_pages+0x267/0x6a0 [f2fs] f2fs_write_data_pages+0x19c/0x2e0 [f2fs] do_writepages+0x1c/0x70
__filemap_fdatawrite_range+0xaa/0xe0 filemap_fdatawrite+0x1f/0x30 f2fs_sync_dirty_inodes+0x74/0x1f0 [f2fs] block_operations+0xdc/0x350 [f2fs] f2fs_write_checkpoint+0x104/0x1150 [f2fs] f2fs_sync_fs+0xa2/0x120 [f2fs] f2fs_balance_fs_bg+0x33c/0x390 [f2fs] f2fs_write_node_pages+0x4c/0x1f0 [f2fs] do_writepages+0x1c/0x70 __writeback_single_inode+0x45/0x320 writeback_sb_inodes+0x273/0x5c0 wb_writeback+0xff/0x2e0 wb_workfn+0xa1/0x370 process_one_work+0x138/0x350 worker_thread+0x4d/0x3d0 kthread+0x109/0x140 ret_from_fork+0x25/0x30 The root cause here is that, with the IO alignment feature enabled, in the worst case we need F2FS_IO_SIZE() blocks of free space for a single 4k write, because the IO alignment feature fills dummy pages to keep IO aligned. So we will easily run out of free segments during non-inline directory's data writeback, even in the process of foreground GC. In order to fix this issue, I propose to reserve additional free space for the IO alignment feature to handle the worst case of free space usage ratio during FGGC.
Fixes: 0a595ebaaa6b ("f2fs: support IO alignment for DATA and NODE writes") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++++++++++ fs/f2fs/segment.h | 3 ++- fs/f2fs/super.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/sysfs.c | 4 +++- 4 files changed, 60 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8601d5e979d4..842020311f83 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1024,6 +1024,7 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ + unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ @@ -2205,6 +2206,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + avail_user_block_count -= sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; @@ -2451,6 +2457,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + valid_block_count += sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + user_block_count = sbi->user_block_count; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) user_block_count -= sbi->unusable_block_count; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 46fde9f3f28e..0291cd55cf09 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -538,7 +538,8 @@ 
static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments; + return SM_I(sbi)->reserved_segments + + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ca5783fa56d5..053b508d1e4f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -329,6 +329,46 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).s_resgid)); } +static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) +{ + unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; + unsigned int avg_vblocks; + unsigned int wanted_reserved_segments; + block_t avail_user_block_count; + + if (!F2FS_IO_ALIGNED(sbi)) + return 0; + + /* average valid block count in section in worst case */ + avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); + + /* + * we need enough free space when migrating one section in worst case + */ + wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * + reserved_segments(sbi); + wanted_reserved_segments -= reserved_segments(sbi); + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks - + F2FS_OPTION(sbi).root_reserved_blocks; + + if (wanted_reserved_segments * sbi->blocks_per_seg > + avail_user_block_count) { + f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", + wanted_reserved_segments, + avail_user_block_count >> sbi->log_blocks_per_seg); + return -ENOSPC; + } + + SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; + + f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", + wanted_reserved_segments); + + return 0; +} + static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) { if (!F2FS_OPTION(sbi).unusable_cap_perc) @@ -4182,6 +4222,10 @@ try_onemore: goto free_nm; } + err = 
adjust_reserved_segment(sbi); + if (err) + goto free_nm; + /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 55a7df17d5f3..c22bee84c8ec 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -424,7 +424,9 @@ out: if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks - + sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } -- cgit v1.2.3-58-ga151 From 2a64e303e3051550c75897239174e399dfcb8b7e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Dec 2021 17:13:56 +0800 Subject: f2fs: don't drop compressed page cache in .{invalidate,release}page For compressed inode, in .{invalidate,release}page, we will call f2fs_invalidate_compress_pages() to drop all compressed page cache of current inode. But we don't need to drop compressed page cache synchronously in .invalidatepage, because all truncation paths of compressed physical block have been covered with f2fs_invalidate_compress_page(). And also we don't need to drop compressed page cache synchronously in .releasepage, because, if there is out-of-memory, we can count on page cache reclaim on sbi->compress_inode. 
BTW, this patch may fix the issue reported below: https://lore.kernel.org/linux-f2fs-devel/20211202092812.197647-1-changfengnan@vivo.com/T/#u Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2737fcc0dbcf..0fc6e0245732 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3528,12 +3528,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_page_private_gcing(page); - if (test_opt(sbi, COMPRESS_CACHE)) { - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(page); - } + if (test_opt(sbi, COMPRESS_CACHE) && + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); @@ -3553,12 +3550,9 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct inode *inode = page->mapping->host; - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode))) clear_page_private_data(page); } -- cgit v1.2.3-58-ga151 From d361b690b6fcd0acdb34a56e9054a6eb6be4b0c3 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 15 Dec 2021 10:38:58 +0800 Subject: f2fs: Simplify bool conversion Fix the following coccicheck warning: ./fs/f2fs/sysfs.c:491:41-46: WARNING: conversion to bool not needed here Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c22bee84c8ec..f8a14b1e2ef7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c 
@@ -491,7 +491,7 @@ out: if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_limited = t == 0 ? false : true; + sbi->gc_urgent_high_limited = t != 0; sbi->gc_urgent_high_remaining = t; spin_unlock(&sbi->gc_urgent_high_lock); -- cgit v1.2.3-58-ga151 From 2b642898e5ea206d04684e55235878ea3425659c Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Wed, 29 Dec 2021 17:47:00 +0800 Subject: f2fs: remove redunant invalidate compress pages Compressed pages are already invalidated in the truncate block process, so remove the redundant invalidation of compressed pages in f2fs_evict_inode. Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2ab3b424735a..1d85f1e58d32 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -751,7 +751,8 @@ void f2fs_evict_inode(struct inode *inode) trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); - if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) f2fs_invalidate_compress_pages(sbi, inode->i_ino); if (inode->i_ino == F2FS_NODE_INO(sbi) || -- cgit v1.2.3-58-ga151 From 5fed0be8583f08c1548b4dcd9e5ee0d1133d0730 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Jan 2022 20:08:45 -0800 Subject: f2fs: do not allow partial truncation on pinned file If a partial truncation leaves a hole in a pinned file, any application that holds the block map will be broken. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f540c1cbddca..3c98ef6af97d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1750,7 +1750,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; - if (f2fs_compressed_file(inode) && + /* + * Pinned file should not support partial truncation since the block + * can be used by applications. + */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; -- cgit v1.2.3-58-ga151