From fd90d4dfb94a8c0d626c0c85ca7dcfb905f81a65 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:42 -0700 Subject: ocfs2: delete unnecessary checks before three function calls kfree(), ocfs2_free_path() and __ocfs2_free_slot_info() test whether their argument is NULL and then return immediately. Thus the test around their calls is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 17 +++++------------ fs/ocfs2/slot_map.c | 2 +- fs/ocfs2/stack_user.c | 2 +- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 044158bd22be..fdab27c9be99 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3453,8 +3453,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, subtree_index); } out: - if (right_path) - ocfs2_free_path(right_path); + ocfs2_free_path(right_path); return ret; } @@ -3647,8 +3646,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path, subtree_index); } out: - if (left_path) - ocfs2_free_path(left_path); + ocfs2_free_path(left_path); return ret; } @@ -4431,11 +4429,8 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, } out: - if (left_path) - ocfs2_free_path(left_path); - if (right_path) - ocfs2_free_path(right_path); - + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); return ret; } @@ -6996,9 +6991,7 @@ out_commit: out: if (data_ac) ocfs2_free_alloc_context(data_ac); - if (pages) - kfree(pages); - + kfree(pages); return ret; } diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d5493e361a38..c5e530a9d1b1 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) osb->slot_info = (struct ocfs2_slot_info *)si; bail: - if (status < 0 && si) + if (status < 0) __ocfs2_free_slot_info(si); return status; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 720aa389e0ea..c3b7807c65d6 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1063,7 +1063,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) } out: - if (rc && lc) + if (rc) kfree(lc); return rc; } -- cgit v1.2.3-58-ga151 From 3cc79b795b53a9a04aa1bcc1a943379f06324bb6 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:45 -0700 Subject: ocfs2: less function calls in ocfs2_convert_inline_data_to_extents() after error detection kfree() was called in a few cases by ocfs2_convert_inline_data_to_extents() during error handling even if the passed variable "pages" contained a null pointer. * Return from this implementation directly after failure detection for the function call "kcalloc". * Corresponding details could be improved by the introduction of another jump label. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fdab27c9be99..bf806e58b1cb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6853,13 +6853,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (pages == NULL) { ret = -ENOMEM; mlog_errno(ret); - goto out; + return ret; } ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); - goto out; + goto free_pages; } } @@ -6991,6 +6991,7 @@ out_commit: out: if (data_ac) ocfs2_free_alloc_context(data_ac); +free_pages: kfree(pages); return ret; } -- cgit v1.2.3-58-ga151 From 06a269ccdf0550671667f5cfa00bdabb6bf05259 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:48 -0700 Subject: ocfs2: less function calls in ocfs2_figure_merge_contig_type() after error detection ocfs2_free_path() was called in some cases by ocfs2_figure_merge_contig_type() during error handling even if the passed variables "left_path" and "right_path" contained still a null pointer. Corresponding implementation details could be improved by adjustments for jump labels according to the current Linux coding style convention. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bf806e58b1cb..370b4ea4c23a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4332,17 +4332,17 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, } else if (path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (status) - goto out; + goto exit; if (left_cpos != 0) { left_path = ocfs2_new_path_from_path(path); if (!left_path) - goto out; + goto exit; status = ocfs2_find_path(et->et_ci, left_path, left_cpos); if (status) - goto out; + goto free_left_path; new_el = path_leaf_el(left_path); @@ -4359,7 +4359,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); status = -EINVAL; - goto out; + goto free_left_path; } rec = &new_el->l_recs[ le16_to_cpu(new_el->l_next_free_rec) - 1]; @@ -4386,18 +4386,18 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (status) - goto out; + goto free_left_path; if (right_cpos == 0) - goto out; + goto free_left_path; right_path = ocfs2_new_path_from_path(path); if (!right_path) - goto out; + goto free_left_path; status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) - goto out; + goto free_right_path; new_el = path_leaf_el(right_path); rec = &new_el->l_recs[0]; @@ -4411,7 +4411,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; - goto out; + goto free_right_path; } rec = &new_el->l_recs[1]; } @@ -4428,9 +4428,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, ret = contig_type; } -out: - ocfs2_free_path(left_path); +free_right_path: ocfs2_free_path(right_path); +free_left_path: + ocfs2_free_path(left_path); +exit: return ret; } -- cgit v1.2.3-58-ga151 From 992ef6e794c4d6b84e7930ee79310f066c7f6727 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:51 -0700 Subject: ocfs2: one function call less in ocfs2_merge_rec_left() after error detection ocfs2_free_path() was called by ocfs2_merge_rec_left() even if a call of the ocfs2_get_left_path() function failed. Return from this implementation directly after corresponding exception handling. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 370b4ea4c23a..4bdc19fb7b85 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3535,7 +3535,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, ret = ocfs2_get_left_path(et, right_path, &left_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } left_el = path_leaf_el(left_path); -- cgit v1.2.3-58-ga151 From 629a3b5f0b1c09025546e110ea2b2a67335ed8c5 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:53 -0700 Subject: ocfs2: one function call less in ocfs2_merge_rec_right() after error detection ocfs2_free_path() was called by ocfs2_merge_rec_right() even if a call of the ocfs2_get_right_path() function failed. Return from this implementation directly after corresponding exception handling. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4bdc19fb7b85..2d7f76e52c37 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3370,7 +3370,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, ret = ocfs2_get_right_path(et, left_path, &right_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } right_el = path_leaf_el(right_path); -- cgit v1.2.3-58-ga151 From bb34ed21bce54a900c034089a6b1fde8c09f6a6d Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:56 -0700 Subject: ocfs2: one function call less in ocfs2_init_slot_info() after error detection __ocfs2_free_slot_info() was called by ocfs2_init_slot_info() even if a call of the kzalloc() function failed. Return from this implementation directly after corresponding exception handling. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/slot_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index c5e530a9d1b1..e78a203d44c8 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) if (!si) { status = -ENOMEM; mlog_errno(status); - goto bail; + return status; } si->si_extended = ocfs2_uses_extended_slot_map(osb); -- cgit v1.2.3-58-ga151 From 43ee9cad8a81954eea893b52e08d0c00ca9baccc Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:59 -0700 Subject: ocfs2: one function call less in user_cluster_connect() after error detection kfree() was called by user_cluster_connect() even if a previous call of the kzalloc() function failed. Return from this implementation directly after failure detection. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index c3b7807c65d6..2768eb1da2b8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) BUG_ON(conn == NULL); lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!lc) { - rc = -ENOMEM; - goto out; - } + if (!lc) + return -ENOMEM; init_waitqueue_head(&lc->oc_wait); init_completion(&lc->oc_sync_wait); -- cgit v1.2.3-58-ga151 From 7a8346429d6da4039a4687a8a07f3f8cdaf96d92 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 14 Apr 2015 15:43:02 -0700 Subject: ocfs2: avoid a pointless delay in o2cb_cluster_check() Fix an off-by-one when attempting to avoid an msleep() on the final loop iteration. Signed-off-by: Daeseok Youn Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_o2cb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 1724d43d3da1..220cae7bbdbc 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -295,7 +295,7 @@ static int o2cb_cluster_check(void) set_bit(node_num, netmap); if (!memcmp(hbmap, netmap, sizeof(hbmap))) return 0; - if (i < O2CB_MAP_STABILIZE_COUNT) + if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); } -- cgit v1.2.3-58-ga151 From bdd86215b393b743be34b854299c6cda7cbb361c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:05 -0700 Subject: ocfs2: fix a typing error in ocfs2_direct_IO_write Only when direct IO succeeds we need consider zeroing out in case of cluster not aligned. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 44db1808cdb5..0c2848a599c9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -771,7 +771,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (ret < 0) mlog_errno(ret); } - } else if (written < 0 && append_write && !is_overwrite && + } else if (written > 0 && append_write && !is_overwrite && !cluster_align) { u32 p_cpos = 0; u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); -- cgit v1.2.3-58-ga151 From 7e9b19551c8249baf380cbd274633ee4af95bc99 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:08 -0700 Subject: ocfs2: no need get dinode bh when zeroing extend Since di_bh won't be used when zeroing extend, set it to NULL. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0c2848a599c9..2a618dd2577d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -706,7 +706,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, } if (append_write) { - ret = ocfs2_inode_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { mlog_errno(ret); goto clean_orphan; @@ -720,7 +720,6 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); - brelse(di_bh); goto clean_orphan; } @@ -728,13 +727,10 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (is_overwrite < 0) { mlog_errno(is_overwrite); ocfs2_inode_unlock(inode, 1); - brelse(di_bh); goto clean_orphan; } ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; } written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, -- cgit v1.2.3-58-ga151 From 37a8d89aee2a5b58812e19520d96632efe987f54 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:10 -0700 Subject: ocfs2: take inode lock when get clusters We need take inode lock when calling ocfs2_get_clusters. And use GFP_NOFS instead of GFP_KERNEL. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 2a618dd2577d..973a636285d1 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -772,10 +772,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, u32 p_cpos = 0; u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, &ext_flags); if (ret < 0) { mlog_errno(ret); + ocfs2_inode_unlock(inode, 0); goto clean_orphan; } @@ -783,9 +790,11 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ret = blkdev_issue_zeroout(osb->sb->s_bdev, p_cpos << (osb->s_clustersize_bits - 9), - zero_len >> 9, GFP_KERNEL, false); + zero_len >> 9, GFP_NOFS, false); if (ret < 0) mlog_errno(ret); + + ocfs2_inode_unlock(inode, 0); } clean_orphan: -- cgit v1.2.3-58-ga151 From 14a5275d8c31ba24832f45eeb2469e835ded660d Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:13 -0700 Subject: ocfs2: do not use ocfs2_zero_extend during direct IO In ocfs2_direct_IO_write, we use ocfs2_zero_extend to zero allocated clusters in case of cluster not aligned. But ocfs2_zero_extend uses page cache, this may happen that it clears the data which blockdev_direct_IO has already written. We should use blkdev_issue_zeroout instead of ocfs2_zero_extend during direct IO. So fix this issue by introducing ocfs2_direct_IO_zero_extend and ocfs2_direct_IO_extend_no_holes. Reported-by: Yiwen Jiang Signed-off-by: Joseph Qi Tested-by: Yiwen Jiang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 973a636285d1..1b0463a92b17 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -663,6 +663,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb, return 0; } +static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, + struct inode *inode, loff_t offset, + u64 zero_len, int cluster_align) +{ + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + int ret = 0; + + if (offset <= i_size_read(inode) || cluster_align) + return 0; + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + u64 s = i_size_read(inode); + sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) + + (do_div(s, osb->s_clustersize) >> 9); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, + zero_len >> 9, GFP_NOFS, false); + if (ret < 0) + mlog_errno(ret); + } + + return ret; +} + +static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + u64 zero_start, zero_len, total_zero_len; + u32 p_cpos = 0, clusters_to_add; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 size_div, offset_div; + int ret = 0; + + { + u64 o = offset; + u64 s = i_size_read(inode); + + offset_div = do_div(o, osb->s_clustersize); + size_div = do_div(s, osb->s_clustersize); + } + + if (offset <= i_size_read(inode)) + return 0; + + clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - + ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); + total_zero_len = offset - i_size_read(inode); + if (clusters_to_add) + total_zero_len -= offset_div; + + /* Allocate clusters to fill out holes, and this is only needed + * when we add more than one clusters. Otherwise the cluster will + * be allocated during direct IO */ + if (clusters_to_add > 1) { + ret = ocfs2_extend_allocation(inode, + OCFS2_I(inode)->ip_clusters, + clusters_to_add - 1, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + while (total_zero_len) { + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + + size_div; + zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - + size_div; + zero_len = min(total_zero_len, zero_len); + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + zero_start >> 9, zero_len >> 9, + GFP_NOFS, false); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + total_zero_len -= zero_len; + v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); + + /* Only at first iteration can be cluster not aligned. + * So set size_div to 0 for the rest */ + size_div = 0; + } + +out: + return ret; +} + static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) @@ -677,8 +788,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, struct buffer_head *di_bh = NULL; size_t count = iter->count; journal_t *journal = osb->journal->j_journal; - u32 zero_len; - int cluster_align; + u64 zero_len_head, zero_len_tail; + int cluster_align_head, cluster_align_tail; loff_t final_size = offset + count; int append_write = offset >= i_size_read(inode) ? 1 : 0; unsigned int num_clusters = 0; @@ -686,9 +797,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, { u64 o = offset; + u64 s = i_size_read(inode); + + zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align_head = !zero_len_head; - zero_len = do_div(o, 1 << osb->s_clustersize_bits); - cluster_align = !zero_len; + zero_len_tail = osb->s_clustersize - + do_div(s, osb->s_clustersize); + if ((offset - i_size_read(inode)) < zero_len_tail) + zero_len_tail = offset - i_size_read(inode); + cluster_align_tail = !zero_len_tail; } /* @@ -712,10 +830,13 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, goto clean_orphan; } + /* zeroing out the previously allocated cluster tail + * that but not zeroed */ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ret = ocfs2_zero_extend(inode, di_bh, offset); + ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, + zero_len_tail, cluster_align_tail); else - ret = ocfs2_extend_no_holes(inode, di_bh, offset, + ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); if (ret < 0) { mlog_errno(ret); @@ -768,7 +889,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, mlog_errno(ret); } } else if (written > 0 && append_write && !is_overwrite && - !cluster_align) { + !cluster_align_head) { + /* zeroing out the allocated cluster head */ u32 p_cpos = 0; u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); @@ -790,7 +912,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ret = blkdev_issue_zeroout(osb->sb->s_bdev, p_cpos << (osb->s_clustersize_bits - 9), - zero_len >> 9, GFP_NOFS, false); + zero_len_head >> 9, GFP_NOFS, false); if (ret < 0) mlog_errno(ret); -- cgit v1.2.3-58-ga151 From d0ba25b905ba1246d04578cd59df83014e9b9152 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:16 -0700 Subject: ocfs2: fix typo in ocfs2_reserve_local_alloc_bits In ocfs2_reserve_local_alloc_bits, it calls ocfs2_error if local alloc inode bitmap used bits mismatch, but the log mistakes it as free bits. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/localalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 044013455621..096cff6f9ba8 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u free bits, but a count shows %u", + "%u used bits, but a count shows %u", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); -- cgit v1.2.3-58-ga151 From e073fc58dfe6a4c9b614320c1d56bb71cb213ec4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 14 Apr 2015 15:43:19 -0700 Subject: ocfs2: dereferencing freed pointers in ocfs2_reflink() The code at the "out" label assumes that "default_acl" and "acl" are NULL, but actually the pointers can be NULL, unitialized, or freed. Signed-off-by: Dan Carpenter Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ee541f92dab4..df3a500789c7 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4276,7 +4276,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) { mlog_errno(error); - goto out; + return error; } error = ocfs2_create_inode_in_orphan(dir, mode, -- cgit v1.2.3-58-ga151 From e38a573907383b3c57514fe2331322b1fc110eef Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:22 -0700 Subject: ocfs2: use actual name length when find entry in ocfs2_orphan_del() If the namelen is 20 and name only has actual length 16, it will fail in ocfs2_find_entry because of mismatch. So use actual name length when find entry. Signed-off-by: Joseph Qi Signed-off-by: Yiwen Jiang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b5c3a5ea3ee6..49837404541e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2322,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, namelen); + name, strlen(name)); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, namelen, orphan_dir_inode, + status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); if (status) { mlog_errno(status); -- cgit v1.2.3-58-ga151 From 62f8b1f0d609838361d65b5b2114859d464e6baa Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:24 -0700 Subject: ocfs2: use ENOENT instead of EEXIST when get system file fails When ocfs2_get_system_file_inode fails, it is obscure to set the return value to -EEXIST. So change it to -ENOENT. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/namei.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 3025c0da6b8a..be71ca0937f7 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode, ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, le16_to_cpu(di->i_suballoc_slot)); if (!inode_alloc_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } @@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 49837404541e..09f90cbf0e24 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2808,7 +2808,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto leave; } -- cgit v1.2.3-58-ga151 From a47726bcf299db6b5743d574df36c423263b4e65 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:27 -0700 Subject: ocfs2: rollback the cleared bits if error occurs after ocfs2_block_group_clear_bits ocfs2_block_group_clear_bits will clear bits in block group bitmap. Once it succeeds but fails in the following step, it will cause block group bitmap mismatch the corresponding count recorded in dinode. So rollback the cleared bits if error occurs. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 0cb889a17ae1..4479029630bb 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2499,6 +2499,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); + ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, + start_bit, count); goto bail; } -- cgit v1.2.3-58-ga151 From 7c01ad8fe7c159d7e1ddbd8e586d4d0dfed7ab3d Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 14 Apr 2015 15:43:30 -0700 Subject: ocfs2: remove goto statement in ocfs2_check_dir_for_entry() Signed-off-by: Daeseok Youn Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b08050bd3f2e..2241a19b9335 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2047,22 +2047,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, int namelen) { - int ret; + int ret = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - ret = -EEXIST; - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) - goto bail; + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = -EEXIST; + mlog_errno(ret); + } - ret = 0; -bail: ocfs2_free_dir_lookup_result(&lookup); - if (ret) - mlog_errno(ret); return ret; } -- cgit v1.2.3-58-ga151 From 023d4ea358494ccfeb37abfe5b0fd01b45a6051c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:33 -0700 Subject: ocfs2: fix possible uninitialized variable access In ocfs2_local_alloc_find_clear_bits and ocfs2_get_dentry, variable numfound and set may be uninitialized and then used in tracepoint. In ocfs2_xattr_block_get and ocfs2_delete_xattr_in_bucket, variable block_off and xv may be uninitialized and then used in the following logic due to unchecked return value. This patch fixes these possible issues. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/export.c | 2 +- fs/ocfs2/localalloc.c | 2 +- fs/ocfs2/xattr.c | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 29651167190d..540dc4bdd042 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, } status = ocfs2_test_inode_bit(osb, blkno, &set); - trace_ocfs2_get_dentry_test_bit(status, set); if (status < 0) { if (status == -EINVAL) { /* @@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, goto unlock_nfs_sync; } + trace_ocfs2_get_dentry_test_bit(status, set); /* If the inode allocator bit is clear, this inode must be stale */ if (!set) { status = -ESTALE; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 096cff6f9ba8..857bbbcd39f3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, u32 *numbits, struct ocfs2_alloc_reservation *resv) { - int numfound, bitoff, left, startoff, lastzero; + int numfound = 0, bitoff, left, startoff, lastzero; int local_resv = 0; struct ocfs2_alloc_reservation r; void *bitmap = NULL; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 85b190dc132f..4ca7533be479 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1238,6 +1238,10 @@ static int ocfs2_xattr_block_get(struct inode *inode, i, &block_off, &name_offset); + if (ret) { + mlog_errno(ret); + goto cleanup; + } xs->base = bucket_block(xs->bucket, block_off); } if (ocfs2_xattr_is_local(xs->here)) { @@ -5665,6 +5669,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, &xv, NULL); + if (ret) { + mlog_errno(ret); + break; + } ret = ocfs2_lock_xattr_remove_allocators(inode, xv, args->ref_ci, -- cgit v1.2.3-58-ga151 From 762515a8e9c7ead55539cf96a63dec2363b1df50 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Tue, 14 Apr 2015 15:43:41 -0700 Subject: ocfs2: fix a typo in the copyright statement Signed-off-by: Jakub Wilk Reviewed-by: Eric Ren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 2241a19b9335..ccd4dcfc3645 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -18,7 +18,7 @@ * * linux/fs/minix/dir.c * - * Copyright (C) 1991, 1992 Linux Torvalds + * Copyright (C) 1991, 1992 Linus Torvalds * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public -- cgit v1.2.3-58-ga151 From e2ac55b6a8e337fac7cc59c6f452caac92ab5ee6 Mon Sep 17 00:00:00 2001 From: Chengyu Song Date: Tue, 14 Apr 2015 15:43:44 -0700 Subject: ocfs2: incorrect check for debugfs returns debugfs_create_dir and debugfs_create_file may return -ENODEV when debugfs is not configured, so the return value should be checked against ERROR_VALUE as well, otherwise the later dereference of the dentry pointer would crash the kernel. This patch tries to solve this problem by fixing certain checks. However, I have that found other call sites are protected by #ifdef CONFIG_DEBUG_FS. In current implementation, if CONFIG_DEBUG_FS is defined, then the above two functions will never return any ERROR_VALUE. So another possibility to fix this is to surround all the buggy checks/functions with the same #ifdef CONFIG_DEBUG_FS. But I'm not sure if this would break any functionality, as only OCFS2_FS_STATS declares dependency on DEBUG_FS. Signed-off-by: Chengyu Song Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 42 +++++++++++++++++++++++++++++++----------- fs/ocfs2/dlmglue.c | 2 +- fs/ocfs2/super.c | 9 +++++---- 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16eff45727ee..8e19b9d7aba8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1312,7 +1312,9 @@ static int o2hb_debug_init(void) int ret = -ENOMEM; o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { + if (IS_ERR_OR_NULL(o2hb_debug_dir)) { + ret = o2hb_debug_dir ? + PTR_ERR(o2hb_debug_dir) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1325,7 +1327,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_live_node_bitmap), O2NM_MAX_NODES, o2hb_live_node_bitmap); - if (!o2hb_debug_livenodes) { + if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) { + ret = o2hb_debug_livenodes ? + PTR_ERR(o2hb_debug_livenodes) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1338,7 +1342,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, o2hb_live_region_bitmap); - if (!o2hb_debug_liveregions) { + if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) { + ret = o2hb_debug_liveregions ? + PTR_ERR(o2hb_debug_liveregions) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1352,7 +1358,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, o2hb_quorum_region_bitmap); - if (!o2hb_debug_quorumregions) { + if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) { + ret = o2hb_debug_quorumregions ? + PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1366,7 +1374,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, o2hb_failed_region_bitmap); - if (!o2hb_debug_failedregions) { + if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) { + ret = o2hb_debug_failedregions ? + PTR_ERR(o2hb_debug_failedregions) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2000,7 +2010,8 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) reg->hr_debug_dir = debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { + if (IS_ERR_OR_NULL(reg->hr_debug_dir)) { + ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2013,7 +2024,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) O2HB_DB_TYPE_REGION_LIVENODES, sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { + if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) { + ret = reg->hr_debug_livenodes ? + PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2025,7 +2038,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_regnum)), O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { + if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) { + ret = reg->hr_debug_regnum ? + PTR_ERR(reg->hr_debug_regnum) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2037,7 +2052,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_elapsed_time)), O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { + if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) { + ret = reg->hr_debug_elapsed_time ? + PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2049,13 +2066,16 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_pinned)), O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - if (!reg->hr_debug_pinned) { + if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) { + ret = reg->hr_debug_pinned ? + PTR_ERR(reg->hr_debug_pinned) : -ENOMEM; mlog_errno(ret); goto bail; } - ret = 0; + return 0; bail: + debugfs_remove_recursive(reg->hr_debug_dir); return ret; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 11849a44dc5a..23adcbf374d3 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2954,7 +2954,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - if (!dlm_debug->d_locking_state) { + if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) { ret = -EINVAL; mlog(ML_ERROR, "Unable to create locking state debugfs file.\n"); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 26675185b886..fb43de586791 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - if (!osb->osb_debug_root) { + if (IS_ERR_OR_NULL(osb->osb_debug_root)) { status = -EINVAL; mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); goto read_super_error; @@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (!osb->osb_ctxt) { + if (IS_ERR_OR_NULL(osb->osb_ctxt)) { status = -EINVAL; mlog_errno(status); goto read_super_error; @@ -1606,8 +1606,9 @@ static int __init ocfs2_init(void) } ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); - if (!ocfs2_debugfs_root) { - status = -ENOMEM; + if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) { + status = ocfs2_debugfs_root ? + PTR_ERR(ocfs2_debugfs_root) : -ENOMEM; mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); goto out4; } -- cgit v1.2.3-58-ga151 From 1543306e75ee40662b2c6d37c43c8659f3d6f880 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 14 Apr 2015 15:43:46 -0700 Subject: ocfs2: logging: remove static buffer, use vsprintf extension %pV Use the vsprintf %pV extension to avoid using a static buffer and remove the now unnecessary buffer. Signed-off-by: Joe Perches Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index fb43de586791..c558bff99165 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2564,22 +2564,22 @@ static void ocfs2_handle_error(struct super_block *sb) ocfs2_set_ro_flag(osb, 0); } -static char error_buf[1024]; - -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", - sb->s_id, function, error_buf); + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); ocfs2_handle_error(sb); } @@ -2587,18 +2587,21 @@ void __ocfs2_error(struct super_block *sb, /* Handle critical errors. This is intentionally more drastic than * ocfs2_handle_error, so we only use for things like journal errors, * etc. */ -void __ocfs2_abort(struct super_block* sb, - const char *function, +void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", - sb->s_id, function, error_buf); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); /* We don't have the cluster support yet to go straight to * hard readonly in here. Until then, we want to keep -- cgit v1.2.3-58-ga151 From 2f2eca20a09dac0e9d62bf57ce6a0c6ef6cf91e6 Mon Sep 17 00:00:00 2001 From: alex chen Date: Tue, 14 Apr 2015 15:43:49 -0700 Subject: ocfs2: check if the ocfs2 lock resource has been initialized before calling ocfs2_dlm_lock If ocfs2 lockres has not been initialized before calling ocfs2_dlm_lock, the lock won't be dropped and then will lead umount hung. The case is described below: ocfs2_mknod ocfs2_mknod_locked __ocfs2_mknod_locked ocfs2_journal_access_di Failed because of -ENOMEM or other reasons, the inode lockres has not been initialized yet. iput(inode) ocfs2_evict_inode ocfs2_delete_inode ocfs2_inode_lock ocfs2_inode_lock_full_nested __ocfs2_cluster_lock Succeeds and allocates a new dlm lockres. ocfs2_clear_inode ocfs2_open_unlock ocfs2_drop_inode_locks ocfs2_drop_lock Since lockres has not been initialized, the lock can't be dropped and the lockres can't be migrated, thus umount will hang forever. Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Reviewed-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 23adcbf374d3..956edf67be20 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1391,6 +1391,11 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, int noqueue_attempted = 0; int dlm_locked = 0; + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { + mlog_errno(-EINVAL); + return -EINVAL; + } + ocfs2_init_mask_waiter(&mw); if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) -- cgit v1.2.3-58-ga151 From 1d5b897706d11ac59fecd85ba7b1cbf91c44fe50 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 Apr 2015 15:43:52 -0700 Subject: ocfs2: make mlog_errno return the errno ocfs2 does mlog_errno(v); return v; in many places. Change mlog_errno() so we can do return mlog_errno(v); For some weird reason this patch reduces the size of ocfs2 by 6k: akpm3:/usr/src/25> size fs/ocfs2/ocfs2.ko text data bss dec hex filename 1146613 82767 832192 2061572 1f7504 fs/ocfs2/ocfs2.ko-before 1140857 82767 832192 2055816 1f5e88 fs/ocfs2/ocfs2.ko-after [dan.carpenter@oracle.com: double evaluation concerns in mlog_errno()] Cc: Mark Fasheh Cc: Joel Becker Cc: alex chen Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/masklog.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 2260fb9e6508..7fdc25a4d8c0 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -196,13 +196,14 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; } \ } while (0) -#define mlog_errno(st) do { \ +#define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ _st != -EDQUOT) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ -} while (0) + _st; \ +}) #define mlog_bug_on_msg(cond, fmt, args...) do { \ if (cond) { \ -- cgit v1.2.3-58-ga151 From b9ea25152e56365ce149b9a39637cd7a16eec556 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:27 -0700 Subject: page_writeback: clean up mess around cancel_dirty_page() This patch replaces cancel_dirty_page() with a helper function account_page_cleaned() which only updates counters. It's called from truncate_complete_page() and from try_to_free_buffers() (hack for ext3). Page is locked in both cases, page-lock protects against concurrent dirtiers: see commit 2d6d7f982846 ("mm: protect set_page_dirty() from ongoing truncation"). Delete_from_page_cache() shouldn't be called for dirty pages, they must be handled by caller (either written or truncated). This patch treats final dirty accounting fixup at the end of __delete_from_page_cache() as a debug check and adds WARN_ON_ONCE() around it. If something removes dirty pages without proper handling that might be a bug and unwritten data might be lost. Hugetlbfs has no dirty pages accounting, ClearPageDirty() is enough here. cancel_dirty_page() in nfs_wb_page_cancel() is redundant. This is helper for nfs_invalidate_page() and it's called only in case complete invalidation. The mess was started in v2.6.20 after commits 46d2277c796f ("Clean up and make try_to_free_buffers() not race with dirty pages") and 3e67c0987d75 ("truncate: clear page dirtiness before running try_to_free_buffers()") first was reverted right in v2.6.20 in commit ecdfc9787fe5 ("Resurrect 'try_to_free_buffers()' VM hackery"), second in v2.6.25 commit a2b345642f53 ("Fix dirty page accounting leak with ext3 data=journal"). Custom fixes were introduced between these points. NFS in v2.6.23, commit 1b3b4a1a2deb ("NFS: Fix a write request leak in nfs_invalidate_page()"). Kludge in __delete_from_page_cache() in v2.6.24, commit 3a6927906f1b ("Do dirty page accounting when removing a page from the page cache"). Since v2.6.25 all of them are redundant. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Konstantin Khlebnikov Cc: Tejun Heo Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../lustre/include/linux/lustre_patchless_compat.h | 4 ++- fs/buffer.c | 4 +-- fs/hugetlbfs/inode.c | 2 +- fs/nfs/write.c | 5 --- include/linux/mm.h | 2 ++ include/linux/page-flags.h | 2 -- mm/filemap.c | 15 ++++----- mm/page-writeback.c | 19 +++++++++++ mm/truncate.c | 37 ++++------------------ 9 files changed, 41 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h index a260e99a4447..d72605864b0a 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h +++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h @@ -55,7 +55,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page)) page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_SIZE); + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); + ClearPageMappedToDisk(page); ll_delete_from_page_cache(page); } diff --git a/fs/buffer.c b/fs/buffer.c index 20805db2c987..c7a5602d01ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3243,8 +3243,8 @@ int try_to_free_buffers(struct page *page) * to synchronise against __set_page_dirty_buffers and prevent the * dirty bit from being lost. */ - if (ret) - cancel_dirty_page(page, PAGE_CACHE_SIZE); + if (ret && TestClearPageDirty(page)) + account_page_cleaned(page, mapping); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c274aca8e8dc..db76cec3ce21 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, static void truncate_huge_page(struct page *page) { - cancel_dirty_page(page, /* No IO accounting for huge pages? */0); + ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 849ed784d6ac..759931088094 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1876,11 +1876,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) * request from the inode / page_private pointer and * release it */ nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); nfs_unlock_and_release_request(req); } diff --git a/include/linux/mm.h b/include/linux/mm.h index cccbbba12b9d..6571dd78e984 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1294,9 +1294,11 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); +void account_page_cleaned(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); + int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* Is the vma a continuation of the stack vma above it? */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5ed7bdaf22d5..c851ff92d5b3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,8 +328,6 @@ static inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate) -extern void cancel_dirty_page(struct page *page, unsigned int account_size); - int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..434dba317400 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -203,16 +203,15 @@ void __delete_from_page_cache(struct page *page, void *shadow) BUG_ON(page_mapped(page)); /* - * Some filesystems seem to re-dirty the page even after - * the VM has canceled the dirty bit (eg ext3 journaling). + * At this point page must be either written or cleaned by truncate. + * Dirty page here signals a bug and loss of unwritten data. * - * Fix it up by doing a final dirty accounting check after - * having removed the page entirely. + * This fixes dirty accounting after removing the page entirely but + * leaves PageDirty set: it has no effect for truncated page and + * anyway will be cleared before returning page into buddy allocator. */ - if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); - } + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping); } /** diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 644bcb665773..0372411f38fc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2110,6 +2110,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); +/* + * Helper function for deaccounting dirty page without writeback. + * + * Doing this should *normally* only ever be done when a page + * is truncated, and is not actually mapped anywhere at all. However, + * fs/buffer.c does this when it notices that somebody has cleaned + * out all the buffers on a page without actually doing it through + * the VM. Can you say "ext3 is horribly ugly"? Thought you could. + */ +void account_page_cleaned(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + task_io_account_cancelled_write(PAGE_CACHE_SIZE); + } +} +EXPORT_SYMBOL(account_page_cleaned); + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. diff --git a/mm/truncate.c b/mm/truncate.c index ddec5a5966d7..7a9d8a3cb143 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -92,35 +92,6 @@ void do_invalidatepage(struct page *page, unsigned int offset, (*invalidatepage)(page, offset, length); } -/* - * This cancels just the dirty bit on the kernel page itself, it - * does NOT actually remove dirty bits on any mmap's that may be - * around. It also leaves the page tagged dirty, so any sync - * activity will still find it on the dirty lists, and in particular, - * clear_page_dirty_for_io() will still look at the dirty bits in - * the VM. - * - * Doing this should *normally* only ever be done when a page - * is truncated, and is not actually mapped anywhere at all. However, - * fs/buffer.c does this when it notices that somebody has cleaned - * out all the buffers on a page without actually doing it through - * the VM. Can you say "ext3 is horribly ugly"? Tought you could. - */ -void cancel_dirty_page(struct page *page, unsigned int account_size) -{ - if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; - if (mapping && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - if (account_size) - task_io_account_cancelled_write(account_size); - } - } -} -EXPORT_SYMBOL(cancel_dirty_page); - /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into @@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_CACHE_SIZE); + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * Hence dirty accounting check is placed after invalidation. + */ + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); ClearPageMappedToDisk(page); delete_from_page_cache(page); -- cgit v1.2.3-58-ga151 From 58be19dcf7c0c206e60796c2ee18fc4fc1659fea Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:39 -0700 Subject: ocfs2: copy fs uuid to superblock Currently, maximal number of cleancache enabled filesystems equals 32, which is insufficient nowadays, because a Linux host can have hundreds of containers on board, each of which might want its own filesystem. This patch set targets at removing this limitation - see patch 4 for more details. Patches 1-3 prepare the code for this change. This patch (of 4): This will allow us to remove the uuid argument from cleancache_init_shared_fs. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c558bff99165..a811a95cfd5f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2070,6 +2070,8 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + memcpy(sb->s_uuid, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; -- cgit v1.2.3-58-ga151 From 9de1626290eaa7d921413ddc83544bc3bae27283 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:42 -0700 Subject: cleancache: zap uuid arg of cleancache_init_shared_fs Use super_block->s_uuid instead. Every shared filesystem using cleancache must now initialize super_block->s_uuid before calling cleancache_init_shared_fs. The only one on the tree, ocfs2, already meets this requirement. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 +- include/linux/cleancache.h | 6 +++--- mm/cleancache.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a811a95cfd5f..837ddce4b659 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2336,7 +2336,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); + cleancache_init_shared_fs(sb); bail: return status; diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 4ce9056b31a8..29657d1c83fb 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -36,7 +36,7 @@ struct cleancache_ops { extern struct cleancache_ops * cleancache_register_ops(struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); +extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); @@ -78,10 +78,10 @@ static inline void cleancache_init_fs(struct super_block *sb) __cleancache_init_fs(sb); } -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) +static inline void cleancache_init_shared_fs(struct super_block *sb) { if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); + __cleancache_init_shared_fs(sb); } static inline int cleancache_get_page(struct page *page) diff --git a/mm/cleancache.c b/mm/cleancache.c index 053bcd8f12fb..532495f2e4f4 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -155,7 +155,7 @@ void __cleancache_init_fs(struct super_block *sb) EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +void __cleancache_init_shared_fs(struct super_block *sb) { int i; @@ -163,10 +163,10 @@ void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { if (shared_fs_poolid_map[i] == FS_UNKNOWN) { sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = uuid; + uuids[i] = sb->s_uuid; if (cleancache_ops) shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (uuid, PAGE_SIZE); + (sb->s_uuid, PAGE_SIZE); else shared_fs_poolid_map[i] = FS_NO_BACKEND; break; -- cgit v1.2.3-58-ga151 From 3cb29d11174f29b76addcba4374884b14f8ea4b1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:48 -0700 Subject: cleancache: remove limit on the number of cleancache enabled filesystems The limit equals 32 and is imposed by the number of entries in the fs_poolid_map and shared_fs_poolid_map. Nowadays it is insufficient, because with containers on board a Linux host can have hundreds of active fs mounts. These maps were introduced by commit 49a9ab815acb8 ("mm: cleancache: lazy initialization to allow tmem backends to build/run as modules") in order to allow compiling cleancache drivers as modules. Real pool ids are stored in these maps while super_block->cleancache_poolid points to an entry in the map, so that on cleancache registration we can walk over all (if there are <= 32 of them, of course) cleancache-enabled super blocks and assign real pool ids. Actually, there is absolutely no need in these maps, because we can iterate over all super blocks immediately using iterate_supers. This is not racy, because cleancache_init_ops is called from mount_fs with super_block->s_umount held for writing, while iterate_supers takes this semaphore for reading, so if we call iterate_supers after setting cleancache_ops, all super blocks that had been created before cleancache_register_ops was called will be assigned pool ids by the action function of iterate_supers while all newer super blocks will receive it in cleancache_init_fs. This patch therefore removes the maps and hence the artificial limit on the number of cleancache enabled filesystems. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 2 +- include/linux/cleancache.h | 4 + mm/cleancache.c | 270 +++++++++++++++------------------------------ 3 files changed, 94 insertions(+), 182 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 2b7dc90ccdbb..928c20f47af9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; + s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index b23611f43cfb..bda5ec0b4b4d 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -5,6 +5,10 @@ #include #include +#define CLEANCACHE_NO_POOL -1 +#define CLEANCACHE_NO_BACKEND -2 +#define CLEANCACHE_NO_BACKEND_SHARED -3 + #define CLEANCACHE_KEY_MAX 6 /* diff --git a/mm/cleancache.c b/mm/cleancache.c index aa10f9a3bc88..8fc50811119b 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,7 +19,7 @@ #include /* - * cleancache_ops is set by cleancache_ops_register to contain the pointers + * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ static struct cleancache_ops *cleancache_ops __read_mostly; @@ -34,104 +34,78 @@ static u64 cleancache_failed_gets; static u64 cleancache_puts; static u64 cleancache_invalidates; -/* - * When no backend is registered all calls to init_fs and init_shared_fs - * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or - * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array - * [shared_|]fs_poolid_map) are given to the respective super block - * (sb->cleancache_poolid) and no tmem_pools are created. When a backend - * registers with cleancache the previous calls to init_fs and init_shared_fs - * are executed to create tmem_pools and set the respective poolids. While no - * backend is registered all "puts", "gets" and "flushes" are ignored or failed. - */ -#define MAX_INITIALIZABLE_FS 32 -#define FAKE_FS_POOLID_OFFSET 1000 -#define FAKE_SHARED_FS_POOLID_OFFSET 2000 - -#define FS_NO_BACKEND (-1) -#define FS_UNKNOWN (-2) -static int fs_poolid_map[MAX_INITIALIZABLE_FS]; -static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; -static char *uuids[MAX_INITIALIZABLE_FS]; -/* - * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads - * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple - * threads calling mount (and ending up in __cleancache_init_[shared|]fs). - */ -static DEFINE_MUTEX(poolid_mutex); -/* - * When set to false (default) all calls to the cleancache functions, except - * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded - * by the if (!cleancache_ops) return. This means multiple threads (from - * different filesystems) will be checking cleancache_ops. The usage of a - * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are - * OK if the time between the backend's have been initialized (and - * cleancache_ops has been set to not NULL) and when the filesystems start - * actually calling the backends. The inverse (when unloading) is obviously - * not good - but this shim does not do that (yet). - */ - -/* - * The backends and filesystems work all asynchronously. This is b/c the - * backends can be built as modules. - * The usual sequence of events is: - * a) mount / -> __cleancache_init_fs is called. We set the - * [shared_|]fs_poolid_map and uuids for. - * - * b). user does I/Os -> we call the rest of __cleancache_* functions - * which return immediately as cleancache_ops is false. - * - * c). modprobe zcache -> cleancache_register_ops. We init the backend - * and set cleancache_ops to true, and for any fs_poolid_map - * (which is set by __cleancache_init_fs) we initialize the poolid. - * - * d). user does I/Os -> now that cleancache_ops is true all the - * __cleancache_* functions can call the backend. They all check - * that fs_poolid_map is valid and if so invoke the backend. - * - * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is - * reset (which is the second check in the __cleancache_* ops - * to call the backend). - * - * The sequence of event could also be c), followed by a), and d). and e). The - * c) would not happen anymore. There is also the chance of c), and one thread - * doing a) + d), and another doing e). For that case we depend on the - * filesystem calling __cleancache_invalidate_fs in the proper sequence (so - * that it handles all I/Os before it invalidates the fs (which is last part - * of unmounting process). - * - * Note: The acute reader will notice that there is no "rmmod zcache" case. - * This is b/c the functionality for that is not yet implemented and when - * done, will require some extra locking not yet devised. - */ +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} /* * Register operations for cleancache. Returns 0 on success. */ int cleancache_register_ops(struct cleancache_ops *ops) { - int i; - - mutex_lock(&poolid_mutex); - if (cleancache_ops) { - mutex_unlock(&poolid_mutex); + if (cmpxchg(&cleancache_ops, NULL, ops)) return -EBUSY; - } - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_NO_BACKEND) - fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); - if (shared_fs_poolid_map[i] == FS_NO_BACKEND) - shared_fs_poolid_map[i] = ops->init_shared_fs - (uuids[i], PAGE_SIZE); - } + /* - * We MUST set cleancache_ops _after_ we have called the backends - * init_fs or init_shared_fs functions. Otherwise the compiler might - * re-order where cleancache_ops is set in this function. + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. + * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. + * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. */ - barrier(); - cleancache_ops = ops; - mutex_unlock(&poolid_mutex); + iterate_supers(cleancache_register_ops_sb, NULL); return 0; } EXPORT_SYMBOL(cleancache_register_ops); @@ -139,42 +113,28 @@ EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; - if (cleancache_ops) - fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); - else - fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ void __cleancache_init_shared_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (shared_fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = sb->s_uuid; - if (cleancache_ops) - shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (sb->s_uuid, PAGE_SIZE); - else - shared_fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -203,19 +163,6 @@ static int cleancache_get_key(struct inode *inode, return 0; } -/* - * Returns a pool_id that is associated with a given fake poolid. - */ -static int get_poolid_from_fake(int fake_pool_id) -{ - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) - return shared_fs_poolid_map[fake_pool_id - - FAKE_SHARED_FS_POOLID_OFFSET]; - else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) - return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; - return FS_NO_BACKEND; -} - /* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if @@ -231,7 +178,6 @@ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -240,17 +186,14 @@ int __cleancache_get_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) goto out; - pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - if (pool_id >= 0) - ret = cleancache_ops->get_page(pool_id, - key, page->index, page); + ret = cleancache_ops->get_page(pool_id, key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -273,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page); void __cleancache_put_page(struct page *page) { int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -282,12 +224,7 @@ void __cleancache_put_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - + pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id >= 0 && cleancache_get_key(page->mapping->host, &key) >= 0) { cleancache_ops->put_page(pool_id, key, page->index, page); @@ -308,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id >= 0) { - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id < 0) - return; - + if (pool_id >= 0) { VM_BUG_ON_PAGE(!PageLocked(page), page); if (cleancache_get_key(mapping->host, &key) >= 0) { cleancache_ops->invalidate_page(pool_id, @@ -341,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); */ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) cleancache_ops->invalidate_inode(pool_id, key); } @@ -365,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode); */ void __cleancache_invalidate_fs(struct super_block *sb) { - int index; - int fake_pool_id = sb->cleancache_poolid; - int old_poolid = fake_pool_id; + int pool_id; - mutex_lock(&poolid_mutex); - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; - old_poolid = shared_fs_poolid_map[index]; - shared_fs_poolid_map[index] = FS_UNKNOWN; - uuids[index] = NULL; - } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_FS_POOLID_OFFSET; - old_poolid = fs_poolid_map[index]; - fs_poolid_map[index] = FS_UNKNOWN; - } - sb->cleancache_poolid = -1; - if (cleancache_ops) - cleancache_ops->invalidate_fs(old_poolid); - mutex_unlock(&poolid_mutex); + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + cleancache_ops->invalidate_fs(pool_id); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { - int i; - #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -402,10 +314,6 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - fs_poolid_map[i] = FS_UNKNOWN; - shared_fs_poolid_map[i] = FS_UNKNOWN; - } return 0; } module_init(init_cleancache) -- cgit v1.2.3-58-ga151 From a87938b2e246b81b4fb713edb371a9fa3c5c3c86 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Tue, 14 Apr 2015 15:47:38 -0700 Subject: fs/binfmt_elf.c: fix bug in loading of PIE binaries With CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE enabled, and a normal top-down address allocation strategy, load_elf_binary() will attempt to map a PIE binary into an address range immediately below mm->mmap_base. Unfortunately, load_elf_ binary() does not take account of the need to allocate sufficient space for the entire binary which means that, while the first PT_LOAD segment is mapped below mm->mmap_base, the subsequent PT_LOAD segment(s) end up being mapped above mm->mmap_base into the are that is supposed to be the "gap" between the stack and the binary. Since the size of the "gap" on x86_64 is only guaranteed to be 128MB this means that binaries with large data segments > 128MB can end up mapping part of their data segment over their stack resulting in corruption of the stack (and the data segment once the binary starts to run). Any PIE binary with a data segment > 128MB is vulnerable to this although address randomization means that the actual gap between the stack and the end of the binary is normally greater than 128MB. The larger the data segment of the binary the higher the probability of failure. Fix this by calculating the total size of the binary in the same way as load_elf_interp(). Signed-off-by: Michael Davidson Cc: Alexander Viro Cc: Jiri Kosina Cc: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 995986b8e36b..d925f55e4857 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -862,6 +862,7 @@ static int load_elf_binary(struct linux_binprm *bprm) i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { int elf_prot = 0, elf_flags; unsigned long k, vaddr; + unsigned long total_size = 0; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -924,10 +925,16 @@ static int load_elf_binary(struct linux_binprm *bprm) #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif + total_size = total_mapping_size(elf_phdata, + loc->elf_ex.e_phnum); + if (!total_size) { + error = -EINVAL; + goto out_free_dentry; + } } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, - elf_prot, elf_flags, 0); + elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR((void *)error) ? PTR_ERR((void*)error) : -EINVAL; -- cgit v1.2.3-58-ga151 From d1fd836dcf00d2028c700c7e44d2c23404062c90 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:07 -0700 Subject: mm: split ET_DYN ASLR from mmap ASLR This fixes the "offset2lib" weakness in ASLR for arm, arm64, mips, powerpc, and x86. The problem is that if there is a leak of ASLR from the executable (ET_DYN), it means a leak of shared library offset as well (mmap), and vice versa. Further details and a PoC of this attack is available here: http://cybersecurity.upv.es/attacks/offset2lib/offset2lib.html With this patch, a PIE linked executable (ET_DYN) has its own ASLR region: $ ./show_mmaps_pie 54859ccd6000-54859ccd7000 r-xp ... /tmp/show_mmaps_pie 54859ced6000-54859ced7000 r--p ... /tmp/show_mmaps_pie 54859ced7000-54859ced8000 rw-p ... /tmp/show_mmaps_pie 7f75be764000-7f75be91f000 r-xp ... /lib/x86_64-linux-gnu/libc.so.6 7f75be91f000-7f75beb1f000 ---p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb1f000-7f75beb23000 r--p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb23000-7f75beb25000 rw-p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb25000-7f75beb2a000 rw-p ... 7f75beb2a000-7f75beb4d000 r-xp ... /lib64/ld-linux-x86-64.so.2 7f75bed45000-7f75bed46000 rw-p ... 7f75bed46000-7f75bed47000 r-xp ... 7f75bed47000-7f75bed4c000 rw-p ... 7f75bed4c000-7f75bed4d000 r--p ... /lib64/ld-linux-x86-64.so.2 7f75bed4d000-7f75bed4e000 rw-p ... /lib64/ld-linux-x86-64.so.2 7f75bed4e000-7f75bed4f000 rw-p ... 7fffb3741000-7fffb3762000 rw-p ... [stack] 7fffb377b000-7fffb377d000 r--p ... [vvar] 7fffb377d000-7fffb377f000 r-xp ... [vdso] The change is to add a call the newly created arch_mmap_rnd() into the ELF loader for handling ET_DYN ASLR in a separate region from mmap ASLR, as was already done on s390. Removes CONFIG_BINFMT_ELF_RANDOMIZE_PIE, which is no longer needed. Signed-off-by: Kees Cook Reported-by: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/include/asm/elf.h | 5 ++--- arch/s390/mm/mmap.c | 8 -------- arch/x86/Kconfig | 1 - fs/Kconfig.binfmt | 3 --- fs/binfmt_elf.c | 18 ++++-------------- 9 files changed, 6 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f85200a63a8b..4b62f4caf0ce 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1,7 +1,6 @@ config ARM bool default y - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7c1dbeb73e8d..34f487d5d84e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1,6 +1,5 @@ config ARM64 def_bool y - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 688ce274f59d..a326c4cb8cf0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -23,7 +23,6 @@ config MIPS select HAVE_KRETPROBES select HAVE_DEBUG_KMEMLEAK select HAVE_SYSCALL_TRACEPOINTS - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT select RTC_LIB if !MACH_LOONGSON diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index fc5fffbb331b..e99014adf017 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -88,7 +88,6 @@ config PPC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select BINFMT_ELF - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ELF_RANDOMIZE select OF select OF_EARLY_FLATTREE diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index f8db4781a4c2..ff662155b2c4 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -163,10 +163,9 @@ extern unsigned int vdso_enabled; the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. 64-bit tasks are aligned to 4GB. */ -extern unsigned long randomize_et_dyn(void); -#define ELF_ET_DYN_BASE (randomize_et_dyn() + (is_32bit_task() ? \ +#define ELF_ET_DYN_BASE (is_32bit_task() ? \ (STACK_TOP / 3 * 2) : \ - (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))) + (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. */ diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 8c11536f972d..bb3367c5cb0b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -177,14 +177,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -unsigned long randomize_et_dyn(void) -{ - if (current->flags & PF_RANDOMIZE) - return arch_mmap_rnd(); - - return 0UL; -} - #ifndef CONFIG_64BIT /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 782ddbbc1c9a..1f7f185934a5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -87,7 +87,6 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_JUMP_LABEL select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 270c48148f79..2d0cbbd14cfc 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF -config ARCH_BINFMT_ELF_RANDOMIZE_PIE - bool - config ARCH_BINFMT_ELF_STATE bool diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d925f55e4857..b20c05477e90 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -910,21 +911,10 @@ static int load_elf_binary(struct linux_binprm *bprm) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE - /* Memory randomization might have been switched off - * in runtime via sysctl or explicit setting of - * personality flags. - * If that is the case, retain the original non-zero - * load_bias value in order to establish proper - * non-randomized mappings. - */ + load_bias = ELF_ET_DYN_BASE - vaddr; if (current->flags & PF_RANDOMIZE) - load_bias = 0; - else - load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); -#else - load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); -#endif + load_bias += arch_mmap_rnd(); + load_bias = ELF_PAGESTART(load_bias); total_size = total_mapping_size(elf_phdata, loc->elf_ex.e_phnum); if (!total_size) { -- cgit v1.2.3-58-ga151 From 204db6ed17743000691d930368a5abd6ea541c58 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:12 -0700 Subject: mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE The arch_randomize_brk() function is used on several architectures, even those that don't support ET_DYN ASLR. To avoid bulky extern/#define tricks, consolidate the support under CONFIG_ARCH_HAS_ELF_RANDOMIZE for the architectures that support it, while still handling CONFIG_COMPAT_BRK. Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 1 + arch/arm/include/asm/elf.h | 4 ---- arch/arm64/include/asm/elf.h | 4 ---- arch/mips/include/asm/elf.h | 4 ---- arch/powerpc/include/asm/elf.h | 4 ---- arch/s390/include/asm/elf.h | 3 --- arch/x86/include/asm/elf.h | 3 --- fs/binfmt_elf.c | 4 +--- include/linux/elf-randomize.h | 12 ++++++++++++ 9 files changed, 14 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/arch/Kconfig b/arch/Kconfig index 474904a8e540..e1068987bad1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -497,6 +497,7 @@ config ARCH_HAS_ELF_RANDOMIZE An architecture supports choosing randomized locations for stack, mmap, brk, and ET_DYN. Defined functions: - arch_mmap_rnd() + - arch_randomize_brk() # # ABI hall of shame diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index afb9cafd3786..c1ff8ab12914 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -125,10 +125,6 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs); extern void elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex) elf_set_personality(&(ex)) -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - #ifdef CONFIG_MMU #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 struct linux_binprm; diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index f724db00b235..faad6df49e5b 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -156,10 +156,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, #define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) #endif -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - #ifdef CONFIG_COMPAT #ifdef __AARCH64EB__ diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h index 535f196ffe02..31d747d46a23 100644 --- a/arch/mips/include/asm/elf.h +++ b/arch/mips/include/asm/elf.h @@ -410,10 +410,6 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - struct arch_elf_state { int fp_abi; int interp_fp_abi; diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index 57d289acb803..ee46ffef608e 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -128,10 +128,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, (0x7ff >> (PAGE_SHIFT - 12)) : \ (0x3ffff >> (PAGE_SHIFT - 12))) -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - - #ifdef CONFIG_SPU_BASE /* Notes used in ET_CORE. Note name is "SPU//". */ #define NT_SPU 1 diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index ff662155b2c4..a5c4978462c1 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -226,9 +226,6 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 int arch_setup_additional_pages(struct linux_binprm *, int); -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs); #endif diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 935588d95c82..f161c189c27b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - /* * True on X86_32 or when emulating IA32 on X86_64 */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b20c05477e90..241ef68d2893 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1050,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm) current->mm->end_data = end_data; current->mm->start_stack = bprm->p; -#ifdef arch_randomize_brk if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); -#ifdef CONFIG_COMPAT_BRK +#ifdef compat_brk_randomized current->brk_randomized = 1; #endif } -#endif if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h index 7a4eda02d2b1..b5f0bda9472e 100644 --- a/include/linux/elf-randomize.h +++ b/include/linux/elf-randomize.h @@ -1,10 +1,22 @@ #ifndef _ELF_RANDOMIZE_H #define _ELF_RANDOMIZE_H +struct mm_struct; + #ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE static inline unsigned long arch_mmap_rnd(void) { return 0; } +# if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK) +# define compat_brk_randomized +# endif +# ifndef arch_randomize_brk +# define arch_randomize_brk(mm) (mm->brk) +# endif #else extern unsigned long arch_mmap_rnd(void); +extern unsigned long arch_randomize_brk(struct mm_struct *mm); +# ifdef CONFIG_COMPAT_BRK +# define compat_brk_randomized +# endif #endif #endif -- cgit v1.2.3-58-ga151 From 11d83360452ea2a95e699da01f8e1bcc4676a5de Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:48:21 -0700 Subject: mm, mempool: do not allow atomic resizing Allocating a large number of elements in atomic context could quickly deplete memory reserves, so just disallow atomic resizing entirely. Nothing currently uses mempool_resize() with anything other than GFP_KERNEL, so convert existing callers to drop the gfp_mask. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: David Rientjes Acked-by: Steffen Maier [zfcp] Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Steve French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/s390/scsi/zfcp_erp.c | 4 ++-- fs/cifs/connect.c | 6 ++---- include/linux/mempool.h | 2 +- mm/mempool.c | 10 ++++++---- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index 2c5d4567d1da..acde3f5d6e9e 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -738,11 +738,11 @@ static int zfcp_erp_adapter_strategy_open_fsf(struct zfcp_erp_action *act) return ZFCP_ERP_FAILED; if (mempool_resize(act->adapter->pool.sr_data, - act->adapter->stat_read_buf_num, GFP_KERNEL)) + act->adapter->stat_read_buf_num)) return ZFCP_ERP_FAILED; if (mempool_resize(act->adapter->pool.status_read_req, - act->adapter->stat_read_buf_num, GFP_KERNEL)) + act->adapter->stat_read_buf_num)) return ZFCP_ERP_FAILED; atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 480cf9c81d50..f3bfe08e177b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -773,8 +773,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) length = atomic_dec_return(&tcpSesAllocCount); if (length > 0) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); } static int @@ -848,8 +847,7 @@ cifs_demultiplex_thread(void *p) length = atomic_inc_return(&tcpSesAllocCount); if (length > 1) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); set_freezable(); while (server->tcpStatus != CifsExiting) { diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 39ed62ab5b8a..b19b3023c880 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -29,7 +29,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int nid); -extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); +extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); extern void mempool_free(void *element, mempool_t *pool); diff --git a/mm/mempool.c b/mm/mempool.c index e209c98c7203..949970db2874 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node); * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. - * @gfp_mask: the usual allocation bitmask. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. + * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. */ -int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) +int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); + might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ - new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), + GFP_KERNEL); if (!new_elements) return -ENOMEM; @@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); - element = pool->alloc(gfp_mask, pool->pool_data); + element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); -- cgit v1.2.3-58-ga151