From a7d6e4ecdb7648478ddec76d30d87d03d6e22b31 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 15 Feb 2011 19:02:45 +0100 Subject: thp: prevent hugepages during args/env copying into the user stack Transparent hugepages can only be created if rmap is fully functional. So we must prevent hugepages to be created while is_vma_temporary_stack() is true. This also optmizes away some harmless but unnecessary setting of khugepaged_scan.address and it switches some BUG_ON to VM_BUG_ON. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e62ddb8f24b6..3e29781ee762 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1811,6 +1811,8 @@ static void collapse_huge_page(struct mm_struct *mm, /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ if (!vma->anon_vma || vma->vm_ops || vma->vm_file) goto out; + if (is_vma_temporary_stack(vma)) + goto out; VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); pgd = pgd_offset(mm, address); @@ -2032,32 +2034,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) { + skip: progress++; continue; } - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { - khugepaged_scan.address = vma->vm_end; - progress++; - continue; - } + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto skip; + if (is_vma_temporary_stack(vma)) + goto skip; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) { - progress++; - continue; - } + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; - if (khugepaged_scan.address > hend) { - khugepaged_scan.address = hend + HPAGE_PMD_SIZE; - progress++; - continue; - } - BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { int ret; @@ -2086,7 +2083,7 @@ breakouterloop: breakouterloop_mmap_sem: spin_lock(&khugepaged_mm_lock); - BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2241,9 +2238,9 @@ static int khugepaged(void *none) for (;;) { mutex_unlock(&khugepaged_mutex); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); mutex_lock(&khugepaged_mutex); if (!khugepaged_enabled()) -- cgit v1.2.3-58-ga151 From 2aa15890f3c191326678f1bd68af61ec6b8753ec Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Feb 2011 13:49:47 +0100 Subject: mm: prevent concurrent unmap_mapping_range() on the same inode Michael Leun reported that running parallel opens on a fuse filesystem can trigger a "kernel BUG at mm/truncate.c:475" Gurudas Pai reported the same bug on NFS. The reason is, unmap_mapping_range() is not prepared for more than one concurrent invocation per inode. For example: thread1: going through a big range, stops in the middle of a vma and stores the restart address in vm_truncate_count. thread2: comes in with a small (e.g. single page) unmap request on the same vma, somewhere before restart_address, finds that the vma was already unmapped up to the restart address and happily returns without doing anything. Another scenario would be two big unmap requests, both having to restart the unmapping and each one setting vm_truncate_count to its own value. This could go on forever without any of them being able to finish. Truncate and hole punching already serialize with i_mutex. Other callers of unmap_mapping_range() do not, and it's difficult to get i_mutex protection for all callers. In particular ->d_revalidate(), which calls invalidate_inode_pages2_range() in fuse, may be called with or without i_mutex. This patch adds a new mutex to 'struct address_space' to prevent running multiple concurrent unmap_mapping_range() on the same mapping. [ We'll hopefully get rid of all this with the upcoming mm preemptibility series by Peter Zijlstra, the "mm: Remove i_mmap_mutex lockbreak" patch in particular. But that is for 2.6.39 ] Signed-off-by: Miklos Szeredi Reported-by: Michael Leun Reported-by: Gurudas Pai Tested-by: Gurudas Pai Acked-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/gfs2/main.c | 9 +-------- fs/inode.c | 22 +++++++++++++++------- fs/nilfs2/btnode.c | 5 ----- fs/nilfs2/btnode.h | 1 - fs/nilfs2/mdt.c | 4 ++-- fs/nilfs2/page.c | 13 ------------- fs/nilfs2/page.h | 1 - fs/nilfs2/super.c | 2 +- include/linux/fs.h | 2 ++ mm/memory.c | 2 ++ 10 files changed, 23 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 85ba027d1c4d..72c31a315d96 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(void *foo) struct address_space *mapping = (struct address_space *)(gl + 1); gfs2_init_glock_once(gl); - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + address_space_init_once(mapping); } /** diff --git a/fs/inode.c b/fs/inode.c index da85e56378f3..9c2b795ccc93 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -295,6 +295,20 @@ static void destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, i_callback); } +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + spin_lock_init(&mapping->i_mmap_lock); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + mutex_init(&mapping->unmap_mutex); +} +EXPORT_SYMBOL(address_space_init_once); + /* * These are initializations that only need to be done * once, because the fields are idempotent across use @@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); - INIT_LIST_HEAD(&inode->i_data.private_list); - spin_lock_init(&inode->i_data.private_lock); - INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); + address_space_init_once(&inode->i_data); i_size_ordered_init(inode); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 388e9e8f5286..85f7baa15f5d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,11 +35,6 @@ #include "btnode.h" -void nilfs_btnode_cache_init_once(struct address_space *btnc) -{ - nilfs_mapping_init_once(btnc); -} - static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 79037494f1e0..1b8ebd888c28 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 6a0e2a189f60..a0babd2bff6a 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); - nilfs_mapping_init_once(&shadow->frozen_data); + address_space_init_once(&shadow->frozen_data); nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); - nilfs_mapping_init_once(&shadow->frozen_btnodes); + address_space_init_once(&shadow->frozen_btnodes); nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); mi->mi_shadow = shadow; return 0; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 0c432416cfef..a585b35fd6bc 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init_once(struct address_space *mapping) -{ - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - - spin_lock_init(&mapping->i_mmap_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); -} - void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 622df27cd891..2a00953ebd5f 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_pages(struct address_space *); -void nilfs_mapping_init_once(struct address_space *mapping); void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 58fd707174e1..1673b3d99842 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + address_space_init_once(&ii->i_btnode_cache); ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/include/linux/fs.h b/include/linux/fs.h index bd3215940c37..97d08d8a7de8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -649,6 +649,7 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ + struct mutex unmap_mutex; /* to protect unmapping */ } __attribute__((aligned(sizeof(long)))); /* * On most architectures that alignment is already the case; but @@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); +extern void address_space_init_once(struct address_space *mapping); extern void ihold(struct inode * inode); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); diff --git a/mm/memory.c b/mm/memory.c index 8e8c18324863..5823698c2b71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3-58-ga151 From a3e8cc643d22d2c8ed36b9be7d9c9ca21efcf7f7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Feb 2011 21:39:49 -0800 Subject: mm: fix possible cause of a page_mapped BUG Robert Swiecki reported a BUG_ON(page_mapped) from a fuzzer, punching a hole with madvise(,, MADV_REMOVE). That path is under mutex, and cannot be explained by lack of serialization in unmap_mapping_range(). Reviewing the code, I found one place where vm_truncate_count handling should have been updated, when I switched at the last minute from one way of managing the restart_addr to another: mremap move changes the virtual addresses, so it ought to adjust the restart_addr. But rather than exporting the notion of restart_addr from memory.c, or converting to restart_pgoff throughout, simply reset vm_truncate_count to 0 to force a rescan if mremap move races with preempted truncation. We have no confirmation that this fixes Robert's BUG, but it is a fix that's worth making anyway. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 9925b6391b80..1de98d492ddc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* -- cgit v1.2.3-58-ga151 From 8074b26f67165bf045d92e778c9c10dc5e207fc6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 24 Feb 2011 15:49:53 +0100 Subject: mm: fix refcounting in swapon Grab a reference to bdev before calling blkdev_get(), which expects the refcount to be already incremented and either returns success or decrements the refcount and returns an error. The bug was introduced by e525fd89 (block: make blkdev_get/put() handle exclusive access), which didn't take into account this behavior of blkdev_get(). Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); + bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { -- cgit v1.2.3-58-ga151 From a879bf582dfb3a79d30d76ca3af2ae8a0f39010c Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 25 Feb 2011 14:44:13 -0800 Subject: mm: grab rcu read lock in move_pages() The move_pages() usage of find_task_by_vpid() requires rcu_read_lock() to prevent free_pid() from reclaiming the pid. Without this patch, RCU warnings are printed in v2.6.38-rc4 move_pages() with: CONFIG_LOCKUP_DETECTOR=y CONFIG_PREEMPT=y CONFIG_LOCKDEP=y CONFIG_PROVE_LOCKING=y CONFIG_PROVE_RCU=y Previously, migrate_pages() went through a similar transformation replacing usage of tasklist_lock with rcu read lock: commit 55cfaa3cbdd29c4919ecb5fb8965c310f357e48c Author: Zeng Zhaoming Date: Thu Dec 2 14:31:13 2010 -0800 mm/mempolicy.c: add rcu read lock to protect pid structure commit 1e50df39f6e2c3a4a3394df62baa8a213df16c54 Author: KOSAKI Motohiro Date: Thu Jan 13 15:46:14 2011 -0800 mempolicy: remove tasklist_lock from migrate_pages Signed-off-by: Greg Thelen Cc: Mel Gorman Cc: Minchan Kim Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: Oleg Nesterov Cc: Zeng Zhaoming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 766115253807..352de555626c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1287,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return -EPERM; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); return -ESRCH; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!mm) return -EINVAL; -- cgit v1.2.3-58-ga151 From 2876592f231d436c295b67726313f6f3cfb6e243 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 25 Feb 2011 14:44:20 -0800 Subject: mm: vmscan: stop reclaim/compaction earlier due to insufficient progress if !__GFP_REPEAT should_continue_reclaim() for reclaim/compaction allows scanning to continue even if pages are not being reclaimed until the full list is scanned. In terms of allocation success, this makes sense but potentially it introduces unwanted latency for high-order allocations such as transparent hugepages and network jumbo frames that would prefer to fail the allocation attempt and fallback to order-0 pages. Worse, there is a potential that the full LRU scan will clear all the young bits, distort page aging information and potentially push pages into swap that would have otherwise remained resident. This patch will stop reclaim/compaction if no pages were reclaimed in the last SWAP_CLUSTER_MAX pages that were considered. For allocations such as hugetlbfs that use __GFP_REPEAT and have fewer fallback options, the full LRU list may still be scanned. Order-0 allocation should not be affected because RECLAIM_MODE_COMPACTION is not set so the following avoids the gfp_mask being examined: if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; A tool was developed based on ftrace that tracked the latency of high-order allocations while transparent hugepage support was enabled and three benchmarks were run. The "fix-infinite" figures are 2.6.38-rc4 with Johannes's patch "vmscan: fix zone shrinking exit when scan work is done" applied. STREAM Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 10298 10229 1 :: Min 0.4560 0.4640 1 :: Mean 1.0589 1.0183 1 :: Max 14.5990 11.7510 1 :: Stddev 0.5208 0.4719 2 :: Count 2 1 2 :: Min 1.8610 3.7240 2 :: Mean 3.4325 3.7240 2 :: Max 5.0040 3.7240 2 :: Stddev 1.5715 0.0000 9 :: Count 111696 111694 9 :: Min 0.5230 0.4110 9 :: Mean 10.5831 10.5718 9 :: Max 38.4480 43.2900 9 :: Stddev 1.1147 1.1325 Mean time for order-1 allocations is reduced. order-2 looks increased but with so few allocations, it's not particularly significant. THP mean allocation latency is also reduced. That said, allocation time varies so significantly that the reductions are within noise. Max allocation time is reduced by a significant amount for low-order allocations but reduced for THP allocations which presumably are now breaking before reclaim has done enough work. SysBench Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 15745 15677 1 :: Min 0.4250 0.4550 1 :: Mean 1.1023 1.0810 1 :: Max 14.4590 10.8220 1 :: Stddev 0.5117 0.5100 2 :: Count 1 1 2 :: Min 3.0040 2.1530 2 :: Mean 3.0040 2.1530 2 :: Max 3.0040 2.1530 2 :: Stddev 0.0000 0.0000 9 :: Count 2017 1931 9 :: Min 0.4980 0.7480 9 :: Mean 10.4717 10.3840 9 :: Max 24.9460 26.2500 9 :: Stddev 1.1726 1.1966 Again, mean time for order-1 allocations is reduced while order-2 allocations are too few to draw conclusions from. The mean time for THP allocations is also slightly reduced albeit the reductions are within varianes. Once again, our maximum allocation time is significantly reduced for low-order allocations and slightly increased for THP allocations. Anon stream mmap reference Highorder Allocation Latency Statistics 1 :: Count 1376 1790 1 :: Min 0.4940 0.5010 1 :: Mean 1.0289 0.9732 1 :: Max 6.2670 4.2540 1 :: Stddev 0.4142 0.2785 2 :: Count 1 - 2 :: Min 1.9060 - 2 :: Mean 1.9060 - 2 :: Max 1.9060 - 2 :: Stddev 0.0000 - 9 :: Count 11266 11257 9 :: Min 0.4990 0.4940 9 :: Mean 27250.4669 24256.1919 9 :: Max 11439211.0000 6008885.0000 9 :: Stddev 226427.4624 186298.1430 This benchmark creates one thread per CPU which references an amount of anonymous memory 1.5 times the size of physical RAM. This pounds swap quite heavily and is intended to exercise THP a bit. Mean allocation time for order-1 is reduced as before. It's also reduced for THP allocations but the variations here are pretty massive due to swap. As before, maximum allocation times are significantly reduced. Overall, the patch reduces the mean and maximum allocation latencies for the smaller high-order allocations. This was with Slab configured so it would be expected to be more significant with Slub which uses these size allocations more aggressively. The mean allocation times for THP allocations are also slightly reduced. The maximum latency was slightly increased as predicted by the comments due to reclaim/compaction breaking early. However, workloads care more about the latency of lower-order allocations than THP so it's an acceptable trade-off. Signed-off-by: Mel Gorman Acked-by: Andrea Arcangeli Acked-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Andrea Arcangeli Acked-by: Rik van Riel Cc: Michal Hocko Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 17497d0cd8b9..6771ea70bfe7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone, if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; - /* - * If we failed to reclaim and have scanned the full list, stop. - * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far - * faster but obviously would be less likely to succeed - * allocation. If this is desirable, use GFP_REPEAT to decide - * if both reclaimed and scanned should be checked or just - * reclaimed - */ - if (!nr_reclaimed && !nr_scanned) - return false; + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } /* * If we have not reclaimed enough pages for compaction and the -- cgit v1.2.3-58-ga151 From 29723fccc837d20039078f7a571e8d457eb0d6c6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 25 Feb 2011 14:44:25 -0800 Subject: mm: fix dubious code in __count_immobile_pages() When pfn_valid_within() failed 'iter' was incremented twice. Signed-off-by: Namhyung Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a873e61e312e..cdef1d4b4e47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5376,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) -- cgit v1.2.3-58-ga151 From 8eac563c1c3a2047083022357ae63722b19e4e08 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 25 Feb 2011 14:44:28 -0800 Subject: thp: fix interleaving for transparent hugepages The THP code didn't pass the correct interleaving shift to the memory policy code. Fix this here by adjusting for the order. Signed-off-by: Andi Kleen Reviewed-by: Christoph Lameter Acked-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 368fc9d23610..49355a970be2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1830,7 +1830,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); -- cgit v1.2.3-58-ga151 From e5598f8bf5449bc09e4005600ead32e6f2a3e79b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 25 Feb 2011 14:44:29 -0800 Subject: memcg: more mem_cgroup_uncharge() batching It seems odd that truncate_inode_pages_range(), called not only when truncating but also when evicting inodes, has mem_cgroup_uncharge_start and _end() batching in its second loop to clear up a few leftovers, but not in its first loop that does almost all the work: add them there too. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 49feb46e77b8..d64296be00d3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, next = start; while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; @@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } -- cgit v1.2.3-58-ga151 From 2f5f9486f8c12e3aa40fe3775a18cb14efc5cea2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:29 -0800 Subject: mm: change alloc_pages_vma to pass down the policy node for local policy Currently alloc_pages_vma() always uses the local node as policy node for the LOCAL policy. Pass this node down as an argument instead. No behaviour change from this patch, but will be needed for followons. Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++---- mm/huge_memory.c | 2 +- mm/mempolicy.c | 11 +++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0b84c61607e8..37b8af5db091 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - struct vm_area_struct *vma, unsigned long addr); + struct vm_area_struct *vma, unsigned long addr, + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -#define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3e29781ee762..c7c2cd925599 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, unsigned long haddr) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr); + HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); } #ifndef CONFIG_NUMA diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 49355a970be2..25a5a9146619 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy -- cgit v1.2.3-58-ga151 From 19ee151e140daa5183c4984981801e542e0544fb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:31 -0800 Subject: mm: preserve original node for transparent huge page copies This makes a difference for LOCAL policy, where the node cannot be determined from the policy itself, but has to be gotten from the original page. Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c7c2cd925599..1802db819e28 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, GFP_KERNEL))) { -- cgit v1.2.3-58-ga151 From 5c4b4be3b6b937256103a5ae49177e0c3a17cb8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:32 -0800 Subject: mm: use correct numa policy node for transparent hugepages Pass down the correct node for a transparent hugepage allocation. Most callers continue to use the current node, however the hugepaged daemon now uses the previous node of the first to be collapsed page instead. This ensures that khugepaged does not mess up local memory for an existing process which uses local policy. The choice of node is somewhat primitive currently: it just uses the node of the first page in the pmd range. An alternative would be to look at multiple pages and use the most popular node. I used the simplest variant for now which should work well enough for the case of all pages being on the same node. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 24 +++++++++++++++++------- mm/mempolicy.c | 3 ++- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1802db819e28..dbe99a5f2073 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr) + unsigned long haddr, int nd) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); + HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA @@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); else new_page = NULL; @@ -1745,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1773,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); *hpage = ERR_PTR(-ENOMEM); @@ -1919,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; + int node = -1; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1949,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -1965,7 +1975,7 @@ out_unmap: pte_unmap_unlock(pte, ptl); if (ret) /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma); + collapse_huge_page(mm, address, hpage, vma, node); out: return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 25a5a9146619..b53ec99f1428 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1891,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3-58-ga151 From 2da28bfd9665f49d40abb4c7720b43135feaf79a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 11 Mar 2011 14:58:29 -0800 Subject: thp: fix page_referenced to modify mapcount/vm_flags only if page is found When vmscan.c calls page_referenced(), if an anon page was created before a process forked, rmap will search for it in both of the processes, even though one of them might have since broken COW. If the child process mlocks the vma where the COWed page belongs to, page_referenced() running on the page mapped by the parent would lead to *vm_flags getting VM_LOCKED set erroneously (leading to the references on the parent page being ignored and evicting the parent page too early). *mapcount would also be decremented by page_referenced_one even if the page wasn't found by page_check_address. This also lets pmdp_clear_flush_young_notify() go ahead on a pmd_trans_splitting() pmd. We hold the page_table_lock so __split_huge_page_map() must wait the pmdp_clear_flush_young_notify() to complete before it can modify the pmd. The pmd is also still mapped in userland so the young bit may materialize through a tlb miss before split_huge_page_map runs. This will provide a more accurate page_referenced() behavior during split_huge_page(). Signed-off-by: Andrea Arcangeli Reported-by: Michel Lespinasse Reviewed-by: Michel Lespinasse Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 54 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f21f4a1d6a1c..941bf82e8961 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -497,41 +497,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int referenced = 0; - /* - * Don't want to elevate referenced for mlocked page that gets this far, - * in order that it progresses to try_to_unmap and is moved to the - * unevictable list. - */ - if (vma->vm_flags & VM_LOCKED) { - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; - } - - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. */ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; - if (unlikely(PageTransHuge(page))) { pmd_t *pmd; spin_lock(&mm->page_table_lock); + /* + * rmap might return false positives; we must filter + * these out using page_check_address_pmd(). + */ pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG); - if (pmd && !pmd_trans_splitting(*pmd) && - pmdp_clear_flush_young_notify(vma, address, pmd)) + if (!pmd) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (vma->vm_flags & VM_LOCKED) { + spin_unlock(&mm->page_table_lock); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + + /* go ahead even if the pmd is pmd_trans_splitting() */ + if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; spin_unlock(&mm->page_table_lock); } else { pte_t *pte; spinlock_t *ptl; + /* + * rmap might return false positives; we must filter + * these out using page_check_address(). + */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; + if (vma->vm_flags & VM_LOCKED) { + pte_unmap_unlock(pte, ptl); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { /* * Don't treat a reference through a sequentially read @@ -546,6 +556,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } + /* Pretend the page is referenced if the task has the + swap token and is in the middle of a page fault. */ + if (mm != current->mm && has_swap_token(mm) && + rwsem_is_locked(&mm->mmap_sem)) + referenced++; + (*mapcount)--; if (referenced) -- cgit v1.2.3-58-ga151 From 2fbfac4e053861925fa3fffcdc327649b09af54c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Mar 2011 01:08:47 -0700 Subject: thp+memcg-numa: fix BUG at include/linux/mm.h:370! THP's collapse_huge_page() has an understandable but ugly difference in when its huge page is allocated: inside if NUMA but outside if not. It's hardly surprising that the memcg failure path forgot that, freeing the page in the non-NUMA case, then hitting a VM_BUG_ON in get_page() (or even worse, using the freed page). Signed-off-by: Hugh Dickins Reviewed-by: Minchan Kim Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dbe99a5f2073..113e35c47502 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1762,6 +1762,10 @@ static void collapse_huge_page(struct mm_struct *mm, #ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + return; + } #else VM_BUG_ON(*hpage); /* @@ -1781,12 +1785,12 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } -#endif if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { up_read(&mm->mmap_sem); put_page(new_page); return; } +#endif /* after allocating the hugepage upgrade to mmap_sem write mode */ up_read(&mm->mmap_sem); -- cgit v1.2.3-58-ga151 From dc1b83ab08f1954335692cdcd499f78c94f4c42a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Mar 2011 20:05:30 +0100 Subject: oom: oom_kill_process: fix the child_points logic oom_kill_process() starts with victim_points == 0. This means that (most likely) any child has more points and can be killed erroneously. Also, "children has a different mm" doesn't match the reality, we should check child->mm != t->mm. This check is not exactly correct if t->mm == NULL but this doesn't really matter, oom_kill_task() will kill them anyway. Note: "Kill all processes sharing p->mm" in oom_kill_task() is wrong too. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7dcca55ede7c..b19c78eb74c0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -458,10 +458,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *mem, nodemask_t *nodemask, const char *message) { - struct task_struct *victim = p; + struct task_struct *victim; struct task_struct *child; - struct task_struct *t = p; - unsigned int victim_points = 0; + struct task_struct *t; + unsigned int victim_points; if (printk_ratelimit()) dump_header(p, gfp_mask, order, mem, nodemask); @@ -487,10 +487,15 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * parent. This attempts to lose the minimal amount of work done while * still freeing memory. */ + victim_points = oom_badness(p, mem, nodemask, totalpages); + victim = p; + t = p; do { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; + if (child->mm == t->mm) + continue; /* * oom_badness() returns 0 if the thread is unkillable */ -- cgit v1.2.3-58-ga151 From 52d3c03675fdbe1965b9b1909072b40ad2f80063 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Mar 2011 15:17:07 -0700 Subject: Revert "oom: oom_kill_process: fix the child_points logic" This reverts the parent commit. I hate doing that, but it's generating some discussion ("half of it is right"), and since I am planning on doing the 2.6.38 release later today we can punt it to stable if required. Let's not rock the boat right now. Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b19c78eb74c0..7dcca55ede7c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -458,10 +458,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *mem, nodemask_t *nodemask, const char *message) { - struct task_struct *victim; + struct task_struct *victim = p; struct task_struct *child; - struct task_struct *t; - unsigned int victim_points; + struct task_struct *t = p; + unsigned int victim_points = 0; if (printk_ratelimit()) dump_header(p, gfp_mask, order, mem, nodemask); @@ -487,15 +487,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * parent. This attempts to lose the minimal amount of work done while * still freeing memory. */ - victim_points = oom_badness(p, mem, nodemask, totalpages); - victim = p; - t = p; do { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; - if (child->mm == t->mm) - continue; /* * oom_badness() returns 0 if the thread is unkillable */ -- cgit v1.2.3-58-ga151